From 1b1658558b1a4103387a112fb0f04367ee611af9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BA=D1=81=D0=B5=D0=BB=D1=8C=D1=80=D0=BE=D0=B4=20?= =?UTF-8?q?=D0=90=D0=BD=D0=B4=D1=80=D0=B5=D0=B9=20=D0=9B=D0=B5=D0=BE=D0=BD?= =?UTF-8?q?=D0=B8=D0=B4=D0=BE=D0=B2=D0=B8=D1=87?= Date: Sun, 19 Jan 2025 22:58:26 +0300 Subject: [PATCH 1/2] refactor(np_can_hold_element): reduce complexity by splitting logic --- pandas/core/dtypes/cast.py | 391 ++++++++++++++++++++++++------------- 1 file changed, 253 insertions(+), 138 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 02b9291da9b31..1f4d84b8dc118 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1767,180 +1767,294 @@ def can_hold_element(arr: ArrayLike, element: Any) -> bool: return False -def np_can_hold_element(dtype: np.dtype, element: Any) -> Any: +def _handle_integer_dtype(dtype: np.dtype, element: Any, tipo: np.dtype) -> Any: """ - Raise if we cannot losslessly set this element into an ndarray with this dtype. + Handles casting or validation of an element for integer dtypes. - Specifically about places where we disagree with numpy. i.e. there are - cases where numpy will raise in doing the setitem that we do not check - for here, e.g. setting str "X" into a numeric ndarray. + Parameters + ---------- + dtype : np.dtype + Target numpy integer dtype. + element : Any + Element to be checked or casted. + tipo : np.dtype or None + Inferred dtype of the element, if available. Returns ------- Any - The element, potentially cast to the dtype. + The element, potentially cast to the target dtype. Raises ------ - ValueError : If we cannot losslessly store this element with this dtype. + LossySetitemError: If the element cannot be losslessly stored in the given integer dtype. """ - if dtype == _dtype_obj: - return element + if isinstance(element, range): + if _dtype_can_hold_range(element, dtype): + return element + raise LossySetitemError - tipo = _maybe_infer_dtype_type(element) + if is_integer(element) or (is_float(element) and element.is_integer()): + # e.g. test_setitem_series_int8 if we have a python int 1 + # tipo may be np.int32, despite the fact that it will fit + # in smaller int dtypes. + info = np.iinfo(dtype) + if info.min <= element <= info.max: + return dtype.type(element) + raise LossySetitemError - if dtype.kind in "iu": - if isinstance(element, range): - if _dtype_can_hold_range(element, dtype): - return element - raise LossySetitemError + if tipo is None: + raise LossySetitemError - if is_integer(element) or (is_float(element) and element.is_integer()): - # e.g. test_setitem_series_int8 if we have a python int 1 - # tipo may be np.int32, despite the fact that it will fit - # in smaller int dtypes. - info = np.iinfo(dtype) - if info.min <= element <= info.max: - return dtype.type(element) + if tipo.kind not in "iu": + if isinstance(element, np.ndarray) and element.dtype.kind == "f": + # If all can be losslessly cast to integers, then we can hold them + with np.errstate(invalid="ignore"): + # We check afterwards if cast was losslessly, so no need to show + # the warning + casted = element.astype(dtype) + comp = casted == element + if comp.all(): + # Return the casted values bc they can be passed to + # np.putmask, whereas the raw values cannot. + # see TestSetitemFloatNDarrayIntoIntegerSeries + return casted raise LossySetitemError - if tipo is not None: - if tipo.kind not in "iu": - if isinstance(element, np.ndarray) and element.dtype.kind == "f": - # If all can be losslessly cast to integers, then we can hold them - with np.errstate(invalid="ignore"): - # We check afterwards if cast was losslessly, so no need to show - # the warning - casted = element.astype(dtype) - comp = casted == element - if comp.all(): - # Return the casted values bc they can be passed to - # np.putmask, whereas the raw values cannot. - # see TestSetitemFloatNDarrayIntoIntegerSeries - return casted - raise LossySetitemError - - elif isinstance(element, ABCExtensionArray) and isinstance( - element.dtype, CategoricalDtype - ): - # GH#52927 setting Categorical value into non-EA frame - # TODO: general-case for EAs? - try: - casted = element.astype(dtype) - except (ValueError, TypeError) as err: - raise LossySetitemError from err - # Check for cases of either - # a) lossy overflow/rounding or - # b) semantic changes like dt64->int64 - comp = casted == element - if not comp.all(): - raise LossySetitemError - return casted - - # Anything other than integer we cannot hold - raise LossySetitemError - if ( - dtype.kind == "u" - and isinstance(element, np.ndarray) - and element.dtype.kind == "i" - ): - # see test_where_uint64 + elif isinstance(element, ABCExtensionArray) and isinstance( + element.dtype, CategoricalDtype + ): + # GH#52927 setting Categorical value into non-EA frame + # TODO: general-case for EAs? + try: casted = element.astype(dtype) - if (casted == element).all(): - # TODO: faster to check (element >=0).all()? potential - # itemsize issues there? - return casted + except (ValueError, TypeError) as err: + raise LossySetitemError from err + # Check for cases of either + # a) lossy overflow/rounding or + # b) semantic changes like dt64->int64 + comp = casted == element + if not comp.all(): raise LossySetitemError - if dtype.itemsize < tipo.itemsize: - raise LossySetitemError - if not isinstance(tipo, np.dtype): - # i.e. nullable IntegerDtype; we can put this into an ndarray - # losslessly iff it has no NAs - arr = element._values if isinstance(element, ABCSeries) else element - if arr._hasna: - raise LossySetitemError - return element - - return element + return casted + # Anything other than integer we cannot hold raise LossySetitemError - if dtype.kind == "f": - if lib.is_integer(element) or lib.is_float(element): - casted = dtype.type(element) - if np.isnan(casted) or casted == element: - return casted - # otherwise e.g. overflow see TestCoercionFloat32 + if ( + dtype.kind == "u" + and isinstance(element, np.ndarray) + and element.dtype.kind == "i" + ): + # see test_where_uint64 + casted = element.astype(dtype) + if (casted == element).all(): + # TODO: faster to check (element >=0).all()? potential + # itemsize issues there? + return casted + raise LossySetitemError + if dtype.itemsize < tipo.itemsize: + raise LossySetitemError + if not isinstance(tipo, np.dtype): + # i.e. nullable IntegerDtype; we can put this into an ndarray + # losslessly iff it has no NAs + arr = element._values if isinstance(element, ABCSeries) else element + if arr._hasna: raise LossySetitemError + return element + return element - if tipo is not None: - # TODO: itemsize check? - if tipo.kind not in "iuf": - # Anything other than float/integer we cannot hold - raise LossySetitemError - if not isinstance(tipo, np.dtype): - # i.e. nullable IntegerDtype or FloatingDtype; - # we can put this into an ndarray losslessly iff it has no NAs - if element._hasna: - raise LossySetitemError - return element - elif tipo.itemsize > dtype.itemsize or tipo.kind != dtype.kind: - if isinstance(element, np.ndarray): - # e.g. TestDataFrameIndexingWhere::test_where_alignment - casted = element.astype(dtype) - if np.array_equal(casted, element, equal_nan=True): - return casted - raise LossySetitemError +def _handle_float_dtype(dtype: np.dtype, element: Any, tipo: np.dtype) -> Any: + """ + Handles casting or validation of an element for floating-point dtypes. - return element + Parameters + ---------- + dtype : np.dtype + Target numpy floating-point dtype. + element : Any + Element to be checked or casted. + tipo : np.dtype or None + Inferred dtype of the element, if available. + Returns + ------- + Any + The element, potentially cast to the target dtype. + + Raises + ------ + LossySetitemError: If the element cannot be losslessly stored in the given float dtype. + """ + if lib.is_integer(element) or lib.is_float(element): + casted = dtype.type(element) + if np.isnan(casted) or casted == element: + return casted + # otherwise e.g. overflow see TestCoercionFloat32 raise LossySetitemError - if dtype.kind == "c": - if lib.is_integer(element) or lib.is_complex(element) or lib.is_float(element): - if np.isnan(element): - # see test_where_complex GH#6345 - return dtype.type(element) + if tipo is None: + raise LossySetitemError - with warnings.catch_warnings(): - warnings.filterwarnings("ignore") - casted = dtype.type(element) - if casted == element: + # TODO: itemsize check? + if tipo.kind not in "iuf": + # Anything other than float/integer we cannot hold + raise LossySetitemError + if not isinstance(tipo, np.dtype): + # i.e. nullable IntegerDtype or FloatingDtype; + # we can put this into an ndarray losslessly iff it has no NAs + if element._hasna: + raise LossySetitemError + return element + elif tipo.itemsize > dtype.itemsize or tipo.kind != dtype.kind: + if isinstance(element, np.ndarray): + # e.g. TestDataFrameIndexingWhere::test_where_alignment + casted = element.astype(dtype) + if np.array_equal(casted, element, equal_nan=True): return casted - # otherwise e.g. overflow see test_32878_complex_itemsize raise LossySetitemError - if tipo is not None: - if tipo.kind in "iufc": - return element - raise LossySetitemError + return element + +def _handle_complex_dtype(dtype: np.dtype, element: Any, tipo: np.dtype) -> Any: + """ + Handles casting or validation of an element for complex dtypes. + + Parameters + ---------- + dtype : np.dtype + Target numpy complex dtype. + element : Any + Element to be checked or casted. + tipo : np.dtype or None + Inferred dtype of the element, if available. + + Returns + ------- + Any + The element, potentially cast to the target dtype. + + Raises + ------ + LossySetitemError: If the element cannot be losslessly stored in the given complex dtype. + """ + if lib.is_integer(element) or lib.is_complex(element) or lib.is_float(element): + if np.isnan(element): + # see test_where_complex GH#6345 + return dtype.type(element) + + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + casted = dtype.type(element) + if casted == element: + return casted + # otherwise e.g. overflow see test_32878_complex_itemsize + raise LossySetitemError + if tipo is None: raise LossySetitemError + if tipo.kind in "iufc": + return element + raise LossySetitemError - if dtype.kind == "b": - if tipo is not None: - if tipo.kind == "b": - if not isinstance(tipo, np.dtype): - # i.e. we have a BooleanArray - if element._hasna: - # i.e. there are pd.NA elements - raise LossySetitemError - return element - raise LossySetitemError - if lib.is_bool(element): - return element +def _handle_boolean_dtype(dtype: np.dtype, element: Any, tipo: np.dtype) -> Any: + """ + Handles casting or validation of an element for boolean dtypes. + + Parameters + ---------- + dtype : np.dtype + Target numpy boolean dtype. + element : Any + Element to be checked or casted. + tipo : np.dtype or None + Inferred dtype of the element, if available. + + Returns + ------- + Any + The element, potentially cast to the target dtype. + + Raises + ------ + LossySetitemError: If the element cannot be losslessly stored in the given boolean dtype. + """ + if lib.is_bool(element): + return element + if tipo is None: raise LossySetitemError + if tipo.kind == "b": + if not isinstance(tipo, np.dtype): + # i.e. we have a BooleanArray + if element._hasna: + # i.e. there are pd.NA elements + raise LossySetitemError + return element + raise LossySetitemError - if dtype.kind == "S": - # TODO: test tests.frame.methods.test_replace tests get here, - # need more targeted tests. xref phofl has a PR about this - if tipo is not None: - if tipo.kind == "S" and tipo.itemsize <= dtype.itemsize: - return element - raise LossySetitemError - if isinstance(element, bytes) and len(element) <= dtype.itemsize: - return element +def _handle_string_dtype(dtype: np.dtype, element: Any, tipo: np.dtype) -> Any: + """ + Handles casting or validation of an element for string (byte) dtypes. + + Parameters + ---------- + dtype : np.dtype + Target numpy string dtype (e.g., 'S'). + element : Any + Element to be checked or casted. + tipo : np.dtype or None + Inferred dtype of the element, if available. + + Returns + ------- + Any + The element, potentially cast to the target dtype. + + Raises + ------ + LossySetitemError: If the element cannot be losslessly stored in the given string dtype. + """ + # TODO: test tests.frame.methods.test_replace tests get here, + # need more targeted tests. xref phofl has a PR about this + if isinstance(element, bytes) and len(element) <= dtype.itemsize: + return element + if tipo is None: raise LossySetitemError + if tipo.kind == "S" and tipo.itemsize <= dtype.itemsize: + return element + raise LossySetitemError +def np_can_hold_element(dtype: np.dtype, element: Any) -> Any: + """ + Raise if we cannot losslessly set this element into an ndarray with this dtype. + + Specifically about places where we disagree with numpy. i.e. there are + cases where numpy will raise in doing the setitem that we do not check + for here, e.g. setting str "X" into a numeric ndarray. + + Returns + ------- + Any + The element, potentially cast to the dtype. + + Raises + ------ + ValueError : If we cannot losslessly store this element with this dtype. + """ + if dtype == _dtype_obj: + return element + + tipo = _maybe_infer_dtype_type(element) + + if dtype.kind in "iu": + return _handle_integer_dtype(dtype, element, tipo) + if dtype.kind == "f": + return _handle_float_dtype(dtype, element, tipo) + if dtype.kind == "c": + return _handle_complex_dtype(dtype, element, tipo) + if dtype.kind == "b": + return _handle_boolean_dtype(dtype, element, tipo) + if dtype.kind == "S": + return _handle_string_dtype(dtype, element, tipo) if dtype.kind == "V": # i.e. np.void, which cannot hold _anything_ raise LossySetitemError @@ -1948,6 +2062,7 @@ def np_can_hold_element(dtype: np.dtype, element: Any) -> Any: raise NotImplementedError(dtype) + def _dtype_can_hold_range(rng: range, dtype: np.dtype) -> bool: """ _maybe_infer_dtype_type infers to int64 (and float64 for very large endpoints), From 74845ea90c27e75f52d55f687914e967d0a629ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BA=D1=81=D0=B5=D0=BB=D1=8C=D1=80=D0=BE=D0=B4=20?= =?UTF-8?q?=D0=90=D0=BD=D0=B4=D1=80=D0=B5=D0=B9=20=D0=9B=D0=B5=D0=BE=D0=BD?= =?UTF-8?q?=D0=B8=D0=B4=D0=BE=D0=B2=D0=B8=D1=87?= Date: Sun, 19 Jan 2025 23:30:03 +0300 Subject: [PATCH 2/2] style: fix flake8 issues in np_can_hold_element refactor --- pandas/core/dtypes/cast.py | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 1f4d84b8dc118..05679408bd9e3 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1787,7 +1787,8 @@ def _handle_integer_dtype(dtype: np.dtype, element: Any, tipo: np.dtype) -> Any: Raises ------ - LossySetitemError: If the element cannot be losslessly stored in the given integer dtype. + LossySetitemError: If the element cannot be + losslessly stored in the given integer dtype. """ if isinstance(element, range): if _dtype_can_hold_range(element, dtype): @@ -1822,7 +1823,7 @@ def _handle_integer_dtype(dtype: np.dtype, element: Any, tipo: np.dtype) -> Any: raise LossySetitemError elif isinstance(element, ABCExtensionArray) and isinstance( - element.dtype, CategoricalDtype + element.dtype, CategoricalDtype ): # GH#52927 setting Categorical value into non-EA frame # TODO: general-case for EAs? @@ -1842,9 +1843,9 @@ def _handle_integer_dtype(dtype: np.dtype, element: Any, tipo: np.dtype) -> Any: raise LossySetitemError if ( - dtype.kind == "u" - and isinstance(element, np.ndarray) - and element.dtype.kind == "i" + dtype.kind == "u" + and isinstance(element, np.ndarray) + and element.dtype.kind == "i" ): # see test_where_uint64 casted = element.astype(dtype) @@ -1864,6 +1865,7 @@ def _handle_integer_dtype(dtype: np.dtype, element: Any, tipo: np.dtype) -> Any: return element return element + def _handle_float_dtype(dtype: np.dtype, element: Any, tipo: np.dtype) -> Any: """ Handles casting or validation of an element for floating-point dtypes. @@ -1884,7 +1886,8 @@ def _handle_float_dtype(dtype: np.dtype, element: Any, tipo: np.dtype) -> Any: Raises ------ - LossySetitemError: If the element cannot be losslessly stored in the given float dtype. + LossySetitemError: If the element cannot be losslessly + stored in the given float dtype. """ if lib.is_integer(element) or lib.is_float(element): casted = dtype.type(element) @@ -1916,6 +1919,7 @@ def _handle_float_dtype(dtype: np.dtype, element: Any, tipo: np.dtype) -> Any: return element + def _handle_complex_dtype(dtype: np.dtype, element: Any, tipo: np.dtype) -> Any: """ Handles casting or validation of an element for complex dtypes. @@ -1936,7 +1940,8 @@ def _handle_complex_dtype(dtype: np.dtype, element: Any, tipo: np.dtype) -> Any: Raises ------ - LossySetitemError: If the element cannot be losslessly stored in the given complex dtype. + LossySetitemError: If the element cannot be + losslessly stored in the given complex dtype. """ if lib.is_integer(element) or lib.is_complex(element) or lib.is_float(element): if np.isnan(element): @@ -1956,6 +1961,7 @@ def _handle_complex_dtype(dtype: np.dtype, element: Any, tipo: np.dtype) -> Any: return element raise LossySetitemError + def _handle_boolean_dtype(dtype: np.dtype, element: Any, tipo: np.dtype) -> Any: """ Handles casting or validation of an element for boolean dtypes. @@ -1976,7 +1982,8 @@ def _handle_boolean_dtype(dtype: np.dtype, element: Any, tipo: np.dtype) -> Any: Raises ------ - LossySetitemError: If the element cannot be losslessly stored in the given boolean dtype. + LossySetitemError: If the element cannot be + losslessly stored in the given boolean dtype. """ if lib.is_bool(element): return element @@ -1991,6 +1998,7 @@ def _handle_boolean_dtype(dtype: np.dtype, element: Any, tipo: np.dtype) -> Any: return element raise LossySetitemError + def _handle_string_dtype(dtype: np.dtype, element: Any, tipo: np.dtype) -> Any: """ Handles casting or validation of an element for string (byte) dtypes. @@ -2011,7 +2019,8 @@ def _handle_string_dtype(dtype: np.dtype, element: Any, tipo: np.dtype) -> Any: Raises ------ - LossySetitemError: If the element cannot be losslessly stored in the given string dtype. + LossySetitemError: If the element cannot be + losslessly stored in the given string dtype. """ # TODO: test tests.frame.methods.test_replace tests get here, # need more targeted tests. xref phofl has a PR about this @@ -2023,6 +2032,7 @@ def _handle_string_dtype(dtype: np.dtype, element: Any, tipo: np.dtype) -> Any: return element raise LossySetitemError + def np_can_hold_element(dtype: np.dtype, element: Any) -> Any: """ Raise if we cannot losslessly set this element into an ndarray with this dtype. @@ -2062,7 +2072,6 @@ def np_can_hold_element(dtype: np.dtype, element: Any) -> Any: raise NotImplementedError(dtype) - def _dtype_can_hold_range(rng: range, dtype: np.dtype) -> bool: """ _maybe_infer_dtype_type infers to int64 (and float64 for very large endpoints),