From 53a631940c161dbb3c08a0db2b22a7ced812c66e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 16 Nov 2023 11:11:03 -0800 Subject: [PATCH 1/9] BUG: Return numpy types from ArrowExtensionArray.to_numpy for temporal types when possible --- doc/source/whatsnew/v2.1.4.rst | 2 +- pandas/core/arrays/arrow/array.py | 11 +++++--- pandas/tests/extension/test_arrow.py | 40 ++++++++++++++-------------- 3 files changed, 29 insertions(+), 24 deletions(-) diff --git a/doc/source/whatsnew/v2.1.4.rst b/doc/source/whatsnew/v2.1.4.rst index 25afcbb3bb532..e277fbb7a8a10 100644 --- a/doc/source/whatsnew/v2.1.4.rst +++ b/doc/source/whatsnew/v2.1.4.rst @@ -22,7 +22,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ - Bug in :class:`Series` constructor raising DeprecationWarning when ``index`` is a list of :class:`Series` (:issue:`55228`) -- +- Bug in :class:`arrays.ArrowExtensionArray.to_numpy` where ``timestamp`` and ``duration`` types with ``dtype=None`` did not return ``datetime64`` and ``timedelta64`` types respectively (:issue:`55997`) .. --------------------------------------------------------------------------- .. _whatsnew_214.other: diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 4bcc03643dac8..a62c51b9f4bb2 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1262,10 +1262,15 @@ def to_numpy( if pa.types.is_timestamp(pa_type) or pa.types.is_duration(pa_type): result = data._maybe_convert_datelike_array() - if dtype is None or dtype.kind == "O": - result = result.to_numpy(dtype=object, na_value=na_value) + if (pa.types.is_timestamp(pa_type) and pa_type.tz is not None) or ( + dtype is not None and dtype.kind == "O" + ): + dtype = object else: - result = result.to_numpy(dtype=dtype) + # GH 55997 + dtype = None + na_value = pa_type.to_pandas_dtype().type("nat", pa_type.unit) + result = result.to_numpy(dtype=dtype, na_value=na_value) elif pa.types.is_time(pa_type) or pa.types.is_date(pa_type): # convert to list of python datetime.time objects before # wrapping in ndarray diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index adbc4a1bb729b..fef93108cc5a9 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1471,11 +1471,9 @@ def test_to_numpy_with_defaults(data): result = data.to_numpy() pa_type = data._pa_array.type - if ( - pa.types.is_duration(pa_type) - or pa.types.is_timestamp(pa_type) - or pa.types.is_date(pa_type) - ): + if pa.types.is_duration(pa_type) or pa.types.is_timestamp(pa_type): + pytest.skip("Tested in test_to_numpy_temporal") + elif pa.types.is_date(pa_type): expected = np.array(list(data)) else: expected = np.array(data._pa_array) @@ -2846,26 +2844,28 @@ def test_groupby_series_size_returns_pa_int(data): @pytest.mark.parametrize( - "pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES + "pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES, ids=repr ) -def test_to_numpy_temporal(pa_type): +@pytest.mark.parametrize("dtype", [None, object]) +def test_to_numpy_temporal(pa_type, dtype): # GH 53326 + # GH 55997: Return datetime64/timedelta64 types with NaT if possible arr = ArrowExtensionArray(pa.array([1, None], type=pa_type)) - result = arr.to_numpy() + result = arr.to_numpy(dtype=dtype) if pa.types.is_duration(pa_type): - expected = [ - pd.Timedelta(1, unit=pa_type.unit).as_unit(pa_type.unit), - pd.NA, - ] - assert isinstance(result[0], pd.Timedelta) + value = pd.Timedelta(1, unit=pa_type.unit).as_unit(pa_type.unit) else: - expected = [ - pd.Timestamp(1, unit=pa_type.unit, tz=pa_type.tz).as_unit(pa_type.unit), - pd.NA, - ] - assert isinstance(result[0], pd.Timestamp) - expected = np.array(expected, dtype=object) - assert result[0].unit == expected[0].unit + value = pd.Timestamp(1, unit=pa_type.unit, tz=pa_type.tz).as_unit(pa_type.unit) + + if dtype == object or (pa.types.is_timestamp(pa_type) and pa_type.tz is not None): + na = pd.NA + expected = np.array([value, na], dtype=object) + assert result[0].unit == value.unit + else: + na = pa_type.to_pandas_dtype().type("nat", pa_type.unit) + value = value.to_numpy() + expected = np.array([value, na]) + assert np.datetime_data(result[0])[0] == pa_type.unit tm.assert_numpy_array_equal(result, expected) From 1b301727711e3edfcaedecbf0b5da8ddb3501f15 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 16 Nov 2023 13:45:11 -0800 Subject: [PATCH 2/9] Fix repr and comp issues --- pandas/core/arrays/arrow/array.py | 9 ++++++++- pandas/io/formats/format.py | 2 +- pandas/tests/extension/test_arrow.py | 13 +++++++++++++ 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index a62c51b9f4bb2..6d648e51a1b5b 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -642,7 +642,14 @@ def _cmp_method(self, other, op): mask = isna(self) | isna(other) valid = ~mask result = np.zeros(len(self), dtype="bool") - result[valid] = op(np.array(self)[valid], other) + try: + result[valid] = op(np.array(self)[valid], other) + except TypeError: + # Invalid comparison from numpy + if op.__name__ == "ne": + result = ~result + else: + raise result = pa.array(result, type=pa.bool_()) result = pc.if_else(valid, result, None) else: diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index bb2fd06d98e1d..cc83e51fb2bd6 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1528,7 +1528,7 @@ def _format_strings(self) -> list[str]: # Categorical is special for now, so that we can preserve tzinfo array = values._internal_get_values() else: - array = np.asarray(values) + array = np.asarray(values, dtype=object) fmt_values = format_array( array, diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index fef93108cc5a9..ac712abf11d65 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -265,6 +265,19 @@ def data_for_twos(data): class TestArrowArray(base.ExtensionTests): + def test_compare_scalar(self, data, comparison_op): + ser = pd.Series(data) + self._compare_other(ser, data, comparison_op, data[0]) + + @pytest.mark.parametrize("na_action", [None, "ignore"]) + def test_map(self, data_missing, na_action): + if data_missing.dtype.kind in "mM": + result = data_missing.map(lambda x: x, na_action=na_action) + expected = data_missing.to_numpy(dtype=object) + tm.assert_numpy_array_equal(result, expected) + else: + super().test_map(data_missing, na_action) + def test_astype_str(self, data, request): pa_dtype = data.dtype.pyarrow_dtype if pa.types.is_binary(pa_dtype): From 9ca90c6559152a17a3c32805f782fea655561178 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 4 Dec 2023 17:18:35 -0800 Subject: [PATCH 3/9] Remove instance check --- pandas/io/formats/format.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 769d493278d90..cc83e51fb2bd6 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -61,7 +61,6 @@ ) from pandas.core.arrays import ( - BaseMaskedArray, Categorical, DatetimeArray, ExtensionArray, @@ -1528,8 +1527,6 @@ def _format_strings(self) -> list[str]: if isinstance(values, Categorical): # Categorical is special for now, so that we can preserve tzinfo array = values._internal_get_values() - elif isinstance(values, BaseMaskedArray): - array = values.to_numpy(dtype=object) else: array = np.asarray(values, dtype=object) From 1bdacb2e65b7508b64d3f38b4603445f14c8a7bd Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 6 Dec 2023 11:10:53 -0800 Subject: [PATCH 4/9] Address test failures --- pandas/tests/extension/test_arrow.py | 8 ++++++++ pandas/tests/io/formats/test_format.py | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 0a52dfd64defd..bd627ae48813a 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -287,6 +287,14 @@ def test_astype_str(self, data, request): reason=f"For {pa_dtype} .astype(str) decodes.", ) ) + elif ( + pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None + ) or pa.types.is_duration(pa_dtype): + request.applymarker( + pytest.mark.xfail( + reason="pd.Timestamp/pd.Timedelta repr different from numpy repr", + ) + ) super().test_astype_str(data) def test_from_dtype(self, data, request): diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index f5738b83a8e64..4748a2a4aa00e 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -1921,7 +1921,7 @@ def dtype(self): series = Series(ExtTypeStub(), copy=False) res = repr(series) # This line crashed before #33770 was fixed. expected = "\n".join( - ["0 [False True]", "1 [ True False]", "dtype: DtypeStub"] + ["0 [False True]", "1 [True False]", "dtype: DtypeStub"] ) assert res == expected From 07e5fabd68fea2fa4dedae406f915b4f3e0f0d1e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 7 Dec 2023 11:48:02 -0800 Subject: [PATCH 5/9] Cast to object to avoid numpy bug, fix another test --- pandas/core/arrays/arrow/array.py | 11 ++++++++++- pandas/tests/extension/test_arrow.py | 20 ++++++++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 8e4c53396eb4f..170a5695b2bcb 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1137,7 +1137,16 @@ def searchsorted( if isinstance(value, ExtensionArray): value = value.astype(object) # Base class searchsorted would cast to object, which is *much* slower. - return self.to_numpy().searchsorted(value, side=side, sorter=sorter) + pa_dtype = self.dtype.pyarrow_dtype + if ( + pa.types.is_timestamp(pa_dtype) or pa.types.is_duration(pa_dtype) + ) and pa_dtype.unit == "ns": + # np.array[datetime/timedelta].searchsorted(datetime/timedelta) + # erroneously fails when numpy type resolution is nanoseconds + dtype = object + else: + dtype = None + return self.to_numpy(dtype=dtype).searchsorted(value, side=side, sorter=sorter) def take( self, diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 6c309c0f859ee..81d0c23c18a0c 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -41,6 +41,7 @@ pa_version_under13p0, pa_version_under14p0, ) +import pandas.util._test_decorators as td from pandas.core.dtypes.dtypes import ( ArrowDtype, @@ -297,6 +298,25 @@ def test_astype_str(self, data, request): ) super().test_astype_str(data) + @pytest.mark.parametrize( + "nullable_string_dtype", + [ + "string[python]", + pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), + ], + ) + def test_astype_string(self, data, nullable_string_dtype, request): + pa_dtype = data.dtype.pyarrow_dtype + if ( + pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None + ) or pa.types.is_duration(pa_dtype): + request.applymarker( + pytest.mark.xfail( + reason="pd.Timestamp/pd.Timedelta repr different from numpy repr", + ) + ) + super().test_astype_string(data, nullable_string_dtype) + def test_from_dtype(self, data, request): pa_dtype = data.dtype.pyarrow_dtype if pa.types.is_string(pa_dtype) or pa.types.is_decimal(pa_dtype): From db205b3ce35a4f485fbd58792494b98d494e8722 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 7 Dec 2023 11:54:25 -0800 Subject: [PATCH 6/9] Use invalid_comparison --- pandas/core/arrays/arrow/array.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 170a5695b2bcb..edcf85e89f2d4 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -47,6 +47,7 @@ from pandas.core import ( algorithms as algos, missing, + ops, roperator, ) from pandas.core.arraylike import OpsMixin @@ -655,14 +656,11 @@ def _cmp_method(self, other, op): mask = isna(self) | isna(other) valid = ~mask result = np.zeros(len(self), dtype="bool") + np_array = np.array(self) try: - result[valid] = op(np.array(self)[valid], other) + result[valid] = op(np_array[valid], other) except TypeError: - # Invalid comparison from numpy - if op.__name__ == "ne": - result = ~result - else: - raise + result = ops.invalid_comparison(np_array, other, op) result = pa.array(result, type=pa.bool_()) result = pc.if_else(valid, result, None) else: From 3bfb0d5bffaa4b80088d61565e3c249623e89ff8 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 7 Dec 2023 12:28:21 -0800 Subject: [PATCH 7/9] Only for ArrowDtype --- pandas/core/arrays/arrow/array.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index edcf85e89f2d4..2ca4fd78b0869 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1135,15 +1135,15 @@ def searchsorted( if isinstance(value, ExtensionArray): value = value.astype(object) # Base class searchsorted would cast to object, which is *much* slower. - pa_dtype = self.dtype.pyarrow_dtype - if ( - pa.types.is_timestamp(pa_dtype) or pa.types.is_duration(pa_dtype) - ) and pa_dtype.unit == "ns": - # np.array[datetime/timedelta].searchsorted(datetime/timedelta) - # erroneously fails when numpy type resolution is nanoseconds - dtype = object - else: - dtype = None + dtype = None + if isinstance(self.dtype, ArrowDtype): + pa_dtype = self.dtype.pyarrow_dtype + if ( + pa.types.is_timestamp(pa_dtype) or pa.types.is_duration(pa_dtype) + ) and pa_dtype.unit == "ns": + # np.array[datetime/timedelta].searchsorted(datetime/timedelta) + # erroneously fails when numpy type resolution is nanoseconds + dtype = object return self.to_numpy(dtype=dtype).searchsorted(value, side=side, sorter=sorter) def take( From 1204d1ec8408927c09bc38b518177e7853c59d71 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 7 Dec 2023 12:29:53 -0800 Subject: [PATCH 8/9] adjust message --- pandas/tests/arrays/string_/test_string.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 8dcda44aa68e5..0fe4c91a83435 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -263,7 +263,7 @@ def test_comparison_methods_scalar_not_string(comparison_op, dtype): other = 42 if op_name not in ["__eq__", "__ne__"]: - with pytest.raises(TypeError, match="not supported between"): + with pytest.raises(TypeError, match="Invalid comparison|not supported between"): getattr(a, op_name)(other) return From 546d5e2aa7c7ba774df4b9c941adebed7f83f05f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 8 Dec 2023 10:04:05 -0800 Subject: [PATCH 9/9] Move to 2.2 --- doc/source/whatsnew/v2.1.4.rst | 1 - doc/source/whatsnew/v2.2.0.rst | 12 ++++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v2.1.4.rst b/doc/source/whatsnew/v2.1.4.rst index 1264e841f75b2..041b8fc866edf 100644 --- a/doc/source/whatsnew/v2.1.4.rst +++ b/doc/source/whatsnew/v2.1.4.rst @@ -24,7 +24,6 @@ Bug fixes - Bug in :class:`Series` constructor raising DeprecationWarning when ``index`` is a list of :class:`Series` (:issue:`55228`) - Bug in :class:`Series` when trying to cast date-like string inputs to :class:`ArrowDtype` of ``pyarrow.timestamp`` (:issue:`56266`) - Bug in :class:`Timestamp` construction with ``ts_input="now"`` or ``ts_input="today"`` giving a different unit from :meth:`Timestamp.now` or :meth:`Timestamp.today` (:issue:`55879`) -- Bug in :class:`arrays.ArrowExtensionArray.to_numpy` where ``timestamp`` and ``duration`` types with ``dtype=None`` did not return ``datetime64`` and ``timedelta64`` types respectively (:issue:`55997`) - Bug in :meth:`Index.__getitem__` returning wrong result for Arrow dtypes and negative stepsize (:issue:`55832`) - Fixed bug in :func:`to_numeric` converting to extension dtype for ``string[pyarrow_numpy]`` dtype (:issue:`56179`) - Fixed bug in :meth:`.DataFrameGroupBy.min` and :meth:`.DataFrameGroupBy.max` not preserving extension dtype for empty object (:issue:`55619`) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index c878fd2664dc4..f24209175dca7 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -107,11 +107,11 @@ documentation. .. _whatsnew_220.enhancements.to_numpy_ea: -ExtensionArray.to_numpy converts to suitable NumPy dtype -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +``to_numpy`` for NumPy nullable and Arrow types converts to suitable NumPy dtype +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -:meth:`ExtensionArray.to_numpy` will now convert to a suitable NumPy dtype instead -of ``object`` dtype for nullable extension dtypes. +``to_numpy`` for NumPy nullable and Arrow types will now convert to a +suitable NumPy dtype instead of ``object`` dtype for nullable extension dtypes. *Old behavior:* @@ -128,6 +128,9 @@ of ``object`` dtype for nullable extension dtypes. ser = pd.Series([1, 2, 3], dtype="Int64") ser.to_numpy() + ser = pd.Series([1, 2, 3], dtype="timestamp[ns][pyarrow]") + ser.to_numpy() + The default NumPy dtype (without any arguments) is determined as follows: - float dtypes are cast to NumPy floats @@ -135,6 +138,7 @@ The default NumPy dtype (without any arguments) is determined as follows: - integer dtypes with missing values are cast to NumPy float dtypes and ``NaN`` is used as missing value indicator - boolean dtypes without missing values are cast to NumPy bool dtype - boolean dtypes with missing values keep object dtype +- datetime and timedelta types are cast to Numpy datetime64 and timedelta64 types respectively and ``NaT`` is used as missing value indicator .. _whatsnew_220.enhancements.struct_accessor: