From 822ae79b6600d1f388a90c9c681ec2e82c5b478f Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 11 Jan 2023 19:16:51 -0800 Subject: [PATCH 01/10] BUG: ArrowDtype.construct_from_string round-trip --- pandas/core/arrays/arrow/dtype.py | 39 ++++++++++++ pandas/tests/extension/base/dtype.py | 18 +++--- pandas/tests/extension/test_arrow.py | 91 ++++++---------------------- 3 files changed, 66 insertions(+), 82 deletions(-) diff --git a/pandas/core/arrays/arrow/dtype.py b/pandas/core/arrays/arrow/dtype.py index f5f87bea83b8f..cebc920086b7c 100644 --- a/pandas/core/arrays/arrow/dtype.py +++ b/pandas/core/arrays/arrow/dtype.py @@ -148,6 +148,14 @@ def construct_from_string(cls, string: str) -> ArrowDtype: except ValueError as err: has_parameters = re.search(r"\[.*\]", base_type) if has_parameters: + + # Fallback to try common temporal types + try: + return cls._parse_temporal_dtype_string(base_type) + except (NotImplementedError, ValueError): + # Fall through to raise with nice exception message below + pass + raise NotImplementedError( "Passing pyarrow type specific parameters " f"({has_parameters.group()}) in the string is not supported. " @@ -157,6 +165,37 @@ def construct_from_string(cls, string: str) -> ArrowDtype: raise TypeError(f"'{base_type}' is not a valid pyarrow data type.") from err return cls(pa_dtype) + @classmethod + def _parse_temporal_dtype_string(cls, string: str) -> ArrowDtype: + """ + Construct a temporal ArrowDtype from string. + """ + # we assume + # 1) "[pyarrow]" has already been stripped from the end of our string. + # 2) we know "[" is present + head, tail = string.split("[", 1) + + if not tail.endswith("]"): + raise ValueError + tail = tail[:-1] + + if head == "timestamp": + if "," not in tail: + tz = None + unit = tail + else: + unit, tz = tail.split(",", 1) + unit = unit.strip() + tz = tz.strip() + if tz.startswith("tz="): + tz = tz[3:] + + pa_type = pa.timestamp(unit, tz=tz) + dtype = cls(pa_type) + return dtype + + raise NotImplementedError(string) + @property def _is_numeric(self) -> bool: """ diff --git a/pandas/tests/extension/base/dtype.py b/pandas/tests/extension/base/dtype.py index 2635343d73fd7..392a75f8a69a7 100644 --- a/pandas/tests/extension/base/dtype.py +++ b/pandas/tests/extension/base/dtype.py @@ -20,14 +20,6 @@ def test_kind(self, dtype): valid = set("biufcmMOSUV") assert dtype.kind in valid - def test_construct_from_string_own_name(self, dtype): - result = dtype.construct_from_string(dtype.name) - assert type(result) is type(dtype) - - # check OK as classmethod - result = type(dtype).construct_from_string(dtype.name) - assert type(result) is type(dtype) - def test_is_dtype_from_name(self, dtype): result = type(dtype).is_dtype(dtype.name) assert result is True @@ -97,9 +89,13 @@ def test_eq(self, dtype): assert dtype == dtype.name assert dtype != "anonther_type" - def test_construct_from_string(self, dtype): - dtype_instance = type(dtype).construct_from_string(dtype.name) - assert isinstance(dtype_instance, type(dtype)) + def test_construct_from_string_own_name(self, dtype): + result = dtype.construct_from_string(dtype.name) + assert type(result) is type(dtype) + + # check OK as classmethod + result = type(dtype).construct_from_string(dtype.name) + assert type(result) is type(dtype) def test_construct_from_string_another_type_raises(self, dtype): msg = f"Cannot construct a '{type(dtype).__name__}' from 'another_type'" diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 78c49ae066288..9b84443c54cbb 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -245,13 +245,9 @@ def test_astype_str(self, data, request): class TestConstructors(base.BaseConstructorsTests): def test_from_dtype(self, data, request): pa_dtype = data.dtype.pyarrow_dtype - if (pa.types.is_timestamp(pa_dtype) and pa_dtype.tz) or pa.types.is_string( - pa_dtype - ): - if pa.types.is_string(pa_dtype): - reason = "ArrowDtype(pa.string()) != StringDtype('pyarrow')" - else: - reason = f"pyarrow.type_for_alias cannot infer {pa_dtype}" + + if pa.types.is_string(pa_dtype): + reason = "ArrowDtype(pa.string()) != StringDtype('pyarrow')" request.node.add_marker( pytest.mark.xfail( reason=reason, @@ -577,65 +573,24 @@ def test_groupby_extension_agg(self, as_index, data_for_grouping, request): class TestBaseDtype(base.BaseDtypeTests): def test_construct_from_string_own_name(self, dtype, request): pa_dtype = dtype.pyarrow_dtype - if pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is not None: - request.node.add_marker( - pytest.mark.xfail( - raises=NotImplementedError, - reason=f"pyarrow.type_for_alias cannot infer {pa_dtype}", - ) - ) - elif pa.types.is_string(pa_dtype): - request.node.add_marker( - pytest.mark.xfail( - raises=TypeError, - reason=( - "Still support StringDtype('pyarrow') " - "over ArrowDtype(pa.string())" - ), - ) - ) + + if pa.types.is_string(pa_dtype): + # We still support StringDtype('pyarrow') over ArrowDtype(pa.string()) + msg = r"string\[pyarrow\] should be constructed by StringDtype" + with pytest.raises(TypeError, match=msg): + dtype.construct_from_string(dtype.name) + + return + super().test_construct_from_string_own_name(dtype) def test_is_dtype_from_name(self, dtype, request): pa_dtype = dtype.pyarrow_dtype - if pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is not None: - request.node.add_marker( - pytest.mark.xfail( - raises=NotImplementedError, - reason=f"pyarrow.type_for_alias cannot infer {pa_dtype}", - ) - ) - elif pa.types.is_string(pa_dtype): - request.node.add_marker( - pytest.mark.xfail( - reason=( - "Still support StringDtype('pyarrow') " - "over ArrowDtype(pa.string())" - ), - ) - ) - super().test_is_dtype_from_name(dtype) - - def test_construct_from_string(self, dtype, request): - pa_dtype = dtype.pyarrow_dtype - if pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is not None: - request.node.add_marker( - pytest.mark.xfail( - raises=NotImplementedError, - reason=f"pyarrow.type_for_alias cannot infer {pa_dtype}", - ) - ) - elif pa.types.is_string(pa_dtype): - request.node.add_marker( - pytest.mark.xfail( - raises=TypeError, - reason=( - "Still support StringDtype('pyarrow') " - "over ArrowDtype(pa.string())" - ), - ) - ) - super().test_construct_from_string(dtype) + if pa.types.is_string(pa_dtype): + # We still support StringDtype('pyarrow') over ArrowDtype(pa.string()) + assert not type(dtype).is_dtype(dtype.name) + else: + super().test_is_dtype_from_name(dtype) def test_construct_from_string_another_type_raises(self, dtype): msg = r"'another_type' must end with '\[pyarrow\]'" @@ -720,13 +675,6 @@ def test_EA_types(self, engine, data, request): request.node.add_marker( pytest.mark.xfail(raises=TypeError, reason="GH 47534") ) - elif pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is not None: - request.node.add_marker( - pytest.mark.xfail( - raises=NotImplementedError, - reason=f"Parameterized types with tz={pa_dtype.tz} not supported.", - ) - ) elif pa.types.is_timestamp(pa_dtype) and pa_dtype.unit in ("us", "ns"): request.node.add_marker( pytest.mark.xfail( @@ -1354,8 +1302,9 @@ def test_invalid_other_comp(self, data, comparison_op): def test_arrowdtype_construct_from_string_type_with_unsupported_parameters(): - with pytest.raises(NotImplementedError, match="Passing pyarrow type"): - ArrowDtype.construct_from_string("timestamp[s, tz=UTC][pyarrow]") + dtype = ArrowDtype.construct_from_string("timestamp[s, tz=UTC][pyarrow]") + expected = ArrowDtype(pa.timestamp("s", "UTC")) + assert dtype == expected @pytest.mark.parametrize( From f2b72daba7a325aed5df7ce58d2cc98471a66e90 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 12 Jan 2023 11:59:07 -0800 Subject: [PATCH 02/10] ENH: support any/all for pyarrow numeric and duration dtypes --- pandas/core/arrays/arrow/array.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 075beca106e6a..094490110afde 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -11,7 +11,10 @@ import numpy as np -from pandas._libs import lib +from pandas._libs import ( + Timedelta, + lib, +) from pandas._typing import ( ArrayLike, AxisInt, From 5236f7377ae9003839b3c2527ecc316097f5202b Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 12 Jan 2023 14:32:29 -0800 Subject: [PATCH 03/10] mypy fixup --- pandas/core/arrays/arrow/array.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 094490110afde..4e03d5e9de4bb 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1046,6 +1046,7 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs): # pyarrow only supports any/all for boolean dtype, we allow # for other dtypes, matching our non-pyarrow behavior + zero: int | Timedelta if pa.types.is_duration(pa_type): data_to_cmp = self._data.cast(pa.int64()) else: From 3ca05374c9395babbbe4a0b7beb3735da418970b Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 21 Jan 2023 15:31:29 -0800 Subject: [PATCH 04/10] fix not_eq --- pandas/core/arrays/arrow/array.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 4e03d5e9de4bb..075beca106e6a 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -11,10 +11,7 @@ import numpy as np -from pandas._libs import ( - Timedelta, - lib, -) +from pandas._libs import lib from pandas._typing import ( ArrayLike, AxisInt, @@ -1046,7 +1043,6 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs): # pyarrow only supports any/all for boolean dtype, we allow # for other dtypes, matching our non-pyarrow behavior - zero: int | Timedelta if pa.types.is_duration(pa_type): data_to_cmp = self._data.cast(pa.int64()) else: From 46d459657b8186a5bbf292ef3ea36f60d6d5eb2c Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 11 Jan 2023 19:16:51 -0800 Subject: [PATCH 05/10] BUG: ArrowDtype.construct_from_string round-trip --- pandas/core/arrays/arrow/dtype.py | 39 ++++++++++++ pandas/tests/extension/base/dtype.py | 18 +++--- pandas/tests/extension/test_arrow.py | 91 ++++++---------------------- 3 files changed, 66 insertions(+), 82 deletions(-) diff --git a/pandas/core/arrays/arrow/dtype.py b/pandas/core/arrays/arrow/dtype.py index 90c86cd6d55ef..7955e2601059b 100644 --- a/pandas/core/arrays/arrow/dtype.py +++ b/pandas/core/arrays/arrow/dtype.py @@ -161,6 +161,14 @@ def construct_from_string(cls, string: str) -> ArrowDtype: except ValueError as err: has_parameters = re.search(r"\[.*\]", base_type) if has_parameters: + + # Fallback to try common temporal types + try: + return cls._parse_temporal_dtype_string(base_type) + except (NotImplementedError, ValueError): + # Fall through to raise with nice exception message below + pass + raise NotImplementedError( "Passing pyarrow type specific parameters " f"({has_parameters.group()}) in the string is not supported. " @@ -170,6 +178,37 @@ def construct_from_string(cls, string: str) -> ArrowDtype: raise TypeError(f"'{base_type}' is not a valid pyarrow data type.") from err return cls(pa_dtype) + @classmethod + def _parse_temporal_dtype_string(cls, string: str) -> ArrowDtype: + """ + Construct a temporal ArrowDtype from string. + """ + # we assume + # 1) "[pyarrow]" has already been stripped from the end of our string. + # 2) we know "[" is present + head, tail = string.split("[", 1) + + if not tail.endswith("]"): + raise ValueError + tail = tail[:-1] + + if head == "timestamp": + if "," not in tail: + tz = None + unit = tail + else: + unit, tz = tail.split(",", 1) + unit = unit.strip() + tz = tz.strip() + if tz.startswith("tz="): + tz = tz[3:] + + pa_type = pa.timestamp(unit, tz=tz) + dtype = cls(pa_type) + return dtype + + raise NotImplementedError(string) + @property def _is_numeric(self) -> bool: """ diff --git a/pandas/tests/extension/base/dtype.py b/pandas/tests/extension/base/dtype.py index 2635343d73fd7..392a75f8a69a7 100644 --- a/pandas/tests/extension/base/dtype.py +++ b/pandas/tests/extension/base/dtype.py @@ -20,14 +20,6 @@ def test_kind(self, dtype): valid = set("biufcmMOSUV") assert dtype.kind in valid - def test_construct_from_string_own_name(self, dtype): - result = dtype.construct_from_string(dtype.name) - assert type(result) is type(dtype) - - # check OK as classmethod - result = type(dtype).construct_from_string(dtype.name) - assert type(result) is type(dtype) - def test_is_dtype_from_name(self, dtype): result = type(dtype).is_dtype(dtype.name) assert result is True @@ -97,9 +89,13 @@ def test_eq(self, dtype): assert dtype == dtype.name assert dtype != "anonther_type" - def test_construct_from_string(self, dtype): - dtype_instance = type(dtype).construct_from_string(dtype.name) - assert isinstance(dtype_instance, type(dtype)) + def test_construct_from_string_own_name(self, dtype): + result = dtype.construct_from_string(dtype.name) + assert type(result) is type(dtype) + + # check OK as classmethod + result = type(dtype).construct_from_string(dtype.name) + assert type(result) is type(dtype) def test_construct_from_string_another_type_raises(self, dtype): msg = f"Cannot construct a '{type(dtype).__name__}' from 'another_type'" diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 1dac8faa3a9e2..fa85390b2cfa4 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -248,13 +248,9 @@ def test_astype_str(self, data, request): class TestConstructors(base.BaseConstructorsTests): def test_from_dtype(self, data, request): pa_dtype = data.dtype.pyarrow_dtype - if (pa.types.is_timestamp(pa_dtype) and pa_dtype.tz) or pa.types.is_string( - pa_dtype - ): - if pa.types.is_string(pa_dtype): - reason = "ArrowDtype(pa.string()) != StringDtype('pyarrow')" - else: - reason = f"pyarrow.type_for_alias cannot infer {pa_dtype}" + + if pa.types.is_string(pa_dtype): + reason = "ArrowDtype(pa.string()) != StringDtype('pyarrow')" request.node.add_marker( pytest.mark.xfail( reason=reason, @@ -634,65 +630,24 @@ def test_in_numeric_groupby(self, data_for_grouping): class TestBaseDtype(base.BaseDtypeTests): def test_construct_from_string_own_name(self, dtype, request): pa_dtype = dtype.pyarrow_dtype - if pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is not None: - request.node.add_marker( - pytest.mark.xfail( - raises=NotImplementedError, - reason=f"pyarrow.type_for_alias cannot infer {pa_dtype}", - ) - ) - elif pa.types.is_string(pa_dtype): - request.node.add_marker( - pytest.mark.xfail( - raises=TypeError, - reason=( - "Still support StringDtype('pyarrow') " - "over ArrowDtype(pa.string())" - ), - ) - ) + + if pa.types.is_string(pa_dtype): + # We still support StringDtype('pyarrow') over ArrowDtype(pa.string()) + msg = r"string\[pyarrow\] should be constructed by StringDtype" + with pytest.raises(TypeError, match=msg): + dtype.construct_from_string(dtype.name) + + return + super().test_construct_from_string_own_name(dtype) def test_is_dtype_from_name(self, dtype, request): pa_dtype = dtype.pyarrow_dtype - if pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is not None: - request.node.add_marker( - pytest.mark.xfail( - raises=NotImplementedError, - reason=f"pyarrow.type_for_alias cannot infer {pa_dtype}", - ) - ) - elif pa.types.is_string(pa_dtype): - request.node.add_marker( - pytest.mark.xfail( - reason=( - "Still support StringDtype('pyarrow') " - "over ArrowDtype(pa.string())" - ), - ) - ) - super().test_is_dtype_from_name(dtype) - - def test_construct_from_string(self, dtype, request): - pa_dtype = dtype.pyarrow_dtype - if pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is not None: - request.node.add_marker( - pytest.mark.xfail( - raises=NotImplementedError, - reason=f"pyarrow.type_for_alias cannot infer {pa_dtype}", - ) - ) - elif pa.types.is_string(pa_dtype): - request.node.add_marker( - pytest.mark.xfail( - raises=TypeError, - reason=( - "Still support StringDtype('pyarrow') " - "over ArrowDtype(pa.string())" - ), - ) - ) - super().test_construct_from_string(dtype) + if pa.types.is_string(pa_dtype): + # We still support StringDtype('pyarrow') over ArrowDtype(pa.string()) + assert not type(dtype).is_dtype(dtype.name) + else: + super().test_is_dtype_from_name(dtype) def test_construct_from_string_another_type_raises(self, dtype): msg = r"'another_type' must end with '\[pyarrow\]'" @@ -783,13 +738,6 @@ def test_EA_types(self, engine, data, request): request.node.add_marker( pytest.mark.xfail(raises=TypeError, reason="GH 47534") ) - elif pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is not None: - request.node.add_marker( - pytest.mark.xfail( - raises=NotImplementedError, - reason=f"Parameterized types with tz={pa_dtype.tz} not supported.", - ) - ) elif pa.types.is_timestamp(pa_dtype) and pa_dtype.unit in ("us", "ns"): request.node.add_marker( pytest.mark.xfail( @@ -1294,8 +1242,9 @@ def test_invalid_other_comp(self, data, comparison_op): def test_arrowdtype_construct_from_string_type_with_unsupported_parameters(): - with pytest.raises(NotImplementedError, match="Passing pyarrow type"): - ArrowDtype.construct_from_string("timestamp[s, tz=UTC][pyarrow]") + dtype = ArrowDtype.construct_from_string("timestamp[s, tz=UTC][pyarrow]") + expected = ArrowDtype(pa.timestamp("s", "UTC")) + assert dtype == expected @pytest.mark.parametrize( From dd221a0daf7169c7061cba1cf03829749d5cc70d Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 9 Feb 2023 16:28:19 -0800 Subject: [PATCH 06/10] TST: fix pyarrow xfails with date/time types --- pandas/_libs/missing.pyx | 19 +++++++--- pandas/core/arrays/arrow/array.py | 8 +++++ pandas/tests/extension/test_arrow.py | 54 +++++++++++++--------------- 3 files changed, 47 insertions(+), 34 deletions(-) diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index e6516b004a973..0db99e5f66f30 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -1,13 +1,14 @@ -from decimal import Decimal -import numbers -from sys import maxsize - -cimport cython from cpython.datetime cimport ( date, time, timedelta, ) + +from decimal import Decimal +import numbers +from sys import maxsize + +cimport cython from cython cimport Py_ssize_t import numpy as np @@ -338,6 +339,14 @@ def _create_binary_propagating_op(name, is_divmod=False): elif is_cmp and isinstance(other, (date, time, timedelta)): return NA + elif isinstance(other, date): + if name in ["__sub__", "__rsub__"]: + return NA + + elif isinstance(other, timedelta): + if name in ["__sub__", "__rsub__", "__add__", "__radd__"]: + return NA + return NotImplemented method.__name__ = name diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 075beca106e6a..c658b9aa844c3 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1319,6 +1319,14 @@ def _mode(self: ArrowExtensionArrayT, dropna: bool = True) -> ArrowExtensionArra Sorted, if possible. """ pa_type = self._data.type + if pa.types.is_string(pa_type) or pa.types.is_binary(pa_type): + # GH#50982 we can get the behavior that issue expects by passing + # dropna to value_counts. For now we do not pass the keyword in + # order to get the behavior the tests expect. + vcs = self.value_counts() + res_ser = vcs[vcs == vcs.max()].sort_index() + return res_ser.index._values + if pa.types.is_temporal(pa_type): nbits = pa_type.bit_width if nbits == 32: diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index fa85390b2cfa4..79e04b87b8840 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -422,7 +422,7 @@ def test_accumulate_series(self, data, all_numeric_accumulations, skipna, reques raises=NotImplementedError, ) ) - elif all_numeric_accumulations == "cumsum" and (pa.types.is_boolean(pa_type)): + elif all_numeric_accumulations == "cumsum" and pa.types.is_boolean(pa_type): request.node.add_marker( pytest.mark.xfail( reason=f"{all_numeric_accumulations} not implemented for {pa_type}", @@ -897,17 +897,7 @@ def test_factorize(self, data_for_grouping, request): def test_combine_add(self, data_repeated, request): pa_dtype = next(data_repeated(1)).dtype.pyarrow_dtype - if pa.types.is_duration(pa_dtype): - # TODO: this fails on the scalar addition constructing 'expected' - # but not in the actual 'combine' call, so may be salvage-able - mark = pytest.mark.xfail( - raises=TypeError, - reason=f"{pa_dtype} cannot be added to {pa_dtype}", - ) - request.node.add_marker(mark) - super().test_combine_add(data_repeated) - - elif pa.types.is_temporal(pa_dtype): + if pa.types.is_temporal(pa_dtype) and not pa.types.is_duration(pa_dtype): # analogous to datetime64, these cannot be added orig_data1, orig_data2 = data_repeated(2) s1 = pd.Series(orig_data1) @@ -954,14 +944,24 @@ def _patch_combine(self, obj, other, op): pa_expected = pa.array(expected_data._values) if pa.types.is_duration(pa_expected.type): - # pyarrow sees sequence of datetime/timedelta objects and defaults - # to "us" but the non-pointwise op retains unit - unit = original_dtype.pyarrow_dtype.unit - if type(other) in [datetime, timedelta] and unit in ["s", "ms"]: - # pydatetime/pytimedelta objects have microsecond reso, so we - # take the higher reso of the original and microsecond. Note - # this matches what we would do with DatetimeArray/TimedeltaArray - unit = "us" + orig_pa_type = original_dtype.pyarrow_dtype + if pa.types.is_date(orig_pa_type): + if pa.types.is_date64(orig_pa_type): + # TODO: why is this different vs date32? + unit = "ms" + else: + unit = "s" + else: + # pyarrow sees sequence of datetime/timedelta objects and defaults + # to "us" but the non-pointwise op retains unit + # timestamp or duration + unit = orig_pa_type.unit + if type(other) in [datetime, timedelta] and unit in ["s", "ms"]: + # pydatetime/pytimedelta objects have microsecond reso, so we + # take the higher reso of the original and microsecond. Note + # this matches what we would do with DatetimeArray/TimedeltaArray + unit = "us" + pa_expected = pa_expected.cast(f"duration[{unit}]") else: pa_expected = pa_expected.cast(original_dtype.pyarrow_dtype) @@ -1014,7 +1014,7 @@ def _get_arith_xfail_marker(self, opname, pa_dtype): f"for {pa_dtype}" ) ) - elif arrow_temporal_supported: + elif arrow_temporal_supported and pa.types.is_time(pa_dtype): mark = pytest.mark.xfail( raises=TypeError, reason=( @@ -1059,6 +1059,7 @@ def test_arith_series_with_scalar( ) or pa.types.is_duration(pa_dtype) or pa.types.is_timestamp(pa_dtype) + or pa.types.is_date(pa_dtype) ): # BaseOpsUtil._combine always returns int64, while ArrowExtensionArray does # not upcast @@ -1090,6 +1091,7 @@ def test_arith_frame_with_scalar( ) or pa.types.is_duration(pa_dtype) or pa.types.is_timestamp(pa_dtype) + or pa.types.is_date(pa_dtype) ): # BaseOpsUtil._combine always returns int64, while ArrowExtensionArray does # not upcast @@ -1142,6 +1144,7 @@ def test_arith_series_with_array( ) or pa.types.is_duration(pa_dtype) or pa.types.is_timestamp(pa_dtype) + or pa.types.is_date(pa_dtype) ): monkeypatch.setattr(TestBaseArithmeticOps, "_combine", self._patch_combine) self.check_opname(ser, op_name, other, exc=self.series_array_exc) @@ -1327,14 +1330,7 @@ def test_quantile(data, interpolation, quantile, request): ) def test_mode(data_for_grouping, dropna, take_idx, exp_idx, request): pa_dtype = data_for_grouping.dtype.pyarrow_dtype - if pa.types.is_string(pa_dtype) or pa.types.is_binary(pa_dtype): - request.node.add_marker( - pytest.mark.xfail( - raises=pa.ArrowNotImplementedError, - reason=f"mode not supported by pyarrow for {pa_dtype}", - ) - ) - elif ( + if ( pa.types.is_boolean(pa_dtype) and "multi_mode" in request.node.nodeid and pa_version_under9p0 From 95bf2ebafaf786a14f85c480bc9b8ea6f5f42b6f Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 9 Feb 2023 16:30:41 -0800 Subject: [PATCH 07/10] trim diff --- pandas/_libs/missing.pyx | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index 0db99e5f66f30..8df73398f1e4a 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -1,14 +1,13 @@ -from cpython.datetime cimport ( - date, - time, - timedelta, -) - from decimal import Decimal import numbers from sys import maxsize cimport cython +from cpython.datetime cimport ( + date, + time, + timedelta, +) from cython cimport Py_ssize_t import numpy as np From a2b655157da90271352c81ef8b340d99cb385364 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 16 Feb 2023 20:13:07 -0800 Subject: [PATCH 08/10] lint fixup --- pandas/core/arrays/arrow/dtype.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/arrays/arrow/dtype.py b/pandas/core/arrays/arrow/dtype.py index 402281f3b312a..6bcf2e2f265c2 100644 --- a/pandas/core/arrays/arrow/dtype.py +++ b/pandas/core/arrays/arrow/dtype.py @@ -203,7 +203,6 @@ def construct_from_string(cls, string: str) -> ArrowDtype: except ValueError as err: has_parameters = re.search(r"\[.*\]", base_type) if has_parameters: - # Fallback to try common temporal types try: return cls._parse_temporal_dtype_string(base_type) From a2a236f5306f8a3d1802c2e7bd2887f1f9fd9443 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 17 Feb 2023 15:45:38 -0800 Subject: [PATCH 09/10] suggestions --- pandas/core/arrays/arrow/dtype.py | 17 +++++++---------- pandas/tests/extension/test_arrow.py | 4 ++++ 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/pandas/core/arrays/arrow/dtype.py b/pandas/core/arrays/arrow/dtype.py index 402281f3b312a..fdb9ac877831b 100644 --- a/pandas/core/arrays/arrow/dtype.py +++ b/pandas/core/arrays/arrow/dtype.py @@ -203,7 +203,6 @@ def construct_from_string(cls, string: str) -> ArrowDtype: except ValueError as err: has_parameters = re.search(r"\[.*\]", base_type) if has_parameters: - # Fallback to try common temporal types try: return cls._parse_temporal_dtype_string(base_type) @@ -220,6 +219,7 @@ def construct_from_string(cls, string: str) -> ArrowDtype: raise TypeError(f"'{base_type}' is not a valid pyarrow data type.") from err return cls(pa_dtype) + # TODO(arrow#33642): This can be removed once supported by pyarrow @classmethod def _parse_temporal_dtype_string(cls, string: str) -> ArrowDtype: """ @@ -235,15 +235,12 @@ def _parse_temporal_dtype_string(cls, string: str) -> ArrowDtype: tail = tail[:-1] if head == "timestamp": - if "," not in tail: - tz = None - unit = tail - else: - unit, tz = tail.split(",", 1) - unit = unit.strip() - tz = tz.strip() - if tz.startswith("tz="): - tz = tz[3:] + assert "," in tail # otherwise type_for_alias should work + unit, tz = tail.split(",", 1) + unit = unit.strip() + tz = tz.strip() + if tz.startswith("tz="): + tz = tz[3:] pa_type = pa.timestamp(unit, tz=tz) dtype = cls(pa_type) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 87631bfe02496..c53b6e6658979 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1213,6 +1213,10 @@ def test_invalid_other_comp(self, data, comparison_op): def test_arrowdtype_construct_from_string_type_with_unsupported_parameters(): + with pytest.raises(NotImplementedError, match="Passing pyarrow type"): + ArrowDtype.construct_from_string("not_a_real_dype[s, tz=UTC][pyarrow]") + + # but as of GH#50689, timestamptz is supported dtype = ArrowDtype.construct_from_string("timestamp[s, tz=UTC][pyarrow]") expected = ArrowDtype(pa.timestamp("s", "UTC")) assert dtype == expected From b1698fbc112faea223515bc3a7df43abb5416cad Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 18 Feb 2023 12:49:17 -0800 Subject: [PATCH 10/10] remove special-casing --- pandas/core/arrays/arrow/array.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 7b41473ae80b1..eeb252b10b1ea 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1377,14 +1377,6 @@ def _mode(self: ArrowExtensionArrayT, dropna: bool = True) -> ArrowExtensionArra Sorted, if possible. """ pa_type = self._data.type - if pa.types.is_string(pa_type) or pa.types.is_binary(pa_type): - # GH#50982 we can get the behavior that issue expects by passing - # dropna to value_counts. For now we do not pass the keyword in - # order to get the behavior the tests expect. - vcs = self.value_counts() - res_ser = vcs[vcs == vcs.max()].sort_index() - return res_ser.index._values - if pa.types.is_temporal(pa_type): nbits = pa_type.bit_width if nbits == 32: