From 9e30b019d6e0e47426b06cec3e37f4b961a55a25 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 8 Mar 2023 10:13:05 +0000 Subject: [PATCH 1/2] BUG: indexing empty pyarrow backed object returning corrupt object (#51741) --- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/_testing/__init__.py | 1 + pandas/core/arrays/arrow/array.py | 7 ++++- pandas/tests/extension/test_arrow.py | 38 ++++++++++++++++++++++++++++ 4 files changed, 46 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 12eb2375b69e1..317eca7dc8723 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -1247,6 +1247,7 @@ Indexing - Bug in :meth:`DataFrame.compare` does not recognize differences when comparing ``NA`` with value in nullable dtypes (:issue:`48939`) - Bug in :meth:`Series.rename` with :class:`MultiIndex` losing extension array dtypes (:issue:`21055`) - Bug in :meth:`DataFrame.isetitem` coercing extension array dtypes in :class:`DataFrame` to object (:issue:`49922`) +- Bug in :meth:`Series.__getitem__` returning corrupt object when selecting from an empty pyarrow backed object (:issue:`51734`) - Bug in :class:`BusinessHour` would cause creation of :class:`DatetimeIndex` to fail when no opening hour was included in the index (:issue:`49835`) Missing diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 69ca809e4f498..f9add5c2c5d88 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -252,6 +252,7 @@ else: FLOAT_PYARROW_DTYPES_STR_REPR = [] ALL_INT_PYARROW_DTYPES_STR_REPR = [] + ALL_PYARROW_DTYPES = [] EMPTY_STRING_PATTERN = re.compile("^$") diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 4e0dd6b75d46a..fbd7626c8637d 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1010,7 +1010,12 @@ def _concat_same_type( ArrowExtensionArray """ chunks = [array for ea in to_concat for array 
in ea._data.iterchunks()] - arr = pa.chunked_array(chunks) + if to_concat[0].dtype == "string": + # StringDtype has no attribute pyarrow_dtype + pa_dtype = pa.string() + else: + pa_dtype = to_concat[0].dtype.pyarrow_dtype + arr = pa.chunked_array(chunks, type=pa_dtype) return cls(arr) def _accumulate( diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index cedaaa500736b..1559f41a42cba 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -2298,3 +2298,41 @@ def test_dt_tz_localize(unit): dtype=ArrowDtype(pa.timestamp(unit, "US/Pacific")), ) tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("skipna", [True, False]) +def test_boolean_reduce_series_all_null(all_boolean_reductions, skipna): + # GH51624 + ser = pd.Series([None], dtype="float64[pyarrow]") + result = getattr(ser, all_boolean_reductions)(skipna=skipna) + if skipna: + expected = all_boolean_reductions == "all" + else: + expected = pd.NA + assert result is expected + + +def test_from_sequence_of_strings_boolean(): + true_strings = ["true", "TRUE", "True", "1", "1.0"] + false_strings = ["false", "FALSE", "False", "0", "0.0"] + nulls = [None] + strings = true_strings + false_strings + nulls + bools = ( + [True] * len(true_strings) + [False] * len(false_strings) + [None] * len(nulls) + ) + + result = ArrowExtensionArray._from_sequence_of_strings(strings, dtype=pa.bool_()) + expected = pd.array(bools, dtype="boolean[pyarrow]") + tm.assert_extension_array_equal(result, expected) + + strings = ["True", "foo"] + with pytest.raises(pa.ArrowInvalid, match="Failed to parse"): + ArrowExtensionArray._from_sequence_of_strings(strings, dtype=pa.bool_()) + + +def test_concat_empty_arrow_backed_series(dtype): + # GH#51734 + ser = pd.Series([], dtype=dtype) + expected = ser.copy() + result = pd.concat([ser[np.array([], dtype=np.bool_)]]) + tm.assert_series_equal(result, expected) From e7408b72045c94ca640491dcc7cc22f00295351c 
Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 8 Mar 2023 13:23:04 +0000 Subject: [PATCH 2/2] Remove tests --- pandas/tests/extension/test_arrow.py | 30 ---------------------------- 1 file changed, 30 deletions(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 1559f41a42cba..91a96c8154779 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -2300,36 +2300,6 @@ def test_dt_tz_localize(unit): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("skipna", [True, False]) -def test_boolean_reduce_series_all_null(all_boolean_reductions, skipna): - # GH51624 - ser = pd.Series([None], dtype="float64[pyarrow]") - result = getattr(ser, all_boolean_reductions)(skipna=skipna) - if skipna: - expected = all_boolean_reductions == "all" - else: - expected = pd.NA - assert result is expected - - -def test_from_sequence_of_strings_boolean(): - true_strings = ["true", "TRUE", "True", "1", "1.0"] - false_strings = ["false", "FALSE", "False", "0", "0.0"] - nulls = [None] - strings = true_strings + false_strings + nulls - bools = ( - [True] * len(true_strings) + [False] * len(false_strings) + [None] * len(nulls) - ) - - result = ArrowExtensionArray._from_sequence_of_strings(strings, dtype=pa.bool_()) - expected = pd.array(bools, dtype="boolean[pyarrow]") - tm.assert_extension_array_equal(result, expected) - - strings = ["True", "foo"] - with pytest.raises(pa.ArrowInvalid, match="Failed to parse"): - ArrowExtensionArray._from_sequence_of_strings(strings, dtype=pa.bool_()) - - def test_concat_empty_arrow_backed_series(dtype): # GH#51734 ser = pd.Series([], dtype=dtype)