From 6accf2be0000f2ce6e1c6a5091fd3ea5d32620d9 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 22 Oct 2023 23:33:54 +0200 Subject: [PATCH 01/30] BUG: setitem casting object Index to arrow strings --- doc/source/whatsnew/v2.1.2.rst | 2 ++ pandas/core/construction.py | 12 ++++++++++++ pandas/core/indexes/base.py | 10 +++++++++- pandas/tests/frame/indexing/test_setitem.py | 9 +++++++++ 4 files changed, 32 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index 97a718dd496e9..0aff745d1c7f0 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -24,9 +24,11 @@ Fixed regressions Bug fixes ~~~~~~~~~ - Fixed bug in :meth:`Categorical.equals` if other has arrow backed string dtype (:issue:`55364`) +- Fixed bug in :meth:`DataFrame.__setitem__` casting :class:`Index` with object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`) - Fixed bug in :meth:`DataFrame.__setitem__` not inferring string dtype for zero-dimensional array with ``infer_string=True`` (:issue:`55366`) - Fixed bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax` raising for arrow dtypes (:issue:`55368`) - Fixed bug in :meth:`DataFrame.interpolate` raising incorrect error message (:issue:`55347`) +- Fixed bug in :meth:`Index.insert` casting object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`) - Fixed bug in :meth:`Index.insert` raising when inserting ``None`` into :class:`Index` with ``dtype="string[pyarrow_numpy]"`` (:issue:`55365`) - Fixed bug in :meth:`Series.all` and :meth:`Series.any` not treating missing values correctly for ``dtype="string[pyarrow_numpy]"`` (:issue:`55367`) - Fixed bug in :meth:`Series.floordiv` for :class:`ArrowDtype` (:issue:`55561`) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index b4b9a4176472d..b0241f950662d 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py 
@@ -49,6 +49,7 @@ from pandas.core.dtypes.common import ( is_list_like, is_object_dtype, + is_string_dtype, pandas_dtype, ) from pandas.core.dtypes.dtypes import NumpyEADtype @@ -547,6 +548,10 @@ def sanitize_array( # Avoid ending up with a NumpyExtensionArray dtype = dtype.numpy_dtype + object_index = False + if isinstance(data, ABCIndex) and data.dtype == object and dtype is None: + object_index = True + # extract ndarray or ExtensionArray, ensure we have no NumpyExtensionArray data = extract_array(data, extract_numpy=True, extract_range=True) @@ -596,6 +601,13 @@ def sanitize_array( subarr = data if data.dtype == object: subarr = maybe_infer_to_datetimelike(data) + if ( + object_index + and using_pyarrow_string_dtype() + and is_string_dtype(subarr) + ): + # Avoid inference when string option is set + subarr = data elif data.dtype.kind == "U" and using_pyarrow_string_dtype(): from pandas.core.arrays.string_ import StringDtype diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 252e88d7c7d51..4414f358b39bb 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -23,6 +23,7 @@ from pandas._config import ( get_option, using_copy_on_write, + using_pyarrow_string_dtype, ) from pandas._libs import ( @@ -6916,7 +6917,14 @@ def insert(self, loc: int, item) -> Index: loc = loc if loc >= 0 else loc - 1 new_values[loc] = item - return Index._with_infer(new_values, name=self.name) + idx = Index._with_infer(new_values, name=self.name) + if ( + using_pyarrow_string_dtype() + and is_string_dtype(idx.dtype) + and new_values.dtype == object + ): + idx = idx.astype(new_values.dtype) + return idx def drop( self, diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index ccc1249088f9a..fc2e817b1600e 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -753,6 +753,15 @@ def test_setitem_frame_overwrite_with_ea_dtype(self, 
any_numeric_ea_dtype): ) tm.assert_frame_equal(df, expected) + def test_setitem_string_option_object_index(self): + # GH#55638 + pytest.importorskip("pyarrow") + df = DataFrame({"a": [1, 2]}) + with pd.option_context("future.infer_string", True): + df["b"] = Index(["a", "b"], dtype=object) + expected = DataFrame({"a": [1, 2], "b": Series(["a", "b"], dtype=object)}) + tm.assert_frame_equal(df, expected) + def test_setitem_frame_midx_columns(self): # GH#49121 df = DataFrame({("a", "b"): [10]}) From 61f49034ff98d4a5014f238c11c5d5ad9a84e49b Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 23 Oct 2023 00:08:28 +0200 Subject: [PATCH 02/30] Fix --- pandas/tests/frame/indexing/test_indexing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index de8df15a9d747..51b0a0a13d90b 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1913,7 +1913,7 @@ def test_add_new_column_infer_string(): df.loc[df["x"] == 1, "y"] = "1" expected = DataFrame( {"x": [1], "y": Series(["1"], dtype="string[pyarrow_numpy]")}, - columns=Index(["x", "y"], dtype="string[pyarrow_numpy]"), + columns=Index(["x", "y"], dtype=object), ) tm.assert_frame_equal(df, expected) From 2effd1f304a54ee76cb6a864e6a340c7f1934eb4 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Fri, 3 Nov 2023 20:43:06 -0400 Subject: [PATCH 03/30] Start fixing index tests --- pandas/conftest.py | 8 ++++++++ pandas/core/config_init.py | 2 +- pandas/tests/indexes/test_base.py | 26 ++++++++++++++++++-------- pandas/tests/indexes/test_old_base.py | 1 + 4 files changed, 28 insertions(+), 9 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index efe263a41afe1..df2cad430bf58 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -2008,6 +2008,14 @@ def using_copy_on_write() -> bool: ) +@pytest.fixture 
+def using_infer_string() -> bool: + """ + Fixture to check if the ``future.infer_string`` option is enabled. + """ + return pd.options.future.infer_string is True + + @pytest.fixture def warn_copy_on_write() -> bool: """ diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index a8b63f97141c2..bdbab78a443de 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -905,7 +905,7 @@ def register_converter_cb(key) -> None: with cf.config_prefix("future"): cf.register_option( "infer_string", - False, + True, "Whether to infer sequence of str objects as pyarrow string " "dtype, which will be the default in pandas 3.0 " "(at which point this option will be deprecated).", diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 0c32149124ac6..1d3379ad29750 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -70,12 +70,17 @@ def test_constructor_casting(self, index): tm.assert_index_equal(index, new_index) @pytest.mark.parametrize("index", ["string"], indirect=True) - def test_constructor_copy(self, index): + def test_constructor_copy(self, index, using_infer_string): arr = np.array(index) new_index = Index(arr, copy=True, name="name") assert isinstance(new_index, Index) assert new_index.name == "name" - tm.assert_numpy_array_equal(arr, new_index.values) + if using_infer_string: + tm.assert_extension_array_equal( + new_index.values, pd.array(arr, dtype="string[pyarrow_numpy]") + ) + else: + tm.assert_numpy_array_equal(arr, new_index.values) arr[0] = "SOMEBIGLONGSTRING" assert new_index[0] != "SOMEBIGLONGSTRING" @@ -143,7 +148,7 @@ def test_constructor_from_series_freq(self): tm.assert_index_equal(result, expected) - def test_constructor_from_frame_series_freq(self): + def test_constructor_from_frame_series_freq(self, using_infer_string): # GH 6273 # create from a series, passing a freq dts = ["1-1-1990", "2-1-1990", "3-1-1990", "4-1-1990", "5-1-1990"] @@ -152,8 +157,8 @@ def
test_constructor_from_frame_series_freq(self): df = DataFrame(np.random.default_rng(2).random((5, 3))) df["date"] = dts result = DatetimeIndex(df["date"], freq="MS") - - assert df["date"].dtype == object + dtype = object if not using_infer_string else "string" + assert df["date"].dtype == dtype expected.name = "date" tm.assert_index_equal(result, expected) @@ -344,6 +349,9 @@ def test_view_with_args_object_array_raises(self, index): msg = "When changing to a larger dtype" with pytest.raises(ValueError, match=msg): index.view("i8") + elif index.dtype == "string": + with pytest.raises(NotImplementedError, match="i8"): + index.view("i8") else: msg = "Cannot change data-type for object array" with pytest.raises(TypeError, match=msg): @@ -477,7 +485,7 @@ def test_empty_fancy_raises(self, index): assert index[[]].identical(empty_index) # np.ndarray only accepts ndarray of int & bool dtypes, so should Index - msg = r"arrays used as indices must be of integer \(or boolean\) type" + msg = r"arrays used as indices must be of integer" with pytest.raises(IndexError, match=msg): index[empty_farr] @@ -653,7 +661,9 @@ def test_is_numeric(self, index, expected): ], indirect=["index"], ) - def test_is_object(self, index, expected): + def test_is_object(self, index, expected, using_infer_string): + if using_infer_string and index.dtype == "string" and expected: + expected = False assert is_object_dtype(index) is expected def test_summary(self, index): @@ -773,7 +783,7 @@ def test_drop_by_numeric_label_errors_ignore(self, key, expected): def test_drop_tuple(self, values, to_drop): # GH 18304 index = Index(values) - expected = Index(["b"]) + expected = Index(["b"], dtype=object) result = index.drop(to_drop) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index 32adbc693390b..cd6bb99b5e647 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -413,6 +413,7 @@ 
def test_insert_base(self, index): # test 0th element assert index[0:4].equals(result.insert(0, index[0])) + @pytest.mark.skip() def test_insert_out_of_bounds(self, index): # TypeError/IndexError matches what np.insert raises in these cases From 6569663010e6e134dc8853452d2ef7e2b0da9ab1 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Fri, 3 Nov 2023 20:46:44 -0400 Subject: [PATCH 04/30] BUG: Index.isin raising for arrow strings and null set --- doc/source/whatsnew/v2.1.3.rst | 2 +- pandas/core/arrays/string_arrow.py | 4 +++- pandas/tests/indexes/test_base.py | 12 ++++++++++-- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v2.1.3.rst b/doc/source/whatsnew/v2.1.3.rst index 3b1cd1c152baa..31ab01f171b4a 100644 --- a/doc/source/whatsnew/v2.1.3.rst +++ b/doc/source/whatsnew/v2.1.3.rst @@ -22,7 +22,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ - Bug in :meth:`DatetimeIndex.diff` raising ``TypeError`` (:issue:`55080`) -- +- Bug in :meth:`Index.isin` raising for Arrow backed string and ``None`` value (:issue:`55821`) .. --------------------------------------------------------------------------- .. 
_whatsnew_213.other: diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index ae020ec2f599f..1c2ecd4684e2f 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -222,7 +222,9 @@ def isin(self, values) -> npt.NDArray[np.bool_]: if not len(value_set): return np.zeros(len(self), dtype=bool) - result = pc.is_in(self._pa_array, value_set=pa.array(value_set)) + result = pc.is_in( + self._pa_array, value_set=pa.array(value_set, type=self._pa_array.type) + ) # pyarrow 2.0.0 returned nulls, so we explicily specify dtype to convert nulls # to False return np.array(result, dtype=np.bool_) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 0c32149124ac6..828700573c7ea 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -9,7 +9,7 @@ from pandas.compat import IS64 from pandas.errors import InvalidIndexError -from pandas.util._test_decorators import async_mark +import pandas.util._test_decorators as td from pandas.core.dtypes.common import ( is_any_real_numeric_dtype, @@ -921,6 +921,14 @@ def test_isin_empty(self, empty): result = index.isin(empty) tm.assert_numpy_array_equal(expected, result) + @td.skip_if_no("pyarrow") + def test_isin_arrow_string_null(self): + # GH#55821 + index = Index(["a", "b"], dtype="string[pyarrow_numpy]") + result = index.isin([None]) + expected = np.array([False, False]) + tm.assert_numpy_array_equal(result, expected) + @pytest.mark.parametrize( "values", [ @@ -1235,7 +1243,7 @@ def test_cached_properties_not_settable(self): with pytest.raises(AttributeError, match="Can't set attribute"): index.is_unique = False - @async_mark() + @td.async_mark() async def test_tab_complete_warning(self, ip): # https://github.com/pandas-dev/pandas/issues/16409 pytest.importorskip("IPython", minversion="6.0.0") From cdbdba2993ba2f633ef1b070f3434533c6b33044 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler 
<61934744+phofl@users.noreply.github.com> Date: Fri, 3 Nov 2023 21:00:26 -0400 Subject: [PATCH 05/30] Fix more tests --- pandas/tests/indexes/test_base.py | 19 ++++++++++++++----- pandas/tests/indexes/test_old_base.py | 4 ++-- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index b8756fce3dd95..0eb9f9cdc5072 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -836,9 +836,12 @@ def test_isin(self, values, index, expected): result = index.isin(values) tm.assert_numpy_array_equal(result, expected) - def test_isin_nan_common_object(self, nulls_fixture, nulls_fixture2): + def test_isin_nan_common_object( + self, nulls_fixture, nulls_fixture2, using_infer_string + ): # Test cartesian product of null fixtures and ensure that we don't # mangle the various types (save a corner case with PyPy) + idx = Index(["a", nulls_fixture]) # all nans are the same if ( @@ -848,19 +851,25 @@ def test_isin_nan_common_object(self, nulls_fixture, nulls_fixture2): and math.isnan(nulls_fixture2) ): tm.assert_numpy_array_equal( - Index(["a", nulls_fixture]).isin([nulls_fixture2]), + idx.isin([nulls_fixture2]), np.array([False, True]), ) elif nulls_fixture is nulls_fixture2: # should preserve NA type tm.assert_numpy_array_equal( - Index(["a", nulls_fixture]).isin([nulls_fixture2]), + idx.isin([nulls_fixture2]), + np.array([False, True]), + ) + + elif using_infer_string and idx.dtype == "string": + tm.assert_numpy_array_equal( + idx.isin([nulls_fixture2]), np.array([False, True]), ) else: tm.assert_numpy_array_equal( - Index(["a", nulls_fixture]).isin([nulls_fixture2]), + idx.isin([nulls_fixture2]), np.array([False, False]), ) @@ -1136,7 +1145,7 @@ def test_reindex_preserves_name_if_target_is_list_or_ndarray(self, name, labels) def test_reindex_preserves_type_if_target_is_empty_list_or_array(self, labels): # GH7774 index = Index(list("abc")) - assert 
index.reindex(labels)[0].dtype.type == np.object_ + assert index.reindex(labels)[0].dtype.type == index.dtype.type @pytest.mark.parametrize( "labels,dtype", diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index cd6bb99b5e647..9b945c5f7a3a1 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -209,7 +209,7 @@ def test_numeric_compat(self, simple_index): 1 // idx def test_logical_compat(self, simple_index): - if simple_index.dtype == object: + if simple_index.dtype in (object, "string"): pytest.skip("Tested elsewhere.") idx = simple_index if idx.dtype.kind in "iufcbm": @@ -295,7 +295,7 @@ def test_ensure_copied_data(self, index): tm.assert_numpy_array_equal( index._values._ndarray, result._values._ndarray, check_same="same" ) - elif index.dtype == "string[pyarrow]": + elif index.dtype in ("string[pyarrow]", "string[pyarrow_numpy]"): assert tm.shares_memory(result._values, index._values) else: raise NotImplementedError(index.dtype) From 57cb8f43a9942199b6f4d512ecf22baedce754c2 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Fri, 3 Nov 2023 21:09:40 -0400 Subject: [PATCH 06/30] TST: Fix shares_memory for arrow string dtype --- pandas/_testing/__init__.py | 13 +++++++++++-- pandas/tests/util/test_shares_memory.py | 16 ++++++++++++++++ 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 81cd504119c38..ee173c6892743 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -30,6 +30,7 @@ is_float_dtype, is_sequence, is_signed_integer_dtype, + is_string_dtype, is_unsigned_integer_dtype, pandas_dtype, ) @@ -1044,10 +1045,18 @@ def shares_memory(left, right) -> bool: if isinstance(left, pd.core.arrays.IntervalArray): return shares_memory(left._left, right) or shares_memory(left._right, right) - if isinstance(left, ExtensionArray) and left.dtype == 
"string[pyarrow]": + if ( + isinstance(left, ExtensionArray) + and is_string_dtype(left.dtype) + and left.dtype.storage in ("pyarrow", "pyarrow_numpy") + ): # https://github.com/pandas-dev/pandas/pull/43930#discussion_r736862669 left = cast("ArrowExtensionArray", left) - if isinstance(right, ExtensionArray) and right.dtype == "string[pyarrow]": + if ( + isinstance(right, ExtensionArray) + and is_string_dtype(right.dtype) + and right.dtype.storage in ("pyarrow", "pyarrow_numpy") + ): right = cast("ArrowExtensionArray", right) left_pa_data = left._pa_array right_pa_data = right._pa_array diff --git a/pandas/tests/util/test_shares_memory.py b/pandas/tests/util/test_shares_memory.py index ed8227a5c4307..e4dca0f36b111 100644 --- a/pandas/tests/util/test_shares_memory.py +++ b/pandas/tests/util/test_shares_memory.py @@ -1,3 +1,5 @@ +import pandas.util._test_decorators as td + import pandas as pd import pandas._testing as tm @@ -11,3 +13,17 @@ def test_shares_memory_interval(): assert tm.shares_memory(obj, obj[:2]) assert not tm.shares_memory(obj, obj._data.copy()) + + +@td.skip_if_no("pyarrow") +def test_shares_memory_string(): + import pyarrow as pa + + obj = pd.array(["a", "b"], dtype="string[pyarrow]") + assert tm.shares_memory(obj, obj) + + obj = pd.array(["a", "b"], dtype="string[pyarrow_numpy]") + assert tm.shares_memory(obj, obj) + + obj = pd.array(["a", "b"], dtype=pd.ArrowDtype(pa.string())) + assert tm.shares_memory(obj, obj) From 7697059e5b653f74ae5f3962ec02f5e1b715aa2d Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Fri, 3 Nov 2023 21:09:51 -0400 Subject: [PATCH 07/30] TST: Fix shares_memory for arrow string dtype --- pandas/tests/util/test_shares_memory.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/util/test_shares_memory.py b/pandas/tests/util/test_shares_memory.py index e4dca0f36b111..090b04d5b6b01 100644 --- a/pandas/tests/util/test_shares_memory.py +++ 
b/pandas/tests/util/test_shares_memory.py @@ -17,6 +17,7 @@ def test_shares_memory_interval(): @td.skip_if_no("pyarrow") def test_shares_memory_string(): + # GH#55822 import pyarrow as pa obj = pd.array(["a", "b"], dtype="string[pyarrow]") From 285f7bbf1436cff70fc7701bd6b172bf905e2d66 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Fri, 3 Nov 2023 21:10:22 -0400 Subject: [PATCH 08/30] TST: Fix shares_memory for arrow string dtype --- pandas/tests/util/test_shares_memory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/util/test_shares_memory.py b/pandas/tests/util/test_shares_memory.py index 090b04d5b6b01..00a897d574a07 100644 --- a/pandas/tests/util/test_shares_memory.py +++ b/pandas/tests/util/test_shares_memory.py @@ -17,7 +17,7 @@ def test_shares_memory_interval(): @td.skip_if_no("pyarrow") def test_shares_memory_string(): - # GH#55822 + # GH#55823 import pyarrow as pa obj = pd.array(["a", "b"], dtype="string[pyarrow]") From f59c4bc41fe11b42a0fb88f977ddf994e48d5037 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 4 Nov 2023 22:43:01 -0400 Subject: [PATCH 09/30] Fix more tests --- pandas/tests/indexes/base_class/test_setops.py | 2 +- pandas/tests/indexes/object/test_indexing.py | 2 +- .../tests/indexes/period/methods/test_astype.py | 2 +- pandas/tests/indexes/test_old_base.py | 15 +++++++++++---- .../indexes/timedeltas/methods/test_astype.py | 2 +- 5 files changed, 15 insertions(+), 8 deletions(-) diff --git a/pandas/tests/indexes/base_class/test_setops.py b/pandas/tests/indexes/base_class/test_setops.py index 488f79eea0d11..b6c318c1b1650 100644 --- a/pandas/tests/indexes/base_class/test_setops.py +++ b/pandas/tests/indexes/base_class/test_setops.py @@ -147,7 +147,7 @@ def test_intersection_str_dates(self, sort): def test_intersection_non_monotonic_non_unique(self, index2, expected_arr, sort): # non-monotonic non-unique index1 = 
Index(["A", "B", "A", "C"]) - expected = Index(expected_arr, dtype="object") + expected = Index(expected_arr) result = index1.intersection(index2, sort=sort) if sort is None: expected = expected.sort_values() diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py index 87d3afc77d556..a6f48cf041060 100644 --- a/pandas/tests/indexes/object/test_indexing.py +++ b/pandas/tests/indexes/object/test_indexing.py @@ -172,7 +172,7 @@ def test_slice_locs_negative_step(self, in_slice, expected): s_start, s_stop = index.slice_locs(in_slice.start, in_slice.stop, in_slice.step) result = index[s_start : s_stop : in_slice.step] - expected = Index(list(expected)) + expected = Index(list(expected), dtype=result.dtype) tm.assert_index_equal(result, expected) def test_slice_locs_dup(self): diff --git a/pandas/tests/indexes/period/methods/test_astype.py b/pandas/tests/indexes/period/methods/test_astype.py index 07595b6b8c1dd..e886f923f9656 100644 --- a/pandas/tests/indexes/period/methods/test_astype.py +++ b/pandas/tests/indexes/period/methods/test_astype.py @@ -41,7 +41,7 @@ def test_astype_conversion(self): tm.assert_index_equal(result, expected) result = idx.astype(str) - expected = Index([str(x) for x in idx], name="idx") + expected = Index([str(x) for x in idx], name="idx", dtype=object) tm.assert_index_equal(result, expected) idx = period_range("1990", "2009", freq="Y", name="idx") diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index 9b945c5f7a3a1..bd3bdfe4bbce0 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -691,7 +691,7 @@ def test_map_str(self, simple_index): pytest.skip("See test_map.py") idx = simple_index result = idx.map(str) - expected = Index([str(x) for x in idx], dtype=object) + expected = Index([str(x) for x in idx]) tm.assert_index_equal(result, expected) @pytest.mark.parametrize("copy", [True, False]) @@ -827,7 +827,7 @@ def 
test_append_preserves_dtype(self, simple_index): alt = index.take(list(range(N)) * 2) tm.assert_index_equal(result, alt, check_exact=True) - def test_inv(self, simple_index): + def test_inv(self, simple_index, using_infer_string): idx = simple_index if idx.dtype.kind in ["i", "u"]: @@ -840,14 +840,21 @@ def test_inv(self, simple_index): tm.assert_series_equal(res2, Series(expected)) else: if idx.dtype.kind == "f": + err = TypeError msg = "ufunc 'invert' not supported for the input types" + elif using_infer_string and idx.dtype == "string": + import pyarrow as pa + + err = pa.lib.ArrowNotImplementedError + msg = "has no kernel" else: + err = TypeError msg = "bad operand" - with pytest.raises(TypeError, match=msg): + with pytest.raises(err, match=msg): ~idx # check that we get the same behavior with Series - with pytest.raises(TypeError, match=msg): + with pytest.raises(err, match=msg): ~Series(idx) def test_is_boolean_is_deprecated(self, simple_index): diff --git a/pandas/tests/indexes/timedeltas/methods/test_astype.py b/pandas/tests/indexes/timedeltas/methods/test_astype.py index 6aeb1fc23dae9..311f2b5c9aa59 100644 --- a/pandas/tests/indexes/timedeltas/methods/test_astype.py +++ b/pandas/tests/indexes/timedeltas/methods/test_astype.py @@ -61,7 +61,7 @@ def test_astype(self): tm.assert_index_equal(result, expected) result = idx.astype(str) - expected = Index([str(x) for x in idx], name="idx") + expected = Index([str(x) for x in idx], name="idx", dtype=object) tm.assert_index_equal(result, expected) rng = timedelta_range("1 days", periods=10) From ff6e271961434063b277e723e918cc7f3cbc5dea Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 4 Nov 2023 22:47:43 -0400 Subject: [PATCH 10/30] BUG: Index.getitem returning wrong result with negative step for arrow --- doc/source/whatsnew/v2.1.3.rst | 2 +- pandas/core/arrays/arrow/array.py | 7 +++++++ pandas/tests/indexes/object/test_indexing.py | 14 +++++++++++--- 3 files 
changed, 19 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v2.1.3.rst b/doc/source/whatsnew/v2.1.3.rst index 3b1cd1c152baa..f4c32b6ecc056 100644 --- a/doc/source/whatsnew/v2.1.3.rst +++ b/doc/source/whatsnew/v2.1.3.rst @@ -22,7 +22,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ - Bug in :meth:`DatetimeIndex.diff` raising ``TypeError`` (:issue:`55080`) -- +- Bug in :meth:`Index.__getitem__` returning wrong result for Arrow dtypes and negative stepsize (:issue:`55832`) .. --------------------------------------------------------------------------- .. _whatsnew_213.other: diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 4bcc03643dac8..43927f554a875 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -553,6 +553,13 @@ def __getitem__(self, item: PositionalIndexer): ) # We are not an array indexer, so maybe e.g. a slice or integer # indexer. We dispatch to pyarrow. + if isinstance(item, slice): + if item.start == item.stop: + pass + elif item.start == -len(self) - 1: + item = slice(None, item.stop, item.step) + elif item.stop == -len(self) - 1: + item = slice(item.start, None, item.step) value = self._pa_array[item] if isinstance(value, pa.ChunkedArray): return type(self)(value) diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py index 87d3afc77d556..7a6e93a4605d7 100644 --- a/pandas/tests/indexes/object/test_indexing.py +++ b/pandas/tests/indexes/object/test_indexing.py @@ -4,6 +4,7 @@ import pytest from pandas._libs.missing import is_matching_na +import pandas.util._test_decorators as td import pandas as pd from pandas import Index @@ -144,6 +145,13 @@ def test_get_indexer_non_unique_np_nats(self, np_nat_fixture, np_nat_fixture2): class TestSliceLocs: + @pytest.mark.parametrize( + "dtype", + [ + "object", + pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), + ], + ) @pytest.mark.parametrize( 
"in_slice,expected", [ @@ -167,12 +175,12 @@ class TestSliceLocs: (pd.IndexSlice["m":"m":-1], ""), # type: ignore[misc] ], ) - def test_slice_locs_negative_step(self, in_slice, expected): - index = Index(list("bcdxy")) + def test_slice_locs_negative_step(self, in_slice, expected, dtype): + index = Index(list("bcdxy"), dtype=dtype) s_start, s_stop = index.slice_locs(in_slice.start, in_slice.stop, in_slice.step) result = index[s_start : s_stop : in_slice.step] - expected = Index(list(expected)) + expected = Index(list(expected), dtype=dtype) tm.assert_index_equal(result, expected) def test_slice_locs_dup(self): From 7684911a70609559f7efcb84c73662ba0579a169 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 5 Nov 2023 09:18:25 -0500 Subject: [PATCH 11/30] Update --- pandas/core/arrays/arrow/array.py | 5 +++-- pandas/tests/indexes/object/test_indexing.py | 11 +++++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 43927f554a875..e073b8c20acbe 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -556,10 +556,11 @@ def __getitem__(self, item: PositionalIndexer): if isinstance(item, slice): if item.start == item.stop: pass - elif item.start == -len(self) - 1: + elif item.start <= -len(self) - 1: item = slice(None, item.stop, item.step) - elif item.stop == -len(self) - 1: + elif item.stop <= -len(self) - 1: item = slice(item.start, None, item.step) + value = self._pa_array[item] if isinstance(value, pa.ChunkedArray): return type(self)(value) diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py index 7a6e93a4605d7..93d46ebdd0b51 100644 --- a/pandas/tests/indexes/object/test_indexing.py +++ b/pandas/tests/indexes/object/test_indexing.py @@ -183,6 +183,17 @@ def test_slice_locs_negative_step(self, in_slice, expected, dtype): expected = 
Index(list(expected), dtype=dtype) tm.assert_index_equal(result, expected) + @td.skip_if_no("pyarrow") + def test_slice_locs_negative_step_oob(self): + index = Index(list("bcdxy"), dtype="string[pyarrow_numpy]") + + result = index[-10:5:1] + tm.assert_index_equal(result, index) + + result = index[4:-10:-1] + expected = Index(list("yxdcb"), dtype="string[pyarrow_numpy]") + tm.assert_index_equal(result, expected) + def test_slice_locs_dup(self): index = Index(["a", "a", "b", "c", "d", "d"]) assert index.slice_locs("a", "d") == (0, 6) From 7474cb2b70e548a6000dd797c8f5c524b08624cb Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 5 Nov 2023 09:21:00 -0500 Subject: [PATCH 12/30] Update --- pandas/core/arrays/arrow/array.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index e073b8c20acbe..87d355ef79142 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -556,9 +556,9 @@ def __getitem__(self, item: PositionalIndexer): if isinstance(item, slice): if item.start == item.stop: pass - elif item.start <= -len(self) - 1: + elif item.start < -len(self): item = slice(None, item.stop, item.step) - elif item.stop <= -len(self) - 1: + elif item.stop < -len(self): item = slice(item.start, None, item.step) value = self._pa_array[item] From b27b0f82b85e20d0c3ae724acbece252999a8b26 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 5 Nov 2023 09:26:59 -0500 Subject: [PATCH 13/30] Fix --- pandas/core/arrays/arrow/array.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 87d355ef79142..9a6b91f21d90f 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -556,9 +556,9 @@ def __getitem__(self, item: PositionalIndexer): if 
isinstance(item, slice): if item.start == item.stop: pass - elif item.start < -len(self): + elif item.start is not None and item.start < -len(self): item = slice(None, item.stop, item.step) - elif item.stop < -len(self): + elif item.stop is not None and item.stop < -len(self): item = slice(item.start, None, item.step) value = self._pa_array[item] From 659577e6c43dfe5125b865f707ec537f66b89cd3 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 5 Nov 2023 10:13:53 -0500 Subject: [PATCH 14/30] Update array.py --- pandas/core/arrays/arrow/array.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 9a6b91f21d90f..7e5452a21af9a 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -556,8 +556,6 @@ def __getitem__(self, item: PositionalIndexer): if isinstance(item, slice): if item.start == item.stop: pass - elif item.start is not None and item.start < -len(self): - item = slice(None, item.stop, item.step) elif item.stop is not None and item.stop < -len(self): item = slice(item.start, None, item.step) From 11c0e866c4ca22d2458f6858fe544663cc1b7758 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Wed, 15 Nov 2023 20:59:52 +0100 Subject: [PATCH 15/30] Fix --- pandas/tests/indexes/object/test_astype.py | 2 +- pandas/tests/indexes/object/test_indexing.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/tests/indexes/object/test_astype.py b/pandas/tests/indexes/object/test_astype.py index 273b39b5e319d..9c1ef302c5b51 100644 --- a/pandas/tests/indexes/object/test_astype.py +++ b/pandas/tests/indexes/object/test_astype.py @@ -20,7 +20,7 @@ def test_astype_str_from_bytes(): # while we're here, check that Series.astype behaves the same result = Series(idx).astype(str) - expected = Series(expected) + expected = Series(expected, dtype=object) 
tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py index 93d46ebdd0b51..68fc707e5c816 100644 --- a/pandas/tests/indexes/object/test_indexing.py +++ b/pandas/tests/indexes/object/test_indexing.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + from pandas._libs.missing import is_matching_na import pandas.util._test_decorators as td @@ -58,6 +60,7 @@ def test_get_indexer_with_NA_values( class TestGetIndexerNonUnique: + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="NAs are cast to NaN") def test_get_indexer_non_unique_nas(self, nulls_fixture): # even though this isn't non-unique, this should still work index = Index(["a", "b", nulls_fixture]) From f55b62fd0e38cc3b5076becff6d31e0c3a8c17d1 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 17 Nov 2023 00:18:44 +0100 Subject: [PATCH 16/30] Move --- doc/source/whatsnew/v2.1.2.rst | 1 - doc/source/whatsnew/v2.1.4.rst | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index 68ecd8713390a..f25af040900b1 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -41,7 +41,6 @@ Bug fixes - Fixed bug in :meth:`.DataFrameGroupBy.min()` and :meth:`.DataFrameGroupBy.max()` not preserving extension dtype for empty object (:issue:`55619`) - Fixed bug in :meth:`.SeriesGroupBy.value_counts` returning incorrect dtype for string columns (:issue:`55627`) - Fixed bug in :meth:`Categorical.equals` if other has arrow backed string dtype (:issue:`55364`) -- Fixed bug in :meth:`DataFrame.__setitem__` casting :class:`Index` with object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`) - Fixed bug in :meth:`DataFrame.__setitem__` not inferring string dtype for zero-dimensional array with ``infer_string=True`` (:issue:`55366`) - Fixed bug 
in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax` raising for arrow dtypes (:issue:`55368`) - Fixed bug in :meth:`DataFrame.interpolate` raising incorrect error message (:issue:`55347`) diff --git a/doc/source/whatsnew/v2.1.4.rst b/doc/source/whatsnew/v2.1.4.rst index 04bbb0f806cbd..954f04ffc621d 100644 --- a/doc/source/whatsnew/v2.1.4.rst +++ b/doc/source/whatsnew/v2.1.4.rst @@ -21,7 +21,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- +- Fixed bug in :meth:`DataFrame.__setitem__` casting :class:`Index` with object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`) - .. --------------------------------------------------------------------------- From ea3fcfbe6207152558e68df2bfce24ff622c9a76 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 17 Nov 2023 00:19:11 +0100 Subject: [PATCH 17/30] Move --- doc/source/whatsnew/v2.1.2.rst | 1 - doc/source/whatsnew/v2.1.4.rst | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index f25af040900b1..38416afc1c94c 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -44,7 +44,6 @@ Bug fixes - Fixed bug in :meth:`DataFrame.__setitem__` not inferring string dtype for zero-dimensional array with ``infer_string=True`` (:issue:`55366`) - Fixed bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax` raising for arrow dtypes (:issue:`55368`) - Fixed bug in :meth:`DataFrame.interpolate` raising incorrect error message (:issue:`55347`) -- Fixed bug in :meth:`Index.insert` casting object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`) - Fixed bug in :meth:`Index.insert` raising when inserting ``None`` into :class:`Index` with ``dtype="string[pyarrow_numpy]"`` (:issue:`55365`) - Fixed bug in :meth:`Series.all` and :meth:`Series.any` not treating missing values correctly for ``dtype="string[pyarrow_numpy]"`` (:issue:`55367`) - Fixed bug in 
:meth:`Series.floordiv` for :class:`ArrowDtype` (:issue:`55561`) diff --git a/doc/source/whatsnew/v2.1.4.rst b/doc/source/whatsnew/v2.1.4.rst index 954f04ffc621d..400448260b0ae 100644 --- a/doc/source/whatsnew/v2.1.4.rst +++ b/doc/source/whatsnew/v2.1.4.rst @@ -22,6 +22,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ - Fixed bug in :meth:`DataFrame.__setitem__` casting :class:`Index` with object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`) +- Fixed bug in :meth:`Index.insert` casting object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`) - .. --------------------------------------------------------------------------- From 86fe4f129a648fa2f765e9f3587d8829eedbc7ff Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 17 Nov 2023 00:35:46 +0100 Subject: [PATCH 18/30] Fix --- pandas/core/arrays/arrow/array.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 7e5452a21af9a..820a3856d48fc 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -556,7 +556,12 @@ def __getitem__(self, item: PositionalIndexer): if isinstance(item, slice): if item.start == item.stop: pass - elif item.stop is not None and item.stop < -len(self): + elif ( + item.stop is not None + and item.stop < -len(self) + and item.step is not None + and item.step < 0 + ): item = slice(item.start, None, item.step) value = self._pa_array[item] From 242552937fe6d8e12c2af90e2d55b91a9d9bdb5c Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Fri, 17 Nov 2023 19:35:18 +0100 Subject: [PATCH 19/30] Add gh ref --- pandas/core/arrays/arrow/array.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 820a3856d48fc..d162b66e5d369 100644 --- a/pandas/core/arrays/arrow/array.py +++ 
b/pandas/core/arrays/arrow/array.py @@ -554,6 +554,7 @@ def __getitem__(self, item: PositionalIndexer): # We are not an array indexer, so maybe e.g. a slice or integer # indexer. We dispatch to pyarrow. if isinstance(item, slice): + # Arrow bug https://github.com/apache/arrow/issues/38768 if item.start == item.stop: pass elif ( From 84e815e29e8f020b9386546616cc86c00e5f2162 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 18 Nov 2023 00:47:54 +0100 Subject: [PATCH 20/30] Update v2.1.4.rst --- doc/source/whatsnew/v2.1.4.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.4.rst b/doc/source/whatsnew/v2.1.4.rst index bdaf5bdee03f5..2a04adf2ac7f7 100644 --- a/doc/source/whatsnew/v2.1.4.rst +++ b/doc/source/whatsnew/v2.1.4.rst @@ -21,9 +21,9 @@ Fixed regressions Bug fixes ~~~~~~~~~ +- Bug in :class:`Series` constructor raising DeprecationWarning when ``index`` is a list of :class:`Series` (:issue:`55228`) - Fixed bug in :meth:`DataFrame.__setitem__` casting :class:`Index` with object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`) - Fixed bug in :meth:`Index.insert` casting object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`) -- Bug in :class:`Series` constructor raising DeprecationWarning when ``index`` is a list of :class:`Series` (:issue:`55228`) - .. 
--------------------------------------------------------------------------- From c4d9ba93b627ee1dc0af5044cd7b403bcf45c5a5 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 20 Nov 2023 01:03:12 +0100 Subject: [PATCH 21/30] Finish --- pandas/core/config_init.py | 2 +- pandas/core/indexes/base.py | 2 +- .../tests/indexes/base_class/test_formats.py | 3 ++ .../tests/indexes/base_class/test_reshape.py | 10 ++-- .../tests/indexes/categorical/test_astype.py | 2 +- .../indexes/categorical/test_category.py | 5 +- .../tests/indexes/categorical/test_formats.py | 4 ++ .../tests/indexes/categorical/test_reindex.py | 2 +- .../indexes/datetimes/methods/test_map.py | 2 +- pandas/tests/indexes/interval/test_formats.py | 7 ++- .../tests/indexes/multi/test_constructors.py | 7 ++- pandas/tests/indexes/multi/test_get_set.py | 16 +++--- pandas/tests/indexes/multi/test_reindex.py | 7 ++- pandas/tests/indexes/multi/test_setops.py | 20 ++++--- pandas/tests/indexes/object/test_indexing.py | 54 +++++++++++++------ pandas/tests/indexes/test_base.py | 26 ++++++--- pandas/tests/indexes/test_old_base.py | 8 +-- pandas/tests/indexes/test_setops.py | 4 +- 18 files changed, 123 insertions(+), 58 deletions(-) diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index bdbab78a443de..a8b63f97141c2 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -905,7 +905,7 @@ def register_converter_cb(key) -> None: with cf.config_prefix("future"): cf.register_option( "infer_string", - True, + False, "Whether to infer sequence of str objects as pyarrow string " "dtype, which will be the default in pandas 3.0 " "(at which point this option will be deprecated).", diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index c90a563899a45..5e4fe041fadf0 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -6916,7 +6916,7 @@ def insert(self, loc: int, item) -> Index: and 
is_string_dtype(idx.dtype) and new_values.dtype == object ): - idx = idx.astype(new_values.dtype) + idx = Index(new_values, name=self.name, dtype=new_values.dtype) return idx def drop( diff --git a/pandas/tests/indexes/base_class/test_formats.py b/pandas/tests/indexes/base_class/test_formats.py index 379aea8826414..f30b578cfcf56 100644 --- a/pandas/tests/indexes/base_class/test_formats.py +++ b/pandas/tests/indexes/base_class/test_formats.py @@ -1,6 +1,7 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype import pandas._config.config as cf from pandas import Index @@ -15,6 +16,7 @@ def test_repr_is_valid_construction_code(self): res = eval(repr(idx)) tm.assert_index_equal(res, idx) + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="repr different") @pytest.mark.parametrize( "index,expected", [ @@ -79,6 +81,7 @@ def test_string_index_repr(self, index, expected): result = repr(index) assert result == expected + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="repr different") @pytest.mark.parametrize( "index,expected", [ diff --git a/pandas/tests/indexes/base_class/test_reshape.py b/pandas/tests/indexes/base_class/test_reshape.py index 6586f5f9de480..814a6a516904b 100644 --- a/pandas/tests/indexes/base_class/test_reshape.py +++ b/pandas/tests/indexes/base_class/test_reshape.py @@ -33,13 +33,15 @@ def test_insert(self): # test empty null_index = Index([]) - tm.assert_index_equal(Index(["a"]), null_index.insert(0, "a")) + tm.assert_index_equal(Index(["a"], dtype=object), null_index.insert(0, "a")) - def test_insert_missing(self, nulls_fixture): + def test_insert_missing(self, nulls_fixture, using_infer_string): # GH#22295 # test there is no mangling of NA values - expected = Index(["a", nulls_fixture, "b", "c"]) - result = Index(list("abc")).insert(1, nulls_fixture) + expected = Index(["a", nulls_fixture, "b", "c"], dtype=object) + result = Index(list("abc"), dtype=object).insert( + 1, Index([nulls_fixture], 
dtype=object) + ) tm.assert_index_equal(result, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/indexes/categorical/test_astype.py b/pandas/tests/indexes/categorical/test_astype.py index da1d692f9eb2d..a17627b7515b2 100644 --- a/pandas/tests/indexes/categorical/test_astype.py +++ b/pandas/tests/indexes/categorical/test_astype.py @@ -18,7 +18,7 @@ def test_astype(self): ci = CategoricalIndex(list("aabbca"), categories=list("cab"), ordered=False) result = ci.astype(object) - tm.assert_index_equal(result, Index(np.array(ci))) + tm.assert_index_equal(result, Index(np.array(ci), dtype=object)) # this IS equal, but not the same class assert result.equals(ci) diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py index 7af4f6809ec64..aa9073ed789bb 100644 --- a/pandas/tests/indexes/categorical/test_category.py +++ b/pandas/tests/indexes/categorical/test_category.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + from pandas._libs import index as libindex from pandas._libs.arrays import NDArrayBacked @@ -47,7 +49,7 @@ def test_insert(self, simple_index): # invalid -> cast to object expected = ci.astype(object).insert(0, "d") - result = ci.insert(0, "d") + result = ci.insert(0, "d").astype(object) tm.assert_index_equal(result, expected, exact=True) # GH 18295 (test missing) @@ -194,6 +196,7 @@ def test_unique(self, data, categories, expected_data, ordered): expected = CategoricalIndex(expected_data, dtype=dtype) tm.assert_index_equal(idx.unique(), expected) + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="repr doesn't roundtrip") def test_repr_roundtrip(self): ci = CategoricalIndex(["a", "b"], categories=["a", "b"], ordered=True) str(ci) diff --git a/pandas/tests/indexes/categorical/test_formats.py b/pandas/tests/indexes/categorical/test_formats.py index ea3e4ce213e67..522ca1bc2afde 100644 --- 
a/pandas/tests/indexes/categorical/test_formats.py +++ b/pandas/tests/indexes/categorical/test_formats.py @@ -1,6 +1,9 @@ """ Tests for CategoricalIndex.__repr__ and related methods. """ +import pytest + +from pandas._config import using_pyarrow_string_dtype import pandas._config.config as cf from pandas import CategoricalIndex @@ -16,6 +19,7 @@ def test_format_different_scalar_lengths(self): with tm.assert_produces_warning(FutureWarning, match=msg): assert idx.format() == expected + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="repr different") def test_string_categorical_index_repr(self): # short idx = CategoricalIndex(["a", "bb", "ccc"]) diff --git a/pandas/tests/indexes/categorical/test_reindex.py b/pandas/tests/indexes/categorical/test_reindex.py index 8ca5c6099b4e7..5b1f2b9fb159a 100644 --- a/pandas/tests/indexes/categorical/test_reindex.py +++ b/pandas/tests/indexes/categorical/test_reindex.py @@ -40,7 +40,7 @@ def test_reindex_duplicate_target(self): # See GH25459 cat = CategoricalIndex(["a", "b", "c"], categories=["a", "b", "c", "d"]) res, indexer = cat.reindex(["a", "c", "c"]) - exp = Index(["a", "c", "c"], dtype="object") + exp = Index(["a", "c", "c"]) tm.assert_index_equal(res, exp, exact=True) tm.assert_numpy_array_equal(indexer, np.array([0, 2, 2], dtype=np.intp)) diff --git a/pandas/tests/indexes/datetimes/methods/test_map.py b/pandas/tests/indexes/datetimes/methods/test_map.py index c31e2407190ea..f35f07bd32068 100644 --- a/pandas/tests/indexes/datetimes/methods/test_map.py +++ b/pandas/tests/indexes/datetimes/methods/test_map.py @@ -16,7 +16,7 @@ def test_map(self): f = lambda x: x.strftime("%Y%m%d") result = rng.map(f) - exp = Index([f(x) for x in rng], dtype=" Date: Tue, 21 Nov 2023 21:17:28 +0100 Subject: [PATCH 22/30] Update --- doc/source/whatsnew/v2.1.3.rst | 1 - doc/source/whatsnew/v2.1.4.rst | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.3.rst b/doc/source/whatsnew/v2.1.3.rst index 
a7f20aa67685f..af626895a9e0e 100644 --- a/doc/source/whatsnew/v2.1.3.rst +++ b/doc/source/whatsnew/v2.1.3.rst @@ -21,7 +21,6 @@ Fixed regressions Bug fixes ~~~~~~~~~ - Bug in :meth:`DatetimeIndex.diff` raising ``TypeError`` (:issue:`55080`) -- Bug in :meth:`Index.__getitem__` returning wrong result for Arrow dtypes and negative stepsize (:issue:`55832`) - Bug in :meth:`Index.isin` raising for Arrow backed string and ``None`` value (:issue:`55821`) - Fix :func:`read_parquet` and :func:`read_feather` for `CVE-2023-47248 `__ (:issue:`55894`) diff --git a/doc/source/whatsnew/v2.1.4.rst b/doc/source/whatsnew/v2.1.4.rst index 25afcbb3bb532..e52c42dd31211 100644 --- a/doc/source/whatsnew/v2.1.4.rst +++ b/doc/source/whatsnew/v2.1.4.rst @@ -22,6 +22,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ - Bug in :class:`Series` constructor raising DeprecationWarning when ``index`` is a list of :class:`Series` (:issue:`55228`) +- Bug in :meth:`Index.__getitem__` returning wrong result for Arrow dtypes and negative stepsize (:issue:`55832`) - .. 
--------------------------------------------------------------------------- From 02adc20e0810b0b4b555d24671cf289000a04f7a Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 30 Nov 2023 21:42:34 +0100 Subject: [PATCH 23/30] Update test_base.py --- pandas/tests/indexes/test_base.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 9e009bb6c08cd..eb01f75b35cea 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -1254,18 +1254,10 @@ def test_equals_op_mismatched_multiindex_raises(self, index): def test_equals_op_index_vs_mi_same_length(self, using_infer_string): mi = MultiIndex.from_tuples([(1, 2), (4, 5), (8, 9)]) index = Index(["foo", "bar", "baz"]) - if using_infer_string: - import pyarrow as pa - with pytest.raises(pa.lib.ArrowNotImplementedError, match="has no kernel"): - with tm.assert_produces_warning( - DeprecationWarning, check_stacklevel=False - ): - mi == index - else: - result = mi == index - expected = np.array([False, False, False]) - tm.assert_numpy_array_equal(result, expected) + result = mi == index + expected = np.array([False, False, False]) + tm.assert_numpy_array_equal(result, expected) @pytest.mark.parametrize( "dt_conv, arg", From cf283691a0b019d280bc613e24d281cdd241822a Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 30 Nov 2023 21:44:51 +0100 Subject: [PATCH 24/30] Update test_old_base.py --- pandas/tests/indexes/test_old_base.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index 8aadb829ffc9b..1d5c414e70dd7 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -6,6 +6,8 @@ import numpy as np import pytest +from pandas._config import 
using_pyarrow_string_dtype + from pandas._libs.tslibs import Timestamp from pandas.core.dtypes.common import ( @@ -415,7 +417,7 @@ def test_insert_base(self, index): # test 0th element assert index[0:4].equals(result.insert(0, index[0])) - @pytest.mark.skip() + @pytest.mark.skip(using_pyarrow_string_dtype(), reason="completely different behavior, tested elsewher") def test_insert_out_of_bounds(self, index): # TypeError/IndexError matches what np.insert raises in these cases From 94cf90855952ef183fc380964658b2f7dc9fd9d6 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 30 Nov 2023 21:45:29 +0100 Subject: [PATCH 25/30] Update conftest.py --- pandas/conftest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 64d1117b96df7..25a3383c97b84 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1926,7 +1926,7 @@ def using_copy_on_write() -> bool: @pytest.fixture def using_infer_string() -> bool: """ - Fixture to check if Copy-on-Write is enabled. + Fixture to check if infer string option is enabled. """ return pd.options.future.infer_string is True @@ -1934,7 +1934,7 @@ def using_infer_string() -> bool: @pytest.fixture def warn_copy_on_write() -> bool: """ - Fixture to check if Copy-on-Write is enabled. + Fixture to check if Copy-on-Write is in warning mode. 
""" return ( pd.options.mode.copy_on_write == "warn" From 671a3535cb095573694759508209e088979db99c Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 30 Nov 2023 21:46:11 +0100 Subject: [PATCH 26/30] Update conftest.py --- pandas/conftest.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 25a3383c97b84..798d485a03cca 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1923,14 +1923,6 @@ def using_copy_on_write() -> bool: ) -@pytest.fixture -def using_infer_string() -> bool: - """ - Fixture to check if infer string option is enabled. - """ - return pd.options.future.infer_string is True - - @pytest.fixture def warn_copy_on_write() -> bool: """ @@ -1942,6 +1934,14 @@ def warn_copy_on_write() -> bool: ) +@pytest.fixture +def using_infer_string() -> bool: + """ + Fixture to check if infer string option is enabled. + """ + return pd.options.future.infer_string is True + + warsaws = ["Europe/Warsaw", "dateutil/Europe/Warsaw"] if zoneinfo is not None: warsaws.append(zoneinfo.ZoneInfo("Europe/Warsaw")) # type: ignore[arg-type] From 77f4711516d10905deed064daa8a7f622287b7c7 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 30 Nov 2023 22:25:29 +0100 Subject: [PATCH 27/30] Update test_old_base.py --- pandas/tests/indexes/test_old_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index 1d5c414e70dd7..e4fae1119e121 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -417,7 +417,7 @@ def test_insert_base(self, index): # test 0th element assert index[0:4].equals(result.insert(0, index[0])) - @pytest.mark.skip(using_pyarrow_string_dtype(), reason="completely different behavior, tested elsewher") + @pytest.mark.skipif(using_pyarrow_string_dtype(), 
reason="completely different behavior, tested elsewher") def test_insert_out_of_bounds(self, index): # TypeError/IndexError matches what np.insert raises in these cases From aa1a6cf37b5d84f68fdfe4338cc4d5566dd7997c Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 30 Nov 2023 22:28:55 +0100 Subject: [PATCH 28/30] Update --- pandas/tests/indexes/multi/test_setops.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index a447e6dbc4c9c..5238ca5cca1a3 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -760,8 +760,14 @@ def test_intersection_keep_ea_dtypes(val, any_numeric_ea_dtype): def test_union_with_na_when_constructing_dataframe(): # GH43222 - series1 = Series((1,), index=MultiIndex.from_tuples(((None, None),))) + pd.options.future.infer_string = True + series1 = Series( + (1,), + index=MultiIndex.from_arrays( + [Series([None], dtype="string"), Series([None], dtype="string")] + ), + ) series2 = Series((10, 20), index=MultiIndex.from_tuples(((None, None), ("a", "b")))) result = DataFrame([series1, series2]) expected = DataFrame({(np.nan, np.nan): [1.0, 10.0], ("a", "b"): [np.nan, 20.0]}) - tm.assert_frame_equal(result, expected, check_column_type=False) + tm.assert_frame_equal(result, expected) From bd5e3052d5b05cb005d0100af7777ddd68fe9e76 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 30 Nov 2023 23:05:55 +0100 Subject: [PATCH 29/30] Update test_setops.py --- pandas/tests/indexes/multi/test_setops.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index 5238ca5cca1a3..0abb56ecf9de7 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -760,7 +760,6 @@ def 
test_intersection_keep_ea_dtypes(val, any_numeric_ea_dtype): def test_union_with_na_when_constructing_dataframe(): # GH43222 - pd.options.future.infer_string = True series1 = Series( (1,), index=MultiIndex.from_arrays( From 746b50432915a3bac889e87585e839169cf3cc5c Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 8 Dec 2023 22:48:51 +0100 Subject: [PATCH 30/30] Fix pre-commit --- pandas/tests/indexes/test_old_base.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index e4fae1119e121..528ef2e19aaec 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -417,7 +417,10 @@ def test_insert_base(self, index): # test 0th element assert index[0:4].equals(result.insert(0, index[0])) - @pytest.mark.skipif(using_pyarrow_string_dtype(), reason="completely different behavior, tested elsewher") + @pytest.mark.skipif( + using_pyarrow_string_dtype(), + reason="completely different behavior, tested elsewher", + ) def test_insert_out_of_bounds(self, index): # TypeError/IndexError matches what np.insert raises in these cases