From afc8eddfa9066b8caee46c19763516094de0e79c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 22 Jan 2025 03:07:08 +0100 Subject: [PATCH 1/4] Update PyArrow conversion and arrow/parquet tests for pyarrow 19.0 (#60716) * Update PyArrow conversion and arrow/parquet tests for pyarrow 19.0 * update pypi index * extra filterwarnings * more test updates * temp enable infer_string option * Adapt test_get_handle_pyarrow_compat for pyarrow 19 * Use pa_version_under19p0 in test_get_handle_pyarrow_compat * Adjust test_string_inference for using_infer_string * Fix test_string_inference for feather --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> (cherry picked from commit 5efac8250787414ec580f0472e2b563032ec7d53) --- .github/workflows/unit-tests.yml | 1 + ci/deps/actions-311-pyarrownightly.yaml | 2 +- pandas/compat/__init__.py | 18 +----- pandas/compat/pyarrow.py | 4 +- pandas/io/_util.py | 10 +++- pandas/tests/arrays/string_/test_string.py | 22 +++++++- pandas/tests/io/test_common.py | 5 +- pandas/tests/io/test_feather.py | 18 +++++- pandas/tests/io/test_parquet.py | 65 ++++++++++++++-------- 9 files changed, 97 insertions(+), 48 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 4a2c412d2d98e..39f852f6165f9 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -105,6 +105,7 @@ jobs: - name: "Pyarrow Nightly" env_file: actions-311-pyarrownightly.yaml pattern: "not slow and not network and not single_cpu" + pandas_future_infer_string: "1" fail-fast: false name: ${{ matrix.name || format('ubuntu-latest {0}', matrix.env_file) }} env: diff --git a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml index ba655f9690af6..40b936472d409 100644 --- a/ci/deps/actions-311-pyarrownightly.yaml +++ b/ci/deps/actions-311-pyarrownightly.yaml @@ -25,7 +25,7 @@ dependencies: - pip: - "tzdata>=2022.7" - - "--extra-index-url https://pypi.fury.io/arrow-nightlies/" + - "--extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple" - "--prefer-binary" - "--pre" - "pyarrow" diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 5e82853109015..3361895ee8889 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -14,18 +14,9 @@ import sys from typing import TYPE_CHECKING -from pandas.compat._constants import ( - IS64, - ISMUSL, - PY310, - PY311, - PY312, - PYPY, -) import pandas.compat.compressors from pandas.compat.numpy import is_numpy_dev from pandas.compat.pyarrow import ( - HAS_PYARROW, pa_version_under10p1, pa_version_under11p0, pa_version_under13p0, @@ -34,6 +25,7 @@ pa_version_under16p0, pa_version_under17p0, pa_version_under18p0, + pa_version_under19p0, ) if TYPE_CHECKING: @@ -193,11 +185,5 @@ def get_bz2_file() -> type[pandas.compat.compressors.BZ2File]: "pa_version_under16p0", "pa_version_under17p0", "pa_version_under18p0", - "HAS_PYARROW", - "IS64", - "ISMUSL", - "PY310", - "PY311", - "PY312", - "PYPY", + "pa_version_under19p0", ] diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py index f579b8a45d386..81a2d0dc80a10 100644 --- a/pandas/compat/pyarrow.py +++ b/pandas/compat/pyarrow.py @@ -18,6 +18,7 @@ pa_version_under16p0 = _palv < Version("16.0.0") pa_version_under17p0 = _palv < Version("17.0.0") pa_version_under18p0 = _palv < Version("18.0.0") + pa_version_under19p0 = _palv < Version("19.0.0") HAS_PYARROW = True except ImportError: pa_version_under10p1 = True @@ -29,5 +30,6 @@ pa_version_under15p0 = True pa_version_under16p0 = True pa_version_under17p0 = True - pa_version_under18p0 = False + pa_version_under18p0 = True + pa_version_under19p0 = True HAS_PYARROW = False diff --git a/pandas/io/_util.py b/pandas/io/_util.py index 9373888e28d28..35fdfb1a9ee82 100644 --- a/pandas/io/_util.py +++ b/pandas/io/_util.py @@ -10,7 +10,10 @@ from pandas._config import using_string_dtype from pandas._libs import lib -from pandas.compat import pa_version_under18p0 +from pandas.compat import ( + pa_version_under18p0, + pa_version_under19p0, +) from pandas.compat._optional import import_optional_dependency import pandas as pd @@ -78,7 +81,10 @@ def arrow_table_to_pandas( elif dtype_backend == "pyarrow": types_mapper = pd.ArrowDtype elif using_string_dtype(): - types_mapper = _arrow_string_types_mapper() + if pa_version_under19p0: + types_mapper = _arrow_string_types_mapper() + else: + types_mapper = None elif dtype_backend is lib.no_default or dtype_backend == "numpy": types_mapper = None else: diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 14c02723191a8..c7f854c11f3dd 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -9,7 +9,10 @@ from pandas._config import using_string_dtype -from pandas.compat.pyarrow import pa_version_under12p0 +from pandas.compat.pyarrow import ( + pa_version_under12p0, + pa_version_under19p0, +) from pandas.core.dtypes.common import is_dtype_equal @@ -541,7 +544,7 @@ def test_arrow_roundtrip(dtype, string_storage, using_infer_string): assert table.field("a").type == "large_string" with pd.option_context("string_storage", string_storage): result = table.to_pandas() - if dtype.na_value is np.nan and not using_string_dtype(): + if dtype.na_value is np.nan and not using_infer_string: assert result["a"].dtype == "object" else: assert isinstance(result["a"].dtype, pd.StringDtype) @@ -555,6 +558,21 @@ def test_arrow_roundtrip(dtype, string_storage, using_infer_string): assert result.loc[2, "a"] is result["a"].dtype.na_value +@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") +def test_arrow_from_string(using_infer_string): + # not roundtrip, but starting with pyarrow table without pandas metadata + pa = pytest.importorskip("pyarrow") + table = pa.table({"a": pa.array(["a", "b", None], type=pa.string())}) + + result = table.to_pandas() + + if using_infer_string and not pa_version_under19p0: + expected = pd.DataFrame({"a": ["a", "b", None]}, dtype="str") + else: + expected = pd.DataFrame({"a": ["a", "b", None]}, dtype="object") + tm.assert_frame_equal(result, expected) + + @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") def test_arrow_load_from_zero_chunks(dtype, string_storage, using_infer_string): # GH-41040 diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index a0dd64f1cb82b..a815ba9c1650a 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -19,6 +19,7 @@ import pytest from pandas.compat import is_platform_windows +from pandas.compat.pyarrow import pa_version_under19p0 import pandas.util._test_decorators as td import pandas as pd @@ -166,8 +167,8 @@ def test_get_handle_pyarrow_compat(self): s = StringIO(data) with icom.get_handle(s, "rb", is_text=False) as handles: df = pa_csv.read_csv(handles.handle).to_pandas() - # TODO will have to update this when pyarrow' to_pandas() is fixed - expected = expected.astype("object") + if pa_version_under19p0: + expected = expected.astype("object") tm.assert_frame_equal(df, expected) assert not s.closed diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 58a5f78ce3258..a0c76cd6ffcb5 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -2,7 +2,10 @@ import numpy as np import pytest -from pandas.compat.pyarrow import pa_version_under18p0 +from pandas.compat.pyarrow import ( + pa_version_under18p0, + pa_version_under19p0, +) import pandas as pd import pandas._testing as tm @@ -241,16 +244,27 @@ def test_invalid_dtype_backend(self): with pytest.raises(ValueError, match=msg): read_feather(path, dtype_backend="numpy") - def test_string_inference(self, tmp_path): + def test_string_inference(self, tmp_path, using_infer_string): # GH#54431 path = tmp_path / "test_string_inference.p" df = pd.DataFrame(data={"a": ["x", "y"]}) df.to_feather(path) with pd.option_context("future.infer_string", True): result = read_feather(path) + dtype = pd.StringDtype(na_value=np.nan) expected = pd.DataFrame( data={"a": ["x", "y"]}, dtype=pd.StringDtype(na_value=np.nan) ) + expected = pd.DataFrame( + data={"a": ["x", "y"]}, + dtype=dtype, + columns=pd.Index( + ["a"], + dtype=object + if pa_version_under19p0 and not using_infer_string + else dtype, + ), + ) tm.assert_frame_equal(result, expected) @pytest.mark.skipif(pa_version_under18p0, reason="not supported before 18.0") diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 87f9b0108402c..03bf1d2d7246d 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -19,6 +19,7 @@ pa_version_under11p0, pa_version_under13p0, pa_version_under15p0, + pa_version_under19p0, ) import pandas as pd @@ -261,8 +262,10 @@ def test_invalid_engine(df_compat): check_round_trip(df_compat, "foo", "bar") -def test_options_py(df_compat, pa): +def test_options_py(df_compat, pa, using_infer_string): # use the set option + if using_infer_string and not pa_version_under19p0: + df_compat.columns = df_compat.columns.astype("str") with pd.option_context("io.parquet.engine", "pyarrow"): check_round_trip(df_compat) @@ -798,18 +801,21 @@ def test_unsupported_float16_cleanup(self, pa, path_type): def test_categorical(self, pa): # supported in >= 0.7.0 - df = pd.DataFrame() - df["a"] = pd.Categorical(list("abcdef")) - - # test for null, out-of-order values, and unobserved category - df["b"] = pd.Categorical( - ["bar", "foo", "foo", "bar", None, "bar"], - dtype=pd.CategoricalDtype(["foo", "bar", "baz"]), - ) - - # test for ordered flag - df["c"] = pd.Categorical( - ["a", "b", "c", "a", "c", "b"], categories=["b", "c", "d"], ordered=True + df = pd.DataFrame( + { + "a": pd.Categorical(list("abcdef")), + # test for null, out-of-order values, and unobserved category + "b": pd.Categorical( + ["bar", "foo", "foo", "bar", None, "bar"], + dtype=pd.CategoricalDtype(["foo", "bar", "baz"]), + ), + # test for ordered flag + "c": pd.Categorical( + ["a", "b", "c", "a", "c", "b"], + categories=["b", "c", "d"], + ordered=True, + ), + } ) check_round_trip(df, pa) @@ -878,11 +884,13 @@ def test_s3_roundtrip_for_dir( repeat=1, ) - def test_read_file_like_obj_support(self, df_compat): + def test_read_file_like_obj_support(self, df_compat, using_infer_string): pytest.importorskip("pyarrow") buffer = BytesIO() df_compat.to_parquet(buffer) df_from_buf = read_parquet(buffer) + if using_infer_string and not pa_version_under19p0: + df_compat.columns = df_compat.columns.astype("str") tm.assert_frame_equal(df_compat, df_from_buf) def test_expand_user(self, df_compat, monkeypatch): @@ -949,7 +957,7 @@ def test_additional_extension_arrays(self, pa, using_infer_string): "c": pd.Series(["a", None, "c"], dtype="string"), } ) - if using_infer_string: + if using_infer_string and pa_version_under19p0: check_round_trip(df, pa, expected=df.astype({"c": "str"})) else: check_round_trip(df, pa) @@ -963,7 +971,10 @@ def test_pyarrow_backed_string_array(self, pa, string_storage, using_infer_strin df = pd.DataFrame({"a": pd.Series(["a", None, "c"], dtype="string[pyarrow]")}) with pd.option_context("string_storage", string_storage): if using_infer_string: - expected = df.astype("str") + if pa_version_under19p0: + expected = df.astype("str") + else: + expected = df.astype(f"string[{string_storage}]") expected.columns = expected.columns.astype("str") else: expected = df.astype(f"string[{string_storage}]") @@ -1128,17 +1139,24 @@ def test_df_attrs_persistence(self, tmp_path, pa): new_df = read_parquet(path, engine=pa) assert new_df.attrs == df.attrs - def test_string_inference(self, tmp_path, pa): + def test_string_inference(self, tmp_path, pa, using_infer_string): # GH#54431 path = tmp_path / "test_string_inference.p" df = pd.DataFrame(data={"a": ["x", "y"]}, index=["a", "b"]) - df.to_parquet(path, engine="pyarrow") + df.to_parquet(path, engine=pa) with pd.option_context("future.infer_string", True): - result = read_parquet(path, engine="pyarrow") + result = read_parquet(path, engine=pa) + dtype = pd.StringDtype(na_value=np.nan) expected = pd.DataFrame( data={"a": ["x", "y"]}, - dtype=pd.StringDtype(na_value=np.nan), - index=pd.Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)), + dtype=dtype, + index=pd.Index(["a", "b"], dtype=dtype), + columns=pd.Index( + ["a"], + dtype=object + if pa_version_under19p0 and not using_infer_string + else dtype, + ), ) tm.assert_frame_equal(result, expected) @@ -1151,7 +1169,10 @@ def test_roundtrip_decimal(self, tmp_path, pa): df = pd.DataFrame({"a": [Decimal("123.00")]}, dtype="string[pyarrow]") df.to_parquet(path, schema=pa.schema([("a", pa.decimal128(5))])) result = read_parquet(path) - expected = pd.DataFrame({"a": ["123"]}, dtype="string[python]") + if pa_version_under19p0: + expected = pd.DataFrame({"a": ["123"]}, dtype="string[python]") + else: + expected = pd.DataFrame({"a": [Decimal("123.00")]}, dtype="object") tm.assert_frame_equal(result, expected) def test_infer_string_large_string_type(self, tmp_path, pa): From 2df96bb096f428a2458d87b1e06be4816a01a412 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 22 Jan 2025 11:32:45 +0100 Subject: [PATCH 2/4] fixup --- pandas/compat/__init__.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 3361895ee8889..9b6b1ab3b8909 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -14,9 +14,18 @@ import sys from typing import TYPE_CHECKING +from pandas.compat._constants import ( + IS64, + ISMUSL, + PY310, + PY311, + PY312, + PYPY, +) import pandas.compat.compressors from pandas.compat.numpy import is_numpy_dev from pandas.compat.pyarrow import ( + HAS_PYARROW, pa_version_under10p1, pa_version_under11p0, pa_version_under13p0, @@ -186,4 +195,11 @@ def get_bz2_file() -> type[pandas.compat.compressors.BZ2File]: "pa_version_under17p0", "pa_version_under18p0", "pa_version_under19p0", + "HAS_PYARROW", + "IS64", + "ISMUSL", + "PY310", + "PY311", + "PY312", + "PYPY", ] From a2e6fb7485c933a90cae8bcaea0f71115dd4d441 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 22 Jan 2025 14:20:39 +0100 Subject: [PATCH 3/4] don't hardcode object dtype --- pandas/tests/io/test_feather.py | 8 ++++---- pandas/tests/io/test_parquet.py | 5 +---- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index a0c76cd6ffcb5..0ab23e3b51a03 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -136,8 +136,8 @@ def test_rw_use_threads(self): def test_path_pathlib(self): df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ).reset_index() result = tm.round_trip_pathlib(df.to_feather, read_feather) tm.assert_frame_equal(df, result) @@ -145,8 +145,8 @@ def test_path_pathlib(self): def test_path_localpath(self): df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ).reset_index() result = tm.round_trip_localpath(df.to_feather, read_feather) tm.assert_frame_equal(df, result) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 03bf1d2d7246d..f66ee7dc4367e 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -111,10 +111,7 @@ def fp(request): @pytest.fixture def df_compat(): - # TODO(infer_string) should this give str columns? - return pd.DataFrame( - {"A": [1, 2, 3], "B": "foo"}, columns=pd.Index(["A", "B"], dtype=object) - ) + return pd.DataFrame({"A": [1, 2, 3], "B": "foo"}, columns=pd.Index(["A", "B"])) @pytest.fixture From 25de7494f3e86b85f24268575d70188a1a87b838 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 22 Jan 2025 14:22:31 +0100 Subject: [PATCH 4/4] also enable CoW when enabling future.infer_string --- .github/workflows/unit-tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 39f852f6165f9..210852d0cd809 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -106,6 +106,7 @@ jobs: env_file: actions-311-pyarrownightly.yaml pattern: "not slow and not network and not single_cpu" pandas_future_infer_string: "1" + pandas_copy_on_write: "1" fail-fast: false name: ${{ matrix.name || format('ubuntu-latest {0}', matrix.env_file) }} env: