From d4961254ce5f65cad3addcbd63b326e73f0fbece Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 29 Dec 2024 09:00:31 -0500 Subject: [PATCH 1/3] TST(string dtype): Resolve HDF5 xfails --- pandas/io/pytables.py | 2 ++ pandas/tests/io/pytables/test_subclass.py | 3 --- pandas/tests/io/test_common.py | 3 --- 3 files changed, 2 insertions(+), 6 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 7d265bc430125..b75dc6c3a43b4 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -5297,6 +5297,8 @@ def _dtype_to_kind(dtype_str: str) -> str: kind = "integer" elif dtype_str == "object": kind = "object" + elif dtype_str == "str": + kind = "str" else: raise ValueError(f"cannot interpret dtype of [{dtype_str}]") diff --git a/pandas/tests/io/pytables/test_subclass.py b/pandas/tests/io/pytables/test_subclass.py index bbe1cd77e0d9f..03622faa2b5a8 100644 --- a/pandas/tests/io/pytables/test_subclass.py +++ b/pandas/tests/io/pytables/test_subclass.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas import ( DataFrame, Series, @@ -19,7 +17,6 @@ class TestHDFStoreSubclass: # GH 33748 - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_supported_for_subclass_dataframe(self, tmp_path): data = {"a": [1, 2], "b": [3, 4]} sdf = tm.SubclassedDataFrame(data, dtype=np.intp) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 70422a0ea6edc..7ff3d24336f00 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -19,8 +19,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat import ( WASM, is_platform_windows, @@ -365,7 +363,6 @@ def test_write_fspath_all(self, writer_name, writer_kwargs, module): expected = f_path.read() assert result == expected - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) hdf support") def test_write_fspath_hdf5(self): # Same test as write_fspath_all, except HDF5 files aren't # necessarily byte-for-byte identical for a given dataframe, so we'll From 335ead5f3db4a8bdd13a4447b5fcaf67d919b2c4 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 29 Dec 2024 10:59:37 -0500 Subject: [PATCH 2/3] More xfails --- .../tests/io/pytables/test_file_handling.py | 69 +++++++++++++++---- 1 file changed, 54 insertions(+), 15 deletions(-) diff --git a/pandas/tests/io/pytables/test_file_handling.py b/pandas/tests/io/pytables/test_file_handling.py index 606b19ac0ed75..1e868497863df 100644 --- a/pandas/tests/io/pytables/test_file_handling.py +++ b/pandas/tests/io/pytables/test_file_handling.py @@ -17,6 +17,7 @@ PossibleDataLossError, ) +import pandas as pd from pandas import ( DataFrame, HDFStore, @@ -35,14 +36,9 @@ from pandas.io import pytables from pandas.io.pytables import Term -pytestmark = [ - pytest.mark.single_cpu, - pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), -] - @pytest.mark.parametrize("mode", ["r", "r+", "a", "w"]) -def test_mode(setup_path, tmp_path, mode): +def test_mode(setup_path, tmp_path, mode, using_infer_string): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), columns=Index(list("ABCD"), dtype=object), @@ -91,10 +87,14 @@ def test_mode(setup_path, tmp_path, mode): read_hdf(path, "df", mode=mode) else: result = read_hdf(path, "df", mode=mode) + if using_infer_string: + df.columns = df.columns.astype( + pd.StringDtype(storage="pyarrow", na_value=np.nan) + ) tm.assert_frame_equal(result, df) -def test_default_mode(tmp_path, setup_path): +def test_default_mode(tmp_path, setup_path, using_infer_string): # read_hdf uses default mode df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), @@ -104,6 +104,10 @@ def test_default_mode(tmp_path, setup_path): path = tmp_path / setup_path df.to_hdf(path, key="df", mode="w") result = read_hdf(path, "df") + if using_infer_string: + df.columns = df.columns.astype( + pd.StringDtype(storage="pyarrow", na_value=np.nan) + ) tm.assert_frame_equal(result, df) @@ -163,7 +167,7 @@ def test_reopen_handle(tmp_path, setup_path): assert not store.is_open -def test_open_args(setup_path): +def test_open_args(setup_path, using_infer_string): with tm.ensure_clean(setup_path) as path: df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), @@ -178,8 +182,17 @@ def test_open_args(setup_path): store["df"] = df store.append("df2", df) - tm.assert_frame_equal(store["df"], df) - tm.assert_frame_equal(store["df2"], df) + expected = df.copy() + if using_infer_string: + expected.index = expected.index.astype( + pd.StringDtype(storage="pyarrow", na_value=np.nan) + ) + expected.columns = expected.columns.astype( + pd.StringDtype(storage="pyarrow", na_value=np.nan) + ) + + tm.assert_frame_equal(store["df"], expected) + tm.assert_frame_equal(store["df2"], expected) store.close() @@ -194,7 +207,7 @@ def test_flush(setup_path): store.flush(fsync=True) -def test_complibs_default_settings(tmp_path, setup_path): +def test_complibs_default_settings(tmp_path, setup_path, using_infer_string): # GH15943 df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), @@ -207,7 +220,15 @@ def test_complibs_default_settings(tmp_path, setup_path): tmpfile = tmp_path / setup_path df.to_hdf(tmpfile, key="df", complevel=9) result = read_hdf(tmpfile, "df") - tm.assert_frame_equal(result, df) + expected = df.copy() + if using_infer_string: + expected.index = expected.index.astype( + pd.StringDtype(storage="pyarrow", na_value=np.nan) + ) + expected.columns = expected.columns.astype( + pd.StringDtype(storage="pyarrow", na_value=np.nan) + ) + tm.assert_frame_equal(result, expected) with tables.open_file(tmpfile, mode="r") as h5file: for node in h5file.walk_nodes(where="/df", classname="Leaf"): @@ -218,7 +239,15 @@ def test_complibs_default_settings(tmp_path, setup_path): tmpfile = tmp_path / setup_path df.to_hdf(tmpfile, key="df", complib="zlib") result = read_hdf(tmpfile, "df") - tm.assert_frame_equal(result, df) + expected = df.copy() + if using_infer_string: + expected.index = expected.index.astype( + pd.StringDtype(storage="pyarrow", na_value=np.nan) + ) + expected.columns = expected.columns.astype( + pd.StringDtype(storage="pyarrow", na_value=np.nan) + ) + tm.assert_frame_equal(result, expected) with tables.open_file(tmpfile, mode="r") as h5file: for node in h5file.walk_nodes(where="/df", classname="Leaf"): @@ -229,7 +258,15 @@ def test_complibs_default_settings(tmp_path, setup_path): tmpfile = tmp_path / setup_path df.to_hdf(tmpfile, key="df") result = read_hdf(tmpfile, "df") - tm.assert_frame_equal(result, df) + expected = df.copy() + if using_infer_string: + expected.index = expected.index.astype( + pd.StringDtype(storage="pyarrow", na_value=np.nan) + ) + expected.columns = expected.columns.astype( + pd.StringDtype(storage="pyarrow", na_value=np.nan) + ) + tm.assert_frame_equal(result, expected) with tables.open_file(tmpfile, mode="r") as h5file: for node in h5file.walk_nodes(where="/df", classname="Leaf"): @@ -308,6 +345,7 @@ def test_complibs(tmp_path, lvl, lib, request): assert node.filters.complib == lib +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.skipif( not is_platform_little_endian(), reason="reason platform is not little endian" ) @@ -325,6 +363,7 @@ def test_encoding(setup_path): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "val", [ @@ -340,7 +379,7 @@ def test_encoding(setup_path): ], ) @pytest.mark.parametrize("dtype", ["category", object]) -def test_latin_encoding(tmp_path, setup_path, dtype, val): +def test_latin_encoding(tmp_path, setup_path, dtype, val, using_infer_string): enc = "latin-1" nan_rep = "" key = "data" From e5e24e8a5457d541ae7691a1b8cc7d9812272af6 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 29 Dec 2024 11:27:41 -0500 Subject: [PATCH 3/3] Cleanup --- .../tests/io/pytables/test_file_handling.py | 50 +++++++------------ 1 file changed, 17 insertions(+), 33 deletions(-) diff --git a/pandas/tests/io/pytables/test_file_handling.py b/pandas/tests/io/pytables/test_file_handling.py index 1e868497863df..16c3c6798ff76 100644 --- a/pandas/tests/io/pytables/test_file_handling.py +++ b/pandas/tests/io/pytables/test_file_handling.py @@ -17,7 +17,6 @@ PossibleDataLossError, ) -import pandas as pd from pandas import ( DataFrame, HDFStore, @@ -36,6 +35,10 @@ from pandas.io import pytables from pandas.io.pytables import Term +pytestmark = [ + pytest.mark.single_cpu, +] + @pytest.mark.parametrize("mode", ["r", "r+", "a", "w"]) def test_mode(setup_path, tmp_path, mode, using_infer_string): @@ -88,9 +91,7 @@ def test_mode(setup_path, tmp_path, mode, using_infer_string): else: result = read_hdf(path, "df", mode=mode) if using_infer_string: - df.columns = df.columns.astype( - pd.StringDtype(storage="pyarrow", na_value=np.nan) - ) + df.columns = df.columns.astype("str") tm.assert_frame_equal(result, df) @@ -104,11 +105,10 @@ def test_default_mode(tmp_path, setup_path, using_infer_string): path = tmp_path / setup_path df.to_hdf(path, key="df", mode="w") result = read_hdf(path, "df") + expected = df.copy() if using_infer_string: - df.columns = df.columns.astype( - pd.StringDtype(storage="pyarrow", na_value=np.nan) - ) - tm.assert_frame_equal(result, df) + expected.columns = expected.columns.astype("str") + tm.assert_frame_equal(result, expected) def test_reopen_handle(tmp_path, setup_path): @@ -184,12 +184,8 @@ def test_open_args(setup_path, using_infer_string): expected = df.copy() if using_infer_string: - expected.index = expected.index.astype( - pd.StringDtype(storage="pyarrow", na_value=np.nan) - ) - expected.columns = expected.columns.astype( - pd.StringDtype(storage="pyarrow", na_value=np.nan) - ) + expected.index = expected.index.astype("str") + expected.columns = expected.columns.astype("str") tm.assert_frame_equal(store["df"], expected) tm.assert_frame_equal(store["df2"], expected) @@ -222,12 +218,8 @@ def test_complibs_default_settings(tmp_path, setup_path, using_infer_string): result = read_hdf(tmpfile, "df") expected = df.copy() if using_infer_string: - expected.index = expected.index.astype( - pd.StringDtype(storage="pyarrow", na_value=np.nan) - ) - expected.columns = expected.columns.astype( - pd.StringDtype(storage="pyarrow", na_value=np.nan) - ) + expected.index = expected.index.astype("str") + expected.columns = expected.columns.astype("str") tm.assert_frame_equal(result, expected) with tables.open_file(tmpfile, mode="r") as h5file: @@ -241,12 +233,8 @@ def test_complibs_default_settings(tmp_path, setup_path, using_infer_string): result = read_hdf(tmpfile, "df") expected = df.copy() if using_infer_string: - expected.index = expected.index.astype( - pd.StringDtype(storage="pyarrow", na_value=np.nan) - ) - expected.columns = expected.columns.astype( - pd.StringDtype(storage="pyarrow", na_value=np.nan) - ) + expected.index = expected.index.astype("str") + expected.columns = expected.columns.astype("str") tm.assert_frame_equal(result, expected) with tables.open_file(tmpfile, mode="r") as h5file: @@ -260,12 +248,8 @@ def test_complibs_default_settings(tmp_path, setup_path, using_infer_string): result = read_hdf(tmpfile, "df") expected = df.copy() if using_infer_string: - expected.index = expected.index.astype( - pd.StringDtype(storage="pyarrow", na_value=np.nan) - ) - expected.columns = expected.columns.astype( - pd.StringDtype(storage="pyarrow", na_value=np.nan) - ) + expected.index = expected.index.astype("str") + expected.columns = expected.columns.astype("str") tm.assert_frame_equal(result, expected) with tables.open_file(tmpfile, mode="r") as h5file: @@ -379,7 +363,7 @@ def test_encoding(setup_path): ], ) @pytest.mark.parametrize("dtype", ["category", object]) -def test_latin_encoding(tmp_path, setup_path, dtype, val, using_infer_string): +def test_latin_encoding(tmp_path, setup_path, dtype, val): enc = "latin-1" nan_rep = "" key = "data"