From d4961254ce5f65cad3addcbd63b326e73f0fbece Mon Sep 17 00:00:00 2001
From: Richard Shadrach <rhshadrach@gmail.com>
Date: Sun, 29 Dec 2024 09:00:31 -0500
Subject: [PATCH 1/3] TST(string dtype): Resolve HDF5 xfails

---
 pandas/io/pytables.py                     | 2 ++
 pandas/tests/io/pytables/test_subclass.py | 3 ---
 pandas/tests/io/test_common.py            | 3 ---
 3 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index 7d265bc430125..b75dc6c3a43b4 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -5297,6 +5297,8 @@ def _dtype_to_kind(dtype_str: str) -> str:
         kind = "integer"
     elif dtype_str == "object":
         kind = "object"
+    elif dtype_str == "str":
+        kind = "str"
     else:
         raise ValueError(f"cannot interpret dtype of [{dtype_str}]")
 
diff --git a/pandas/tests/io/pytables/test_subclass.py b/pandas/tests/io/pytables/test_subclass.py
index bbe1cd77e0d9f..03622faa2b5a8 100644
--- a/pandas/tests/io/pytables/test_subclass.py
+++ b/pandas/tests/io/pytables/test_subclass.py
@@ -1,8 +1,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas import (
     DataFrame,
     Series,
@@ -19,7 +17,6 @@
 
 class TestHDFStoreSubclass:
     # GH 33748
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_supported_for_subclass_dataframe(self, tmp_path):
         data = {"a": [1, 2], "b": [3, 4]}
         sdf = tm.SubclassedDataFrame(data, dtype=np.intp)
diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py
index 70422a0ea6edc..7ff3d24336f00 100644
--- a/pandas/tests/io/test_common.py
+++ b/pandas/tests/io/test_common.py
@@ -19,8 +19,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas.compat import (
     WASM,
     is_platform_windows,
@@ -365,7 +363,6 @@ def test_write_fspath_all(self, writer_name, writer_kwargs, module):
                     expected = f_path.read()
                     assert result == expected
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) hdf support")
     def test_write_fspath_hdf5(self):
         # Same test as write_fspath_all, except HDF5 files aren't
         # necessarily byte-for-byte identical for a given dataframe, so we'll

From 335ead5f3db4a8bdd13a4447b5fcaf67d919b2c4 Mon Sep 17 00:00:00 2001
From: Richard Shadrach <rhshadrach@gmail.com>
Date: Sun, 29 Dec 2024 10:59:37 -0500
Subject: [PATCH 2/3] TST(string dtype): Narrow HDF5 file-handling xfails to individual tests

---
 .../tests/io/pytables/test_file_handling.py   | 69 +++++++++++++++----
 1 file changed, 54 insertions(+), 15 deletions(-)

diff --git a/pandas/tests/io/pytables/test_file_handling.py b/pandas/tests/io/pytables/test_file_handling.py
index 606b19ac0ed75..1e868497863df 100644
--- a/pandas/tests/io/pytables/test_file_handling.py
+++ b/pandas/tests/io/pytables/test_file_handling.py
@@ -17,6 +17,7 @@
     PossibleDataLossError,
 )
 
+import pandas as pd
 from pandas import (
     DataFrame,
     HDFStore,
@@ -35,14 +36,9 @@
 from pandas.io import pytables
 from pandas.io.pytables import Term
 
-pytestmark = [
-    pytest.mark.single_cpu,
-    pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
-]
-
 
 @pytest.mark.parametrize("mode", ["r", "r+", "a", "w"])
-def test_mode(setup_path, tmp_path, mode):
+def test_mode(setup_path, tmp_path, mode, using_infer_string):
     df = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
         columns=Index(list("ABCD"), dtype=object),
@@ -91,10 +87,14 @@ def test_mode(setup_path, tmp_path, mode):
             read_hdf(path, "df", mode=mode)
     else:
         result = read_hdf(path, "df", mode=mode)
+        if using_infer_string:
+            df.columns = df.columns.astype(
+                pd.StringDtype(storage="pyarrow", na_value=np.nan)
+            )
         tm.assert_frame_equal(result, df)
 
 
-def test_default_mode(tmp_path, setup_path):
+def test_default_mode(tmp_path, setup_path, using_infer_string):
     # read_hdf uses default mode
     df = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
@@ -104,6 +104,10 @@ def test_default_mode(tmp_path, setup_path):
     path = tmp_path / setup_path
     df.to_hdf(path, key="df", mode="w")
     result = read_hdf(path, "df")
+    if using_infer_string:
+        df.columns = df.columns.astype(
+            pd.StringDtype(storage="pyarrow", na_value=np.nan)
+        )
     tm.assert_frame_equal(result, df)
 
 
@@ -163,7 +167,7 @@ def test_reopen_handle(tmp_path, setup_path):
     assert not store.is_open
 
 
-def test_open_args(setup_path):
+def test_open_args(setup_path, using_infer_string):
     with tm.ensure_clean(setup_path) as path:
         df = DataFrame(
             1.1 * np.arange(120).reshape((30, 4)),
@@ -178,8 +182,17 @@ def test_open_args(setup_path):
         store["df"] = df
         store.append("df2", df)
 
-        tm.assert_frame_equal(store["df"], df)
-        tm.assert_frame_equal(store["df2"], df)
+        expected = df.copy()
+        if using_infer_string:
+            expected.index = expected.index.astype(
+                pd.StringDtype(storage="pyarrow", na_value=np.nan)
+            )
+            expected.columns = expected.columns.astype(
+                pd.StringDtype(storage="pyarrow", na_value=np.nan)
+            )
+
+        tm.assert_frame_equal(store["df"], expected)
+        tm.assert_frame_equal(store["df2"], expected)
 
         store.close()
 
@@ -194,7 +207,7 @@ def test_flush(setup_path):
         store.flush(fsync=True)
 
 
-def test_complibs_default_settings(tmp_path, setup_path):
+def test_complibs_default_settings(tmp_path, setup_path, using_infer_string):
     # GH15943
     df = DataFrame(
         1.1 * np.arange(120).reshape((30, 4)),
@@ -207,7 +220,15 @@ def test_complibs_default_settings(tmp_path, setup_path):
     tmpfile = tmp_path / setup_path
     df.to_hdf(tmpfile, key="df", complevel=9)
     result = read_hdf(tmpfile, "df")
-    tm.assert_frame_equal(result, df)
+    expected = df.copy()
+    if using_infer_string:
+        expected.index = expected.index.astype(
+            pd.StringDtype(storage="pyarrow", na_value=np.nan)
+        )
+        expected.columns = expected.columns.astype(
+            pd.StringDtype(storage="pyarrow", na_value=np.nan)
+        )
+    tm.assert_frame_equal(result, expected)
 
     with tables.open_file(tmpfile, mode="r") as h5file:
         for node in h5file.walk_nodes(where="/df", classname="Leaf"):
@@ -218,7 +239,15 @@ def test_complibs_default_settings(tmp_path, setup_path):
     tmpfile = tmp_path / setup_path
     df.to_hdf(tmpfile, key="df", complib="zlib")
     result = read_hdf(tmpfile, "df")
-    tm.assert_frame_equal(result, df)
+    expected = df.copy()
+    if using_infer_string:
+        expected.index = expected.index.astype(
+            pd.StringDtype(storage="pyarrow", na_value=np.nan)
+        )
+        expected.columns = expected.columns.astype(
+            pd.StringDtype(storage="pyarrow", na_value=np.nan)
+        )
+    tm.assert_frame_equal(result, expected)
 
     with tables.open_file(tmpfile, mode="r") as h5file:
         for node in h5file.walk_nodes(where="/df", classname="Leaf"):
@@ -229,7 +258,15 @@ def test_complibs_default_settings(tmp_path, setup_path):
     tmpfile = tmp_path / setup_path
     df.to_hdf(tmpfile, key="df")
     result = read_hdf(tmpfile, "df")
-    tm.assert_frame_equal(result, df)
+    expected = df.copy()
+    if using_infer_string:
+        expected.index = expected.index.astype(
+            pd.StringDtype(storage="pyarrow", na_value=np.nan)
+        )
+        expected.columns = expected.columns.astype(
+            pd.StringDtype(storage="pyarrow", na_value=np.nan)
+        )
+    tm.assert_frame_equal(result, expected)
 
     with tables.open_file(tmpfile, mode="r") as h5file:
         for node in h5file.walk_nodes(where="/df", classname="Leaf"):
@@ -308,6 +345,7 @@ def test_complibs(tmp_path, lvl, lib, request):
                 assert node.filters.complib == lib
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 @pytest.mark.skipif(
     not is_platform_little_endian(), reason="reason platform is not little endian"
 )
@@ -325,6 +363,7 @@ def test_encoding(setup_path):
         tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 @pytest.mark.parametrize(
     "val",
     [
@@ -340,7 +379,7 @@ def test_encoding(setup_path):
     ],
 )
 @pytest.mark.parametrize("dtype", ["category", object])
-def test_latin_encoding(tmp_path, setup_path, dtype, val):
+def test_latin_encoding(tmp_path, setup_path, dtype, val, using_infer_string):
     enc = "latin-1"
     nan_rep = ""
     key = "data"

From e5e24e8a5457d541ae7691a1b8cc7d9812272af6 Mon Sep 17 00:00:00 2001
From: Richard Shadrach <rhshadrach@gmail.com>
Date: Sun, 29 Dec 2024 11:27:41 -0500
Subject: [PATCH 3/3] TST(string dtype): Use "str" dtype alias in HDF5 tests and restore single_cpu mark

---
 .../tests/io/pytables/test_file_handling.py   | 50 +++++++------------
 1 file changed, 17 insertions(+), 33 deletions(-)

diff --git a/pandas/tests/io/pytables/test_file_handling.py b/pandas/tests/io/pytables/test_file_handling.py
index 1e868497863df..16c3c6798ff76 100644
--- a/pandas/tests/io/pytables/test_file_handling.py
+++ b/pandas/tests/io/pytables/test_file_handling.py
@@ -17,7 +17,6 @@
     PossibleDataLossError,
 )
 
-import pandas as pd
 from pandas import (
     DataFrame,
     HDFStore,
@@ -36,6 +35,10 @@
 from pandas.io import pytables
 from pandas.io.pytables import Term
 
+pytestmark = [
+    pytest.mark.single_cpu,
+]
+
 
 @pytest.mark.parametrize("mode", ["r", "r+", "a", "w"])
 def test_mode(setup_path, tmp_path, mode, using_infer_string):
@@ -88,9 +91,7 @@ def test_mode(setup_path, tmp_path, mode, using_infer_string):
     else:
         result = read_hdf(path, "df", mode=mode)
         if using_infer_string:
-            df.columns = df.columns.astype(
-                pd.StringDtype(storage="pyarrow", na_value=np.nan)
-            )
+            df.columns = df.columns.astype("str")
         tm.assert_frame_equal(result, df)
 
 
@@ -104,11 +105,10 @@ def test_default_mode(tmp_path, setup_path, using_infer_string):
     path = tmp_path / setup_path
     df.to_hdf(path, key="df", mode="w")
     result = read_hdf(path, "df")
+    expected = df.copy()
     if using_infer_string:
-        df.columns = df.columns.astype(
-            pd.StringDtype(storage="pyarrow", na_value=np.nan)
-        )
-    tm.assert_frame_equal(result, df)
+        expected.columns = expected.columns.astype("str")
+    tm.assert_frame_equal(result, expected)
 
 
 def test_reopen_handle(tmp_path, setup_path):
@@ -184,12 +184,8 @@ def test_open_args(setup_path, using_infer_string):
 
         expected = df.copy()
         if using_infer_string:
-            expected.index = expected.index.astype(
-                pd.StringDtype(storage="pyarrow", na_value=np.nan)
-            )
-            expected.columns = expected.columns.astype(
-                pd.StringDtype(storage="pyarrow", na_value=np.nan)
-            )
+            expected.index = expected.index.astype("str")
+            expected.columns = expected.columns.astype("str")
 
         tm.assert_frame_equal(store["df"], expected)
         tm.assert_frame_equal(store["df2"], expected)
@@ -222,12 +218,8 @@ def test_complibs_default_settings(tmp_path, setup_path, using_infer_string):
     result = read_hdf(tmpfile, "df")
     expected = df.copy()
     if using_infer_string:
-        expected.index = expected.index.astype(
-            pd.StringDtype(storage="pyarrow", na_value=np.nan)
-        )
-        expected.columns = expected.columns.astype(
-            pd.StringDtype(storage="pyarrow", na_value=np.nan)
-        )
+        expected.index = expected.index.astype("str")
+        expected.columns = expected.columns.astype("str")
     tm.assert_frame_equal(result, expected)
 
     with tables.open_file(tmpfile, mode="r") as h5file:
@@ -241,12 +233,8 @@ def test_complibs_default_settings(tmp_path, setup_path, using_infer_string):
     result = read_hdf(tmpfile, "df")
     expected = df.copy()
     if using_infer_string:
-        expected.index = expected.index.astype(
-            pd.StringDtype(storage="pyarrow", na_value=np.nan)
-        )
-        expected.columns = expected.columns.astype(
-            pd.StringDtype(storage="pyarrow", na_value=np.nan)
-        )
+        expected.index = expected.index.astype("str")
+        expected.columns = expected.columns.astype("str")
     tm.assert_frame_equal(result, expected)
 
     with tables.open_file(tmpfile, mode="r") as h5file:
@@ -260,12 +248,8 @@ def test_complibs_default_settings(tmp_path, setup_path, using_infer_string):
     result = read_hdf(tmpfile, "df")
     expected = df.copy()
     if using_infer_string:
-        expected.index = expected.index.astype(
-            pd.StringDtype(storage="pyarrow", na_value=np.nan)
-        )
-        expected.columns = expected.columns.astype(
-            pd.StringDtype(storage="pyarrow", na_value=np.nan)
-        )
+        expected.index = expected.index.astype("str")
+        expected.columns = expected.columns.astype("str")
     tm.assert_frame_equal(result, expected)
 
     with tables.open_file(tmpfile, mode="r") as h5file:
@@ -379,7 +363,7 @@ def test_encoding(setup_path):
     ],
 )
 @pytest.mark.parametrize("dtype", ["category", object])
-def test_latin_encoding(tmp_path, setup_path, dtype, val, using_infer_string):
+def test_latin_encoding(tmp_path, setup_path, dtype, val):
     enc = "latin-1"
     nan_rep = ""
     key = "data"