diff --git a/doc/source/whatsnew/v1.0.2.rst b/doc/source/whatsnew/v1.0.2.rst index b31f7c1166dc0..448779e4c3e0b 100644 --- a/doc/source/whatsnew/v1.0.2.rst +++ b/doc/source/whatsnew/v1.0.2.rst @@ -36,8 +36,10 @@ Bug fixes - Using ``pd.NA`` with :meth:`DataFrame.to_json` now correctly outputs a null value instead of an empty object (:issue:`31615`) + **Experimental dtypes** +- Fix bug in :meth:`DataFrame.convert_dtypes` for columns that were already using the ``"string"`` dtype (:issue:`31731`). - Fixed bug in setting values using a slice indexer with string dtype (:issue:`31772`) .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 9702eb4615909..d2f0b2ffbaeec 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1005,7 +1005,7 @@ _TYPE_MAP = { 'complex64': 'complex', 'complex128': 'complex', 'c': 'complex', - 'string': 'bytes', + 'string': 'string', 'S': 'bytes', 'U': 'string', 'bool': 'boolean', diff --git a/pandas/conftest.py b/pandas/conftest.py index 7463b2b579c0c..821bec19d6115 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -744,6 +744,7 @@ def any_numpy_dtype(request): # categoricals are handled separately _any_skipna_inferred_dtype = [ ("string", ["a", np.nan, "c"]), + ("string", ["a", pd.NA, "c"]), ("bytes", [b"a", np.nan, b"c"]), ("empty", [np.nan, np.nan, np.nan]), ("empty", []), @@ -754,6 +755,7 @@ def any_numpy_dtype(request): ("mixed-integer-float", [1, np.nan, 2.0]), ("decimal", [Decimal(1), np.nan, Decimal(2)]), ("boolean", [True, np.nan, False]), + ("boolean", [True, pd.NA, False]), ("datetime64", [np.datetime64("2013-01-01"), np.nan, np.datetime64("2018-01-01")]), ("datetime", [pd.Timestamp("20130101"), np.nan, pd.Timestamp("20180101")]), ("date", [date(2013, 1, 1), np.nan, date(2018, 1, 1)]), diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 48f9262ad3486..48ae1f67297af 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -1200,6 +1200,24 @@ def test_interval(self): inferred = lib.infer_dtype(pd.Series(idx), skipna=False) assert inferred == "interval" + @pytest.mark.parametrize("klass", [pd.array, pd.Series]) + @pytest.mark.parametrize("skipna", [True, False]) + @pytest.mark.parametrize("data", [["a", "b", "c"], ["a", "b", pd.NA]]) + def test_string_dtype(self, data, skipna, klass): + # StringArray + val = klass(data, dtype="string") + inferred = lib.infer_dtype(val, skipna=skipna) + assert inferred == "string" + + @pytest.mark.parametrize("klass", [pd.array, pd.Series]) + @pytest.mark.parametrize("skipna", [True, False]) + @pytest.mark.parametrize("data", [[True, False, True], [True, False, pd.NA]]) + def test_boolean_dtype(self, data, skipna, klass): + # BooleanArray + val = klass(data, dtype="boolean") + inferred = lib.infer_dtype(val, skipna=skipna) + assert inferred == "boolean" + class TestNumberScalar: def test_is_number(self): diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index 923b5a94c5f41..a6b5fed40a9d7 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -246,3 +246,12 @@ def test_convert_dtypes(self, data, maindtype, params, answerdict): # Make sure original not changed tm.assert_series_equal(series, copy) + + def test_convert_string_dtype(self): + # https://github.com/pandas-dev/pandas/issues/31731 -> converting columns + # that are already string dtype + df = pd.DataFrame( + {"A": ["a", "b", pd.NA], "B": ["ä", "ö", "ü"]}, dtype="string" + ) + result = df.convert_dtypes() + tm.assert_frame_equal(df, result) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 62d26dacde67b..1338d801e39f4 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -7,6 +7,7 @@ from pandas._libs import lib +import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, concat, isna, notna import pandas._testing as tm import pandas.core.strings as strings @@ -207,6 +208,9 @@ def test_api_per_dtype(self, index_or_series, dtype, any_skipna_inferred_dtype): box = index_or_series inferred_dtype, values = any_skipna_inferred_dtype + if dtype == "category" and len(values) and values[1] is pd.NA: + pytest.xfail(reason="Categorical does not yet support pd.NA") + t = box(values, dtype=dtype) # explicit dtype to avoid casting # TODO: get rid of these xfails