diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index c814e585672cb..2f98f03efbd14 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -47,6 +47,7 @@ The ``use_nullable_dtypes`` keyword argument has been expanded to the following
 * :func:`read_sql_table`
 * :func:`read_orc`
 * :func:`read_feather`
+* :func:`read_spss`
 * :func:`to_numeric`
 
 To simplify opting-in to nullable dtypes for these functions, a new option ``nullable_dtypes`` was added that allows setting
@@ -75,6 +76,7 @@ to select the nullable dtypes implementation.
 * :func:`read_parquet`
 * :func:`read_orc`
 * :func:`read_feather`
+* :func:`read_spss`
 * :func:`to_numeric`
 
 
diff --git a/pandas/io/spss.py b/pandas/io/spss.py
index 630b78497d32f..bb9ace600e6f2 100644
--- a/pandas/io/spss.py
+++ b/pandas/io/spss.py
@@ -6,6 +6,9 @@
     Sequence,
 )
 
+from pandas._config import using_nullable_dtypes
+
+from pandas._libs import lib
 from pandas.compat._optional import import_optional_dependency
 
 from pandas.core.dtypes.inference import is_list_like
@@ -20,6 +23,7 @@ def read_spss(
     path: str | Path,
     usecols: Sequence[str] | None = None,
     convert_categoricals: bool = True,
+    use_nullable_dtypes: bool | lib.NoDefault = lib.no_default,
 ) -> DataFrame:
     """
     Load an SPSS file from the file path, returning a DataFrame.
@@ -32,6 +36,20 @@ def read_spss(
         Return a subset of the columns. If None, return all columns.
     convert_categoricals : bool, default is True
         Convert categorical columns into pd.Categorical.
+    use_nullable_dtypes : bool = False
+        Whether to use nullable dtypes as default when reading data. If
+        set to True, nullable dtypes are used for all dtypes that have a nullable
+        implementation, even if no nulls are present.
+
+        .. note::
+
+            The nullable dtype implementation can be configured by calling
+            ``pd.set_option("mode.dtype_backend", "pandas")`` to use
+            numpy-backed nullable dtypes or
+            ``pd.set_option("mode.dtype_backend", "pyarrow")`` to use
+            pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``).
+
+        .. versionadded:: 2.0
 
     Returns
     -------
@@ -39,6 +57,12 @@ def read_spss(
     """
     pyreadstat = import_optional_dependency("pyreadstat")
 
+    use_nullable_dtypes = (
+        use_nullable_dtypes
+        if use_nullable_dtypes is not lib.no_default
+        else using_nullable_dtypes()
+    )
+
     if usecols is not None:
         if not is_list_like(usecols):
             raise TypeError("usecols must be list-like.")
@@ -47,4 +71,6 @@ def read_spss(
     df, _ = pyreadstat.read_sav(
         stringify_path(path), usecols=usecols, apply_value_formats=convert_categoricals
     )
+    if use_nullable_dtypes:
+        df = df.convert_dtypes()
     return df
diff --git a/pandas/tests/io/test_spss.py b/pandas/tests/io/test_spss.py
index d507ab07b7cd1..c2bcf3601d5fa 100644
--- a/pandas/tests/io/test_spss.py
+++ b/pandas/tests/io/test_spss.py
@@ -80,3 +80,27 @@ def test_spss_usecols(datapath):
 
     with pytest.raises(TypeError, match="usecols must be list-like."):
         pd.read_spss(fname, usecols="VAR00002")
+
+
+@pytest.mark.parametrize("dtype_backend", ["pandas", "pyarrow"])
+def test_spss_umlauts_use_nullable_dtypes(datapath, dtype_backend):
+    # test file from the Haven project (https://haven.tidyverse.org/)
+    fname = datapath("io", "data", "spss", "umlauts.sav")
+
+    with pd.option_context("mode.dtype_backend", dtype_backend):
+        df = pd.read_spss(fname, convert_categoricals=False, use_nullable_dtypes=True)
+    expected = pd.DataFrame({"var1": [1.0, 2.0, 1.0, 3.0]}, dtype="Int64")
+
+    if dtype_backend == "pyarrow":
+        pa = pytest.importorskip("pyarrow")
+
+        from pandas.arrays import ArrowExtensionArray
+
+        expected = pd.DataFrame(
+            {
+                col: ArrowExtensionArray(pa.array(expected[col], from_pandas=True))
+                for col in expected.columns
+            }
+        )
+
+    tm.assert_frame_equal(df, expected)
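Usage sketch (not part of the patch): the following assumes a pandas build that includes this change, an installed ``pyreadstat``, and a placeholder file ``survey.sav`` that does not ship with the repository. It shows the new keyword per call and the ``mode.dtype_backend`` option from the docstring note and the test above.

# Minimal, hypothetical usage sketch for the new use_nullable_dtypes keyword.
# "survey.sav" is a placeholder path, not a file provided by pandas.
import pandas as pd

# Opt in per call: columns come back as nullable dtypes (e.g. Int64, Float64)
# even when the data contains no missing values.
df = pd.read_spss("survey.sav", convert_categoricals=False, use_nullable_dtypes=True)

# Select the pyarrow-backed implementation via the option exercised in the test.
with pd.option_context("mode.dtype_backend", "pyarrow"):
    df_arrow = pd.read_spss(
        "survey.sav", convert_categoricals=False, use_nullable_dtypes=True
    )

print(df.dtypes)        # numpy-backed nullable dtypes
print(df_arrow.dtypes)  # pd.ArrowDtype-backed columns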