diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index 033f47f0c994d..18fd3ca829b31 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -45,6 +45,7 @@ The ``use_nullable_dtypes`` keyword argument has been expanded to the following
 * :func:`read_sql_query`
 * :func:`read_sql_table`
 * :func:`read_orc`
+* :func:`read_feather`
 * :func:`to_numeric`
 
 Additionally a new global configuration, ``mode.dtype_backend`` can now be used in conjunction with the parameter ``use_nullable_dtypes=True`` in the following functions
@@ -57,6 +58,7 @@ to select the nullable dtypes implementation.
 * :func:`read_xml`
 * :func:`read_parquet`
 * :func:`read_orc`
+* :func:`read_feather`
 
 And the following methods will also utilize the ``mode.dtype_backend`` option.
 
diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py
index e781da74e97aa..cb2890777621a 100644
--- a/pandas/io/feather_format.py
+++ b/pandas/io/feather_format.py
@@ -15,6 +15,10 @@
 from pandas.compat._optional import import_optional_dependency
 from pandas.util._decorators import doc
 
+from pandas import (
+    arrays,
+    get_option,
+)
 from pandas.core.api import (
     DataFrame,
     NumericIndex,
@@ -99,6 +103,7 @@ def read_feather(
     columns: Sequence[Hashable] | None = None,
     use_threads: bool = True,
     storage_options: StorageOptions = None,
+    use_nullable_dtypes: bool = False,
 ):
     """
     Load a feather-format object from the file path.
@@ -118,6 +123,19 @@ def read_feather(
 
         .. versionadded:: 1.2.0
 
+    use_nullable_dtypes : bool, default False
+        Whether or not to use nullable dtypes as default when reading data. If
+        set to True, nullable dtypes are used for all dtypes that have a nullable
+        implementation, even if no nulls are present.
+
+        The nullable dtype implementation can be configured by calling
+        ``pd.set_option("mode.dtype_backend", "pandas")`` to use
+        numpy-backed nullable dtypes or
+        ``pd.set_option("mode.dtype_backend", "pyarrow")`` to use
+        pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``).
+
+        .. versionadded:: 2.0
+
     Returns
     -------
     type of object stored in file
@@ -128,7 +146,33 @@ def read_feather(
     with get_handle(
         path, "rb", storage_options=storage_options, is_text=False
     ) as handles:
+        if not use_nullable_dtypes:
+            return feather.read_feather(
+                handles.handle, columns=columns, use_threads=bool(use_threads)
+            )
 
-        return feather.read_feather(
+        # Nullable path: read a pyarrow Table and convert it per dtype backend.
+        dtype_backend = get_option("mode.dtype_backend")
+
+        pa_table = feather.read_table(
             handles.handle, columns=columns, use_threads=bool(use_threads)
         )
+
+        if dtype_backend == "pandas":
+            from pandas.io._util import _arrow_dtype_mapping
+
+            return pa_table.to_pandas(types_mapper=_arrow_dtype_mapping().get)
+
+        elif dtype_backend == "pyarrow":
+            return DataFrame(
+                {
+                    col_name: arrays.ArrowExtensionArray(pa_col)
+                    for col_name, pa_col in zip(
+                        pa_table.column_names, pa_table.itercolumns()
+                    )
+                }
+            )
+
+        else:
+            # Fail loudly instead of implicitly returning None.
+            raise NotImplementedError(f"Unsupported dtype backend: {dtype_backend!r}")
diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py
index 88bf04f518e12..28a6054098a6f 100644
--- a/pandas/tests/io/test_feather.py
+++ b/pandas/tests/io/test_feather.py
@@ -4,6 +4,10 @@
 
 import pandas as pd
 import pandas._testing as tm
+from pandas.core.arrays import (
+    ArrowStringArray,
+    StringArray,
+)
 
 from pandas.io.feather_format import read_feather, to_feather  # isort:skip
 
@@ -194,3 +198,60 @@ def test_http_path(self, feather_file):
         expected = read_feather(feather_file)
         res = read_feather(url)
         tm.assert_frame_equal(expected, res)
+
+    @pytest.mark.parametrize("dtype_backend", ["pandas", "pyarrow"])
+    def test_read_feather_nullable(self, string_storage, dtype_backend):
+        # GH#50765
+        pa = pytest.importorskip("pyarrow")
+        df = pd.DataFrame(
+            {
+                "a": pd.Series([1, np.nan, 3], dtype="Int64"),
+                "b": pd.Series([1, 2, 3], dtype="Int64"),
+                "c": pd.Series([1.5, np.nan, 2.5], dtype="Float64"),
+                "d": pd.Series([1.5, 2.0, 2.5], dtype="Float64"),
+                "e": [True, False, None],
+                "f": [True, False, True],
+                "g": ["a", "b", "c"],
+                "h": ["a", "b", None],
+            }
+        )
+
+        if string_storage == "python":
+            string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
+            string_array_na = StringArray(np.array(["a", "b", pd.NA], dtype=np.object_))
+
+        else:
+            string_array = ArrowStringArray(pa.array(["a", "b", "c"]))
+            string_array_na = ArrowStringArray(pa.array(["a", "b", None]))
+
+        with tm.ensure_clean() as path:
+            to_feather(df, path)
+            with pd.option_context("mode.string_storage", string_storage):
+                with pd.option_context("mode.dtype_backend", dtype_backend):
+                    result = read_feather(path, use_nullable_dtypes=True)
+
+        expected = pd.DataFrame(
+            {
+                "a": pd.Series([1, np.nan, 3], dtype="Int64"),
+                "b": pd.Series([1, 2, 3], dtype="Int64"),
+                "c": pd.Series([1.5, np.nan, 2.5], dtype="Float64"),
+                "d": pd.Series([1.5, 2.0, 2.5], dtype="Float64"),
+                "e": pd.Series([True, False, pd.NA], dtype="boolean"),
+                "f": pd.Series([True, False, True], dtype="boolean"),
+                "g": string_array,
+                "h": string_array_na,
+            }
+        )
+
+        if dtype_backend == "pyarrow":
+
+            from pandas.arrays import ArrowExtensionArray
+
+            expected = pd.DataFrame(
+                {
+                    col: ArrowExtensionArray(pa.array(expected[col], from_pandas=True))
+                    for col in expected.columns
+                }
+            )
+
+        tm.assert_frame_equal(result, expected)