From 681ead8ce748d70394bb3c74519ca800e67f3e23 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 16 Jan 2023 00:19:52 +0100 Subject: [PATCH 1/3] ENH: Add use_nullable_dtypes to read_feather --- doc/source/whatsnew/v2.0.0.rst | 2 ++ pandas/io/feather_format.py | 49 +++++++++++++++++++++++--- pandas/tests/io/test_feather.py | 61 +++++++++++++++++++++++++++++++++ 3 files changed, 108 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 033f47f0c994d..18fd3ca829b31 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -45,6 +45,7 @@ The ``use_nullable_dtypes`` keyword argument has been expanded to the following * :func:`read_sql_query` * :func:`read_sql_table` * :func:`read_orc` +* :func:`read_feather` * :func:`to_numeric` Additionally a new global configuration, ``mode.dtype_backend`` can now be used in conjunction with the parameter ``use_nullable_dtypes=True`` in the following functions @@ -57,6 +58,7 @@ to select the nullable dtypes implementation. * :func:`read_xml` * :func:`read_parquet` * :func:`read_orc` +* :func:`read_feather` And the following methods will also utilize the ``mode.dtype_backend`` option. diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index e781da74e97aa..bf6ccc0bb226f 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -15,6 +15,10 @@ from pandas.compat._optional import import_optional_dependency from pandas.util._decorators import doc +from pandas import ( + arrays, + get_option, +) from pandas.core.api import ( DataFrame, NumericIndex, @@ -99,6 +103,7 @@ def read_feather( columns: Sequence[Hashable] | None = None, use_threads: bool = True, storage_options: StorageOptions = None, + use_nullable_dtypes: bool = False, ): """ Load a feather-format object from the file path. @@ -118,6 +123,19 @@ def read_feather( .. versionadded:: 1.2.0 + use_nullable_dtypes : bool = False + Whether or not to use nullable dtypes as default when reading data. If + set to True, nullable dtypes are used for all dtypes that have a nullable + implementation, even if no nulls are present. + + The nullable dtype implementation can be configured by calling + ``pd.set_option("mode.dtype_backend", "pandas")`` to use + numpy-backed nullable dtypes or + ``pd.set_option("mode.dtype_backend", "pyarrow")`` to use + pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``). + + .. versionadded:: 2.0 + Returns ------- type of object stored in file @@ -128,7 +146,30 @@ def read_feather( with get_handle( path, "rb", storage_options=storage_options, is_text=False ) as handles: - - return feather.read_feather( - handles.handle, columns=columns, use_threads=bool(use_threads) - ) + if not use_nullable_dtypes: + return feather.read_feather( + handles.handle, columns=columns, use_threads=bool(use_threads) + ) + + dtype_backend = get_option("mode.dtype_backend") + + if dtype_backend == "pandas": + from pandas.io._util import _arrow_dtype_mapping + + pa_table = feather.read_table( + handles.handle, columns=columns, use_threads=bool(use_threads) + ) + return pa_table.to_pandas(types_mapper=_arrow_dtype_mapping().get) + + elif dtype_backend == "pyarrow": + result = feather.read_table( + handles.handle, columns=columns, use_threads=bool(use_threads) + ) + return DataFrame( + { + col_name: arrays.ArrowExtensionArray(pa_col) + for col_name, pa_col in zip( + result.column_names, result.itercolumns() + ) + } + ) diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 88bf04f518e12..ed737cfce28f7 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -4,6 +4,10 @@ import pandas as pd import pandas._testing as tm +from pandas.core.arrays import ( + ArrowStringArray, + StringArray, +) from pandas.io.feather_format import read_feather, to_feather # isort:skip @@ -194,3 +198,60 @@ def test_http_path(self, feather_file): expected = read_feather(feather_file) res = read_feather(url) tm.assert_frame_equal(expected, res) + + @pytest.mark.parametrize("dtype_backend", ["pandas", "pyarrow"]) + def test_read_json_nullable(self, string_storage, dtype_backend): + # GH# + pa = pytest.importorskip("pyarrow") + df = pd.DataFrame( + { + "a": pd.Series([1, np.nan, 3], dtype="Int64"), + "b": pd.Series([1, 2, 3], dtype="Int64"), + "c": pd.Series([1.5, np.nan, 2.5], dtype="Float64"), + "d": pd.Series([1.5, 2.0, 2.5], dtype="Float64"), + "e": [True, False, None], + "f": [True, False, True], + "g": ["a", "b", "c"], + "h": ["a", "b", None], + } + ) + + if string_storage == "python": + string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_)) + string_array_na = StringArray(np.array(["a", "b", pd.NA], dtype=np.object_)) + + else: + string_array = ArrowStringArray(pa.array(["a", "b", "c"])) + string_array_na = ArrowStringArray(pa.array(["a", "b", None])) + + with tm.ensure_clean() as path: + to_feather(df, path) + with pd.option_context("mode.string_storage", string_storage): + with pd.option_context("mode.dtype_backend", dtype_backend): + result = read_feather(path, use_nullable_dtypes=True) + + expected = pd.DataFrame( + { + "a": pd.Series([1, np.nan, 3], dtype="Int64"), + "b": pd.Series([1, 2, 3], dtype="Int64"), + "c": pd.Series([1.5, np.nan, 2.5], dtype="Float64"), + "d": pd.Series([1.5, 2.0, 2.5], dtype="Float64"), + "e": pd.Series([True, False, pd.NA], dtype="boolean"), + "f": pd.Series([True, False, True], dtype="boolean"), + "g": string_array, + "h": string_array_na, + } + ) + + if dtype_backend == "pyarrow": + + from pandas.arrays import ArrowExtensionArray + + expected = pd.DataFrame( + { + col: ArrowExtensionArray(pa.array(expected[col], from_pandas=True)) + for col in expected.columns + } + ) + + tm.assert_frame_equal(result, expected) From 1cd8fe8b953e6696b06c903808890e9f8b12b86b Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 16 Jan 2023 00:20:46 +0100 Subject: [PATCH 2/3] Add gh ref --- pandas/tests/io/test_feather.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index ed737cfce28f7..28a6054098a6f 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -201,7 +201,7 @@ def test_http_path(self, feather_file): @pytest.mark.parametrize("dtype_backend", ["pandas", "pyarrow"]) def test_read_json_nullable(self, string_storage, dtype_backend): - # GH# + # GH#50765 pa = pytest.importorskip("pyarrow") df = pd.DataFrame( { From bdafeae04b900f859ff901d36e1d9fe85792c597 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 16 Jan 2023 19:53:59 +0100 Subject: [PATCH 3/3] Refactor --- pandas/io/feather_format.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index bf6ccc0bb226f..cb2890777621a 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -153,23 +153,21 @@ def read_feather( dtype_backend = get_option("mode.dtype_backend") + pa_table = feather.read_table( + handles.handle, columns=columns, use_threads=bool(use_threads) + ) + if dtype_backend == "pandas": from pandas.io._util import _arrow_dtype_mapping - pa_table = feather.read_table( - handles.handle, columns=columns, use_threads=bool(use_threads) - ) return pa_table.to_pandas(types_mapper=_arrow_dtype_mapping().get) elif dtype_backend == "pyarrow": - result = feather.read_table( - handles.handle, columns=columns, use_threads=bool(use_threads) - ) return DataFrame( { col_name: arrays.ArrowExtensionArray(pa_col) for col_name, pa_col in zip( - result.column_names, result.itercolumns() + pa_table.column_names, pa_table.itercolumns() ) } )