From 28090cf9dee13ee19011460be95ffd4d13d441d9 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 30 Dec 2022 15:27:49 +0100 Subject: [PATCH 1/5] ENH: Add pandas nullable support to read_orc --- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/io/_util.py | 20 +++++++++++++++++ pandas/io/orc.py | 26 ++++++++++++--------- pandas/io/parquet.py | 19 ++++------------ pandas/tests/io/test_orc.py | 41 ++++++++++++++++++++++++++++++++++ 5 files changed, 81 insertions(+), 26 deletions(-) create mode 100644 pandas/io/_util.py diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index b1387e9717079..a54e6706fa286 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -42,6 +42,7 @@ The ``use_nullable_dtypes`` keyword argument has been expanded to the following * :func:`read_sql` * :func:`read_sql_query` * :func:`read_sql_table` +* :func:`read_orc` Additionally a new global configuration, ``mode.dtype_backend`` can now be used in conjunction with the parameter ``use_nullable_dtypes=True`` in the following functions to select the nullable dtypes implementation. diff --git a/pandas/io/_util.py b/pandas/io/_util.py new file mode 100644 index 0000000000000..14990d3cdced7 --- /dev/null +++ b/pandas/io/_util.py @@ -0,0 +1,20 @@ +from __future__ import annotations + +import pandas as pd + + +def _arrow_dtype_mapping(api) -> dict: + return { + api.int8(): pd.Int8Dtype(), + api.int16(): pd.Int16Dtype(), + api.int32(): pd.Int32Dtype(), + api.int64(): pd.Int64Dtype(), + api.uint8(): pd.UInt8Dtype(), + api.uint16(): pd.UInt16Dtype(), + api.uint32(): pd.UInt32Dtype(), + api.uint64(): pd.UInt64Dtype(), + api.bool_(): pd.BooleanDtype(), + api.string(): pd.StringDtype(), + api.float32(): pd.Float32Dtype(), + api.float64(): pd.Float64Dtype(), + } diff --git a/pandas/io/orc.py b/pandas/io/orc.py index cfa02de9bbcb3..8b7416b429c0a 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -91,18 +91,22 @@ def read_orc( pa_table = orc_file.read(columns=columns, **kwargs) if use_nullable_dtypes: dtype_backend = get_option("mode.dtype_backend") - if dtype_backend != "pyarrow": - raise NotImplementedError( - f"mode.dtype_backend set to {dtype_backend} is not implemented." + if dtype_backend == "pyarrow": + df = DataFrame( + { + col_name: ArrowExtensionArray(pa_col) + for col_name, pa_col in zip( + pa_table.column_names, pa_table.itercolumns() + ) + } ) - df = DataFrame( - { - col_name: ArrowExtensionArray(pa_col) - for col_name, pa_col in zip( - pa_table.column_names, pa_table.itercolumns() - ) - } - ) + else: + from pandas.io._util import _arrow_dtype_mapping + + pa = import_optional_dependency("pyarrow") + mapping = _arrow_dtype_mapping(pa) + + df = pa_table.to_pandas(types_mapper=mapping.get) return df else: return pa_table.to_pandas() diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 568747685a36e..67b18f527505d 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -225,24 +225,13 @@ def read( dtype_backend = get_option("mode.dtype_backend") to_pandas_kwargs = {} if use_nullable_dtypes: - import pandas as pd if dtype_backend == "pandas": - mapping = { - self.api.int8(): pd.Int8Dtype(), - self.api.int16(): pd.Int16Dtype(), - self.api.int32(): pd.Int32Dtype(), - self.api.int64(): pd.Int64Dtype(), - self.api.uint8(): pd.UInt8Dtype(), - self.api.uint16(): pd.UInt16Dtype(), - self.api.uint32(): pd.UInt32Dtype(), - self.api.uint64(): pd.UInt64Dtype(), - self.api.bool_(): pd.BooleanDtype(), - self.api.string(): pd.StringDtype(), - self.api.float32(): pd.Float32Dtype(), - self.api.float64(): pd.Float64Dtype(), - } + from pandas.io._util import _arrow_dtype_mapping + + mapping = _arrow_dtype_mapping(self.api) to_pandas_kwargs["types_mapper"] = mapping.get + manager = get_option("mode.data_manager") if manager == "array": to_pandas_kwargs["split_blocks"] = True # type: ignore[assignment] diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index 87f648bb5acd6..8c1a2a347daba 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -11,6 +11,7 @@ import pandas as pd from pandas import read_orc import pandas._testing as tm +from pandas.core.arrays import StringArray pytest.importorskip("pyarrow.orc") @@ -346,3 +347,43 @@ def test_orc_use_nullable_dtypes_pyarrow_backend(): } ) tm.assert_frame_equal(result, expected) + + +@td.skip_if_no("pyarrow", min_version="7.0.0") +def test_orc_use_nullable_dtypes_pandas_backend(): + df = pd.DataFrame( + { + "string": list("abc"), + "string_with_nan": ["a", np.nan, "c"], + "string_with_none": ["a", None, "c"], + "int": list(range(1, 4)), + "int_with_nan": pd.Series([1, pd.NA, 3], dtype="Int64"), + "na_only": pd.Series([pd.NA, pd.NA, pd.NA], dtype="Int64"), + "float": np.arange(4.0, 7.0, dtype="float64"), + "float_with_nan": [2.0, np.nan, 3.0], + "bool": [True, False, True], + "bool_with_na": [True, False, None], + } + ) + bytes_data = df.copy().to_orc() + with pd.option_context("mode.dtype_backend", "pandas"): + result = read_orc(BytesIO(bytes_data), use_nullable_dtypes=True) + expected = pd.DataFrame( + { + "string": StringArray(np.array(["a", "b", "c"], dtype=np.object_)), + "string_with_nan": StringArray( + np.array(["a", pd.NA, "c"], dtype=np.object_) + ), + "string_with_none": StringArray( + np.array(["a", pd.NA, "c"], dtype=np.object_) + ), + "int": pd.Series([1, 2, 3], dtype="Int64"), + "int_with_nan": pd.Series([1, pd.NA, 3], dtype="Int64"), + "na_only": pd.Series([pd.NA, pd.NA, pd.NA], dtype="Int64"), + "float": pd.Series([4.0, 5.0, 6.0], dtype="Float64"), + "float_with_nan": pd.Series([2.0, pd.NA, 3.0], dtype="Float64"), + "bool": pd.Series([True, False, True], dtype="boolean"), + "bool_with_na": pd.Series([True, False, pd.NA], dtype="boolean"), + } + ) + tm.assert_frame_equal(result, expected) From 3f789277d3dce7dce34d250d94f401514d2b3e37 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 30 Dec 2022 15:28:29 +0100 Subject: [PATCH 2/5] Add gh ref --- pandas/tests/io/test_orc.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index 8c1a2a347daba..0934d12f9c48d 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -351,6 +351,7 @@ def test_orc_use_nullable_dtypes_pyarrow_backend(): @td.skip_if_no("pyarrow", min_version="7.0.0") def test_orc_use_nullable_dtypes_pandas_backend(): + # GH#50503 df = pd.DataFrame( { "string": list("abc"), From 41cc25a9012ef99447ecf22bf26d0eb12a7c7401 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 30 Dec 2022 16:11:45 +0100 Subject: [PATCH 3/5] Remove test --- pandas/tests/io/test_orc.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index 0934d12f9c48d..52f2b52f617ee 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -306,16 +306,6 @@ def test_orc_writer_dtypes_not_supported(df_not_supported): df_not_supported.to_orc() -def test_orc_use_nullable_dtypes_pandas_backend_not_supported(dirpath): - input_file = os.path.join(dirpath, "TestOrcFile.emptyFile.orc") - with pytest.raises( - NotImplementedError, - match="mode.dtype_backend set to pandas is not implemented.", - ): - with pd.option_context("mode.dtype_backend", "pandas"): - read_orc(input_file, use_nullable_dtypes=True) - - @td.skip_if_no("pyarrow", min_version="7.0.0") def test_orc_use_nullable_dtypes_pyarrow_backend(): df = pd.DataFrame( From 7e06342af67362a15d5e8c2fa20135fe5888bd45 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 3 Jan 2023 22:43:23 +0100 Subject: [PATCH 4/5] Reformat --- pandas/tests/io/test_orc.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index 52f2b52f617ee..d5c03dcc85a0d 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -327,15 +327,18 @@ def test_orc_use_nullable_dtypes_pyarrow_backend(): ], } ) + bytes_data = df.copy().to_orc() with pd.option_context("mode.dtype_backend", "pyarrow"): result = read_orc(BytesIO(bytes_data), use_nullable_dtypes=True) + expected = pd.DataFrame( { col: pd.arrays.ArrowExtensionArray(pa.array(df[col], from_pandas=True)) for col in df.columns } ) + tm.assert_frame_equal(result, expected) @@ -356,9 +359,11 @@ def test_orc_use_nullable_dtypes_pandas_backend(): "bool_with_na": [True, False, None], } ) + bytes_data = df.copy().to_orc() with pd.option_context("mode.dtype_backend", "pandas"): result = read_orc(BytesIO(bytes_data), use_nullable_dtypes=True) + expected = pd.DataFrame( { "string": StringArray(np.array(["a", "b", "c"], dtype=np.object_)), @@ -377,4 +382,5 @@ def test_orc_use_nullable_dtypes_pandas_backend(): "bool_with_na": pd.Series([True, False, pd.NA], dtype="boolean"), } ) + tm.assert_frame_equal(result, expected) From c7df8a7ebdcd4b141aa1a27fea6833e170e62f7b Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 4 Jan 2023 21:27:34 +0100 Subject: [PATCH 5/5] Move import --- pandas/io/_util.py | 29 ++++++++++++++++------------- pandas/io/orc.py | 4 +--- pandas/io/parquet.py | 2 +- 3 files changed, 18 insertions(+), 17 deletions(-) diff --git a/pandas/io/_util.py b/pandas/io/_util.py index 14990d3cdced7..d2a001f0cf925 100644 --- a/pandas/io/_util.py +++ b/pandas/io/_util.py @@ -1,20 +1,23 @@ from __future__ import annotations +from pandas.compat._optional import import_optional_dependency + import pandas as pd -def _arrow_dtype_mapping(api) -> dict: +def _arrow_dtype_mapping() -> dict: + pa = import_optional_dependency("pyarrow") return { - api.int8(): pd.Int8Dtype(), - api.int16(): pd.Int16Dtype(), - api.int32(): pd.Int32Dtype(), - api.int64(): pd.Int64Dtype(), - api.uint8(): pd.UInt8Dtype(), - api.uint16(): pd.UInt16Dtype(), - api.uint32(): pd.UInt32Dtype(), - api.uint64(): pd.UInt64Dtype(), - api.bool_(): pd.BooleanDtype(), - api.string(): pd.StringDtype(), - api.float32(): pd.Float32Dtype(), - api.float64(): pd.Float64Dtype(), + pa.int8(): pd.Int8Dtype(), + pa.int16(): pd.Int16Dtype(), + pa.int32(): pd.Int32Dtype(), + pa.int64(): pd.Int64Dtype(), + pa.uint8(): pd.UInt8Dtype(), + pa.uint16(): pd.UInt16Dtype(), + pa.uint32(): pd.UInt32Dtype(), + pa.uint64(): pd.UInt64Dtype(), + pa.bool_(): pd.BooleanDtype(), + pa.string(): pd.StringDtype(), + pa.float32(): pd.Float32Dtype(), + pa.float64(): pd.Float64Dtype(), } diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 8b7416b429c0a..169cb5d16da8d 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -103,9 +103,7 @@ def read_orc( else: from pandas.io._util import _arrow_dtype_mapping - pa = import_optional_dependency("pyarrow") - mapping = _arrow_dtype_mapping(pa) - + mapping = _arrow_dtype_mapping() df = pa_table.to_pandas(types_mapper=mapping.get) return df else: diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 67b18f527505d..67e00dde5498b 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -229,7 +229,7 @@ def read( if dtype_backend == "pandas": from pandas.io._util import _arrow_dtype_mapping - mapping = _arrow_dtype_mapping(self.api) + mapping = _arrow_dtype_mapping() to_pandas_kwargs["types_mapper"] = mapping.get manager = get_option("mode.data_manager")