diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index b1387e9717079..a54e6706fa286 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -42,6 +42,7 @@ The ``use_nullable_dtypes`` keyword argument has been expanded to the following * :func:`read_sql` * :func:`read_sql_query` * :func:`read_sql_table` +* :func:`read_orc` Additionally a new global configuration, ``mode.dtype_backend`` can now be used in conjunction with the parameter ``use_nullable_dtypes=True`` in the following functions to select the nullable dtypes implementation. diff --git a/pandas/io/_util.py b/pandas/io/_util.py new file mode 100644 index 0000000000000..d2a001f0cf925 --- /dev/null +++ b/pandas/io/_util.py @@ -0,0 +1,23 @@ +from __future__ import annotations + +from pandas.compat._optional import import_optional_dependency + +import pandas as pd + + +def _arrow_dtype_mapping() -> dict: + pa = import_optional_dependency("pyarrow") + return { + pa.int8(): pd.Int8Dtype(), + pa.int16(): pd.Int16Dtype(), + pa.int32(): pd.Int32Dtype(), + pa.int64(): pd.Int64Dtype(), + pa.uint8(): pd.UInt8Dtype(), + pa.uint16(): pd.UInt16Dtype(), + pa.uint32(): pd.UInt32Dtype(), + pa.uint64(): pd.UInt64Dtype(), + pa.bool_(): pd.BooleanDtype(), + pa.string(): pd.StringDtype(), + pa.float32(): pd.Float32Dtype(), + pa.float64(): pd.Float64Dtype(), + } diff --git a/pandas/io/orc.py b/pandas/io/orc.py index cfa02de9bbcb3..169cb5d16da8d 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -91,18 +91,20 @@ def read_orc( pa_table = orc_file.read(columns=columns, **kwargs) if use_nullable_dtypes: dtype_backend = get_option("mode.dtype_backend") - if dtype_backend != "pyarrow": - raise NotImplementedError( - f"mode.dtype_backend set to {dtype_backend} is not implemented." + if dtype_backend == "pyarrow": + df = DataFrame( + { + col_name: ArrowExtensionArray(pa_col) + for col_name, pa_col in zip( + pa_table.column_names, pa_table.itercolumns() + ) + } ) - df = DataFrame( - { - col_name: ArrowExtensionArray(pa_col) - for col_name, pa_col in zip( - pa_table.column_names, pa_table.itercolumns() - ) - } - ) + else: + from pandas.io._util import _arrow_dtype_mapping + + mapping = _arrow_dtype_mapping() + df = pa_table.to_pandas(types_mapper=mapping.get) return df else: return pa_table.to_pandas() diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 568747685a36e..67e00dde5498b 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -225,24 +225,13 @@ def read( dtype_backend = get_option("mode.dtype_backend") to_pandas_kwargs = {} if use_nullable_dtypes: - import pandas as pd if dtype_backend == "pandas": - mapping = { - self.api.int8(): pd.Int8Dtype(), - self.api.int16(): pd.Int16Dtype(), - self.api.int32(): pd.Int32Dtype(), - self.api.int64(): pd.Int64Dtype(), - self.api.uint8(): pd.UInt8Dtype(), - self.api.uint16(): pd.UInt16Dtype(), - self.api.uint32(): pd.UInt32Dtype(), - self.api.uint64(): pd.UInt64Dtype(), - self.api.bool_(): pd.BooleanDtype(), - self.api.string(): pd.StringDtype(), - self.api.float32(): pd.Float32Dtype(), - self.api.float64(): pd.Float64Dtype(), - } + from pandas.io._util import _arrow_dtype_mapping + + mapping = _arrow_dtype_mapping() to_pandas_kwargs["types_mapper"] = mapping.get + manager = get_option("mode.data_manager") if manager == "array": to_pandas_kwargs["split_blocks"] = True # type: ignore[assignment] diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index 87f648bb5acd6..d5c03dcc85a0d 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -11,6 +11,7 @@ import pandas as pd from pandas import read_orc import pandas._testing as tm +from pandas.core.arrays import StringArray pytest.importorskip("pyarrow.orc") @@ -305,16 +306,6 @@ def test_orc_writer_dtypes_not_supported(df_not_supported): df_not_supported.to_orc() -def test_orc_use_nullable_dtypes_pandas_backend_not_supported(dirpath): - input_file = os.path.join(dirpath, "TestOrcFile.emptyFile.orc") - with pytest.raises( - NotImplementedError, - match="mode.dtype_backend set to pandas is not implemented.", - ): - with pd.option_context("mode.dtype_backend", "pandas"): - read_orc(input_file, use_nullable_dtypes=True) - - @td.skip_if_no("pyarrow", min_version="7.0.0") def test_orc_use_nullable_dtypes_pyarrow_backend(): df = pd.DataFrame( @@ -336,13 +327,60 @@ def test_orc_use_nullable_dtypes_pyarrow_backend(): ], } ) + bytes_data = df.copy().to_orc() with pd.option_context("mode.dtype_backend", "pyarrow"): result = read_orc(BytesIO(bytes_data), use_nullable_dtypes=True) + expected = pd.DataFrame( { col: pd.arrays.ArrowExtensionArray(pa.array(df[col], from_pandas=True)) for col in df.columns } ) + + tm.assert_frame_equal(result, expected) + + +@td.skip_if_no("pyarrow", min_version="7.0.0") +def test_orc_use_nullable_dtypes_pandas_backend(): + # GH#50503 + df = pd.DataFrame( + { + "string": list("abc"), + "string_with_nan": ["a", np.nan, "c"], + "string_with_none": ["a", None, "c"], + "int": list(range(1, 4)), + "int_with_nan": pd.Series([1, pd.NA, 3], dtype="Int64"), + "na_only": pd.Series([pd.NA, pd.NA, pd.NA], dtype="Int64"), + "float": np.arange(4.0, 7.0, dtype="float64"), + "float_with_nan": [2.0, np.nan, 3.0], + "bool": [True, False, True], + "bool_with_na": [True, False, None], + } + ) + + bytes_data = df.copy().to_orc() + with pd.option_context("mode.dtype_backend", "pandas"): + result = read_orc(BytesIO(bytes_data), use_nullable_dtypes=True) + + expected = pd.DataFrame( + { + "string": StringArray(np.array(["a", "b", "c"], dtype=np.object_)), + "string_with_nan": StringArray( + np.array(["a", pd.NA, "c"], dtype=np.object_) + ), + "string_with_none": StringArray( + np.array(["a", pd.NA, "c"], dtype=np.object_) + ), + "int": pd.Series([1, 2, 3], dtype="Int64"), + "int_with_nan": pd.Series([1, pd.NA, 3], dtype="Int64"), + "na_only": pd.Series([pd.NA, pd.NA, pd.NA], dtype="Int64"), + "float": pd.Series([4.0, 5.0, 6.0], dtype="Float64"), + "float_with_nan": pd.Series([2.0, pd.NA, 3.0], dtype="Float64"), + "bool": pd.Series([True, False, True], dtype="boolean"), + "bool_with_na": pd.Series([True, False, pd.NA], dtype="boolean"), + } + ) + tm.assert_frame_equal(result, expected)