ENH: Add pandas nullable support to read_orc (#50503)

phofl · web-flow · commit 222e37ddc7cb · 2023-01-05T15:43:15.000-08:00
* ENH: Add pandas nullable support to read_orc

* Add gh ref

* Remove test

* Reformat

* Move import
diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
@@ -43,6 +43,7 @@ The ``use_nullable_dtypes`` keyword argument has been expanded to the following
 * :func:`read_sql`
 * :func:`read_sql_query`
 * :func:`read_sql_table`
+* :func:`read_orc`
 
 Additionally a new global configuration, ``mode.dtype_backend`` can now be used in conjunction with the parameter ``use_nullable_dtypes=True`` in the following functions
 to select the nullable dtypes implementation.
diff --git a/pandas/io/_util.py b/pandas/io/_util.py
@@ -0,0 +1,23 @@
+from __future__ import annotations
+
+from pandas.compat._optional import import_optional_dependency
+
+import pandas as pd
+
+
+def _arrow_dtype_mapping() -> dict:  # pyarrow type -> pandas nullable dtype
+    pa = import_optional_dependency("pyarrow")  # resolved per call, not at import
+    return {
+        pa.int8(): pd.Int8Dtype(),
+        pa.int16(): pd.Int16Dtype(),
+        pa.int32(): pd.Int32Dtype(),
+        pa.int64(): pd.Int64Dtype(),
+        pa.uint8(): pd.UInt8Dtype(),
+        pa.uint16(): pd.UInt16Dtype(),
+        pa.uint32(): pd.UInt32Dtype(),
+        pa.uint64(): pd.UInt64Dtype(),
+        pa.bool_(): pd.BooleanDtype(),
+        pa.string(): pd.StringDtype(),
+        pa.float32(): pd.Float32Dtype(),
+        pa.float64(): pd.Float64Dtype(),
+    }  # callers pass this dict's ``.get`` as pyarrow's ``types_mapper``
diff --git a/pandas/io/orc.py b/pandas/io/orc.py
@@ -91,18 +91,20 @@ def read_orc(
         pa_table = orc_file.read(columns=columns, **kwargs)
     if use_nullable_dtypes:
         dtype_backend = get_option("mode.dtype_backend")
-        if dtype_backend != "pyarrow":
-            raise NotImplementedError(
-                f"mode.dtype_backend set to {dtype_backend} is not implemented."
+        if dtype_backend == "pyarrow":
+            df = DataFrame(
+                {
+                    col_name: ArrowExtensionArray(pa_col)
+                    for col_name, pa_col in zip(
+                        pa_table.column_names, pa_table.itercolumns()
+                    )
+                }
             )
-        df = DataFrame(
-            {
-                col_name: ArrowExtensionArray(pa_col)
-                for col_name, pa_col in zip(
-                    pa_table.column_names, pa_table.itercolumns()
-                )
-            }
-        )
+        else:
+            from pandas.io._util import _arrow_dtype_mapping
+
+            mapping = _arrow_dtype_mapping()
+            df = pa_table.to_pandas(types_mapper=mapping.get)
         return df
     else:
         return pa_table.to_pandas()
diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
@@ -225,24 +225,13 @@ def read(
         dtype_backend = get_option("mode.dtype_backend")
         to_pandas_kwargs = {}
         if use_nullable_dtypes:
-            import pandas as pd
 
             if dtype_backend == "pandas":
-                mapping = {
-                    self.api.int8(): pd.Int8Dtype(),
-                    self.api.int16(): pd.Int16Dtype(),
-                    self.api.int32(): pd.Int32Dtype(),
-                    self.api.int64(): pd.Int64Dtype(),
-                    self.api.uint8(): pd.UInt8Dtype(),
-                    self.api.uint16(): pd.UInt16Dtype(),
-                    self.api.uint32(): pd.UInt32Dtype(),
-                    self.api.uint64(): pd.UInt64Dtype(),
-                    self.api.bool_(): pd.BooleanDtype(),
-                    self.api.string(): pd.StringDtype(),
-                    self.api.float32(): pd.Float32Dtype(),
-                    self.api.float64(): pd.Float64Dtype(),
-                }
+                from pandas.io._util import _arrow_dtype_mapping
+
+                mapping = _arrow_dtype_mapping()
                 to_pandas_kwargs["types_mapper"] = mapping.get
+
         manager = get_option("mode.data_manager")
         if manager == "array":
             to_pandas_kwargs["split_blocks"] = True  # type: ignore[assignment]
diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py
@@ -11,6 +11,7 @@
 import pandas as pd
 from pandas import read_orc
 import pandas._testing as tm
+from pandas.core.arrays import StringArray
 
 pytest.importorskip("pyarrow.orc")
 
@@ -305,16 +306,6 @@ def test_orc_writer_dtypes_not_supported(df_not_supported):
         df_not_supported.to_orc()
 
 
-def test_orc_use_nullable_dtypes_pandas_backend_not_supported(dirpath):
-    input_file = os.path.join(dirpath, "TestOrcFile.emptyFile.orc")
-    with pytest.raises(
-        NotImplementedError,
-        match="mode.dtype_backend set to pandas is not implemented.",
-    ):
-        with pd.option_context("mode.dtype_backend", "pandas"):
-            read_orc(input_file, use_nullable_dtypes=True)
-
-
 @td.skip_if_no("pyarrow", min_version="7.0.0")
 def test_orc_use_nullable_dtypes_pyarrow_backend():
     df = pd.DataFrame(
@@ -336,13 +327,60 @@ def test_orc_use_nullable_dtypes_pyarrow_backend():
             ],
         }
     )
+
     bytes_data = df.copy().to_orc()
     with pd.option_context("mode.dtype_backend", "pyarrow"):
         result = read_orc(BytesIO(bytes_data), use_nullable_dtypes=True)
+
     expected = pd.DataFrame(
         {
             col: pd.arrays.ArrowExtensionArray(pa.array(df[col], from_pandas=True))
             for col in df.columns
         }
     )
+
+    tm.assert_frame_equal(result, expected)
+
+
+@td.skip_if_no("pyarrow", min_version="7.0.0")
+def test_orc_use_nullable_dtypes_pandas_backend():
+    # GH#50503: read_orc(use_nullable_dtypes=True) with the "pandas" dtype backend
+    df = pd.DataFrame(
+        {
+            "string": list("abc"),
+            "string_with_nan": ["a", np.nan, "c"],
+            "string_with_none": ["a", None, "c"],
+            "int": list(range(1, 4)),
+            "int_with_nan": pd.Series([1, pd.NA, 3], dtype="Int64"),
+            "na_only": pd.Series([pd.NA, pd.NA, pd.NA], dtype="Int64"),
+            "float": np.arange(4.0, 7.0, dtype="float64"),
+            "float_with_nan": [2.0, np.nan, 3.0],
+            "bool": [True, False, True],
+            "bool_with_na": [True, False, None],
+        }
+    )
+
+    bytes_data = df.copy().to_orc()  # ORC payload, wrapped in BytesIO below
+    with pd.option_context("mode.dtype_backend", "pandas"):
+        result = read_orc(BytesIO(bytes_data), use_nullable_dtypes=True)
+
+    expected = pd.DataFrame(  # each column built with a nullable extension dtype
+        {
+            "string": StringArray(np.array(["a", "b", "c"], dtype=np.object_)),
+            "string_with_nan": StringArray(
+                np.array(["a", pd.NA, "c"], dtype=np.object_)
+            ),
+            "string_with_none": StringArray(
+                np.array(["a", pd.NA, "c"], dtype=np.object_)
+            ),
+            "int": pd.Series([1, 2, 3], dtype="Int64"),
+            "int_with_nan": pd.Series([1, pd.NA, 3], dtype="Int64"),
+            "na_only": pd.Series([pd.NA, pd.NA, pd.NA], dtype="Int64"),
+            "float": pd.Series([4.0, 5.0, 6.0], dtype="Float64"),
+            "float_with_nan": pd.Series([2.0, pd.NA, 3.0], dtype="Float64"),
+            "bool": pd.Series([True, False, True], dtype="boolean"),
+            "bool_with_na": pd.Series([True, False, pd.NA], dtype="boolean"),
+        }
+    )
+
     tm.assert_frame_equal(result, expected)