From 28090cf9dee13ee19011460be95ffd4d13d441d9 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Fri, 30 Dec 2022 15:27:49 +0100
Subject: [PATCH 1/5] ENH: Add pandas nullable support to read_orc

---
 doc/source/whatsnew/v2.0.0.rst |  1 +
 pandas/io/_util.py             | 20 +++++++++++++++++
 pandas/io/orc.py               | 26 ++++++++++++---------
 pandas/io/parquet.py           | 19 ++++------------
 pandas/tests/io/test_orc.py    | 41 ++++++++++++++++++++++++++++++++++
 5 files changed, 81 insertions(+), 26 deletions(-)
 create mode 100644 pandas/io/_util.py

diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index b1387e9717079..a54e6706fa286 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -42,6 +42,7 @@ The ``use_nullable_dtypes`` keyword argument has been expanded to the following
 * :func:`read_sql`
 * :func:`read_sql_query`
 * :func:`read_sql_table`
+* :func:`read_orc`
 
 Additionally a new global configuration, ``mode.dtype_backend`` can now be used in conjunction with the parameter ``use_nullable_dtypes=True`` in the following functions
 to select the nullable dtypes implementation.
diff --git a/pandas/io/_util.py b/pandas/io/_util.py
new file mode 100644
index 0000000000000..14990d3cdced7
--- /dev/null
+++ b/pandas/io/_util.py
@@ -0,0 +1,20 @@
+from __future__ import annotations
+
+import pandas as pd
+
+
+def _arrow_dtype_mapping(api) -> dict:
+    return {
+        api.int8(): pd.Int8Dtype(),
+        api.int16(): pd.Int16Dtype(),
+        api.int32(): pd.Int32Dtype(),
+        api.int64(): pd.Int64Dtype(),
+        api.uint8(): pd.UInt8Dtype(),
+        api.uint16(): pd.UInt16Dtype(),
+        api.uint32(): pd.UInt32Dtype(),
+        api.uint64(): pd.UInt64Dtype(),
+        api.bool_(): pd.BooleanDtype(),
+        api.string(): pd.StringDtype(),
+        api.float32(): pd.Float32Dtype(),
+        api.float64(): pd.Float64Dtype(),
+    }
diff --git a/pandas/io/orc.py b/pandas/io/orc.py
index cfa02de9bbcb3..8b7416b429c0a 100644
--- a/pandas/io/orc.py
+++ b/pandas/io/orc.py
@@ -91,18 +91,22 @@ def read_orc(
         pa_table = orc_file.read(columns=columns, **kwargs)
     if use_nullable_dtypes:
         dtype_backend = get_option("mode.dtype_backend")
-        if dtype_backend != "pyarrow":
-            raise NotImplementedError(
-                f"mode.dtype_backend set to {dtype_backend} is not implemented."
+        if dtype_backend == "pyarrow":
+            df = DataFrame(
+                {
+                    col_name: ArrowExtensionArray(pa_col)
+                    for col_name, pa_col in zip(
+                        pa_table.column_names, pa_table.itercolumns()
+                    )
+                }
             )
-        df = DataFrame(
-            {
-                col_name: ArrowExtensionArray(pa_col)
-                for col_name, pa_col in zip(
-                    pa_table.column_names, pa_table.itercolumns()
-                )
-            }
-        )
+        else:
+            from pandas.io._util import _arrow_dtype_mapping
+
+            pa = import_optional_dependency("pyarrow")
+            mapping = _arrow_dtype_mapping(pa)
+
+            df = pa_table.to_pandas(types_mapper=mapping.get)
         return df
     else:
         return pa_table.to_pandas()
diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index 568747685a36e..67b18f527505d 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -225,24 +225,13 @@ def read(
         dtype_backend = get_option("mode.dtype_backend")
         to_pandas_kwargs = {}
         if use_nullable_dtypes:
-            import pandas as pd
 
             if dtype_backend == "pandas":
-                mapping = {
-                    self.api.int8(): pd.Int8Dtype(),
-                    self.api.int16(): pd.Int16Dtype(),
-                    self.api.int32(): pd.Int32Dtype(),
-                    self.api.int64(): pd.Int64Dtype(),
-                    self.api.uint8(): pd.UInt8Dtype(),
-                    self.api.uint16(): pd.UInt16Dtype(),
-                    self.api.uint32(): pd.UInt32Dtype(),
-                    self.api.uint64(): pd.UInt64Dtype(),
-                    self.api.bool_(): pd.BooleanDtype(),
-                    self.api.string(): pd.StringDtype(),
-                    self.api.float32(): pd.Float32Dtype(),
-                    self.api.float64(): pd.Float64Dtype(),
-                }
+                from pandas.io._util import _arrow_dtype_mapping
+
+                mapping = _arrow_dtype_mapping(self.api)
                 to_pandas_kwargs["types_mapper"] = mapping.get
+
         manager = get_option("mode.data_manager")
         if manager == "array":
             to_pandas_kwargs["split_blocks"] = True  # type: ignore[assignment]
diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py
index 87f648bb5acd6..8c1a2a347daba 100644
--- a/pandas/tests/io/test_orc.py
+++ b/pandas/tests/io/test_orc.py
@@ -11,6 +11,7 @@
 import pandas as pd
 from pandas import read_orc
 import pandas._testing as tm
+from pandas.core.arrays import StringArray
 
 pytest.importorskip("pyarrow.orc")
 
@@ -346,3 +347,43 @@ def test_orc_use_nullable_dtypes_pyarrow_backend():
         }
     )
     tm.assert_frame_equal(result, expected)
+
+
+@td.skip_if_no("pyarrow", min_version="7.0.0")
+def test_orc_use_nullable_dtypes_pandas_backend():
+    df = pd.DataFrame(
+        {
+            "string": list("abc"),
+            "string_with_nan": ["a", np.nan, "c"],
+            "string_with_none": ["a", None, "c"],
+            "int": list(range(1, 4)),
+            "int_with_nan": pd.Series([1, pd.NA, 3], dtype="Int64"),
+            "na_only": pd.Series([pd.NA, pd.NA, pd.NA], dtype="Int64"),
+            "float": np.arange(4.0, 7.0, dtype="float64"),
+            "float_with_nan": [2.0, np.nan, 3.0],
+            "bool": [True, False, True],
+            "bool_with_na": [True, False, None],
+        }
+    )
+    bytes_data = df.copy().to_orc()
+    with pd.option_context("mode.dtype_backend", "pandas"):
+        result = read_orc(BytesIO(bytes_data), use_nullable_dtypes=True)
+    expected = pd.DataFrame(
+        {
+            "string": StringArray(np.array(["a", "b", "c"], dtype=np.object_)),
+            "string_with_nan": StringArray(
+                np.array(["a", pd.NA, "c"], dtype=np.object_)
+            ),
+            "string_with_none": StringArray(
+                np.array(["a", pd.NA, "c"], dtype=np.object_)
+            ),
+            "int": pd.Series([1, 2, 3], dtype="Int64"),
+            "int_with_nan": pd.Series([1, pd.NA, 3], dtype="Int64"),
+            "na_only": pd.Series([pd.NA, pd.NA, pd.NA], dtype="Int64"),
+            "float": pd.Series([4.0, 5.0, 6.0], dtype="Float64"),
+            "float_with_nan": pd.Series([2.0, pd.NA, 3.0], dtype="Float64"),
+            "bool": pd.Series([True, False, True], dtype="boolean"),
+            "bool_with_na": pd.Series([True, False, pd.NA], dtype="boolean"),
+        }
+    )
+    tm.assert_frame_equal(result, expected)

From 3f789277d3dce7dce34d250d94f401514d2b3e37 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Fri, 30 Dec 2022 15:28:29 +0100
Subject: [PATCH 2/5] TST: Add GH#50503 reference to read_orc pandas-backend test

---
 pandas/tests/io/test_orc.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py
index 8c1a2a347daba..0934d12f9c48d 100644
--- a/pandas/tests/io/test_orc.py
+++ b/pandas/tests/io/test_orc.py
@@ -351,6 +351,7 @@ def test_orc_use_nullable_dtypes_pyarrow_backend():
 
 @td.skip_if_no("pyarrow", min_version="7.0.0")
 def test_orc_use_nullable_dtypes_pandas_backend():
+    # GH#50503
     df = pd.DataFrame(
         {
             "string": list("abc"),

From 41cc25a9012ef99447ecf22bf26d0eb12a7c7401 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Fri, 30 Dec 2022 16:11:45 +0100
Subject: [PATCH 3/5] TST: Remove read_orc test expecting NotImplementedError for pandas backend

---
 pandas/tests/io/test_orc.py | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py
index 0934d12f9c48d..52f2b52f617ee 100644
--- a/pandas/tests/io/test_orc.py
+++ b/pandas/tests/io/test_orc.py
@@ -306,16 +306,6 @@ def test_orc_writer_dtypes_not_supported(df_not_supported):
         df_not_supported.to_orc()
 
 
-def test_orc_use_nullable_dtypes_pandas_backend_not_supported(dirpath):
-    input_file = os.path.join(dirpath, "TestOrcFile.emptyFile.orc")
-    with pytest.raises(
-        NotImplementedError,
-        match="mode.dtype_backend set to pandas is not implemented.",
-    ):
-        with pd.option_context("mode.dtype_backend", "pandas"):
-            read_orc(input_file, use_nullable_dtypes=True)
-
-
 @td.skip_if_no("pyarrow", min_version="7.0.0")
 def test_orc_use_nullable_dtypes_pyarrow_backend():
     df = pd.DataFrame(

From 7e06342af67362a15d5e8c2fa20135fe5888bd45 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Tue, 3 Jan 2023 22:43:23 +0100
Subject: [PATCH 4/5] TST: Add blank lines between phases of read_orc nullable dtype tests

---
 pandas/tests/io/test_orc.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py
index 52f2b52f617ee..d5c03dcc85a0d 100644
--- a/pandas/tests/io/test_orc.py
+++ b/pandas/tests/io/test_orc.py
@@ -327,15 +327,18 @@ def test_orc_use_nullable_dtypes_pyarrow_backend():
             ],
         }
     )
+
     bytes_data = df.copy().to_orc()
     with pd.option_context("mode.dtype_backend", "pyarrow"):
         result = read_orc(BytesIO(bytes_data), use_nullable_dtypes=True)
+
     expected = pd.DataFrame(
         {
             col: pd.arrays.ArrowExtensionArray(pa.array(df[col], from_pandas=True))
             for col in df.columns
         }
     )
+
     tm.assert_frame_equal(result, expected)
 
 
@@ -356,9 +359,11 @@ def test_orc_use_nullable_dtypes_pandas_backend():
             "bool_with_na": [True, False, None],
         }
     )
+
     bytes_data = df.copy().to_orc()
     with pd.option_context("mode.dtype_backend", "pandas"):
         result = read_orc(BytesIO(bytes_data), use_nullable_dtypes=True)
+
     expected = pd.DataFrame(
         {
             "string": StringArray(np.array(["a", "b", "c"], dtype=np.object_)),
@@ -377,4 +382,5 @@ def test_orc_use_nullable_dtypes_pandas_backend():
             "bool_with_na": pd.Series([True, False, pd.NA], dtype="boolean"),
         }
     )
+
     tm.assert_frame_equal(result, expected)

From c7df8a7ebdcd4b141aa1a27fea6833e170e62f7b Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Wed, 4 Jan 2023 21:27:34 +0100
Subject: [PATCH 5/5] REF: Move pyarrow import into _arrow_dtype_mapping

---
 pandas/io/_util.py   | 29 ++++++++++++++++-------------
 pandas/io/orc.py     |  4 +---
 pandas/io/parquet.py |  2 +-
 3 files changed, 18 insertions(+), 17 deletions(-)

diff --git a/pandas/io/_util.py b/pandas/io/_util.py
index 14990d3cdced7..d2a001f0cf925 100644
--- a/pandas/io/_util.py
+++ b/pandas/io/_util.py
@@ -1,20 +1,23 @@
 from __future__ import annotations
 
+from pandas.compat._optional import import_optional_dependency
+
 import pandas as pd
 
 
-def _arrow_dtype_mapping(api) -> dict:
+def _arrow_dtype_mapping() -> dict:
+    pa = import_optional_dependency("pyarrow")
     return {
-        api.int8(): pd.Int8Dtype(),
-        api.int16(): pd.Int16Dtype(),
-        api.int32(): pd.Int32Dtype(),
-        api.int64(): pd.Int64Dtype(),
-        api.uint8(): pd.UInt8Dtype(),
-        api.uint16(): pd.UInt16Dtype(),
-        api.uint32(): pd.UInt32Dtype(),
-        api.uint64(): pd.UInt64Dtype(),
-        api.bool_(): pd.BooleanDtype(),
-        api.string(): pd.StringDtype(),
-        api.float32(): pd.Float32Dtype(),
-        api.float64(): pd.Float64Dtype(),
+        pa.int8(): pd.Int8Dtype(),
+        pa.int16(): pd.Int16Dtype(),
+        pa.int32(): pd.Int32Dtype(),
+        pa.int64(): pd.Int64Dtype(),
+        pa.uint8(): pd.UInt8Dtype(),
+        pa.uint16(): pd.UInt16Dtype(),
+        pa.uint32(): pd.UInt32Dtype(),
+        pa.uint64(): pd.UInt64Dtype(),
+        pa.bool_(): pd.BooleanDtype(),
+        pa.string(): pd.StringDtype(),
+        pa.float32(): pd.Float32Dtype(),
+        pa.float64(): pd.Float64Dtype(),
     }
diff --git a/pandas/io/orc.py b/pandas/io/orc.py
index 8b7416b429c0a..169cb5d16da8d 100644
--- a/pandas/io/orc.py
+++ b/pandas/io/orc.py
@@ -103,9 +103,7 @@ def read_orc(
         else:
             from pandas.io._util import _arrow_dtype_mapping
 
-            pa = import_optional_dependency("pyarrow")
-            mapping = _arrow_dtype_mapping(pa)
-
+            mapping = _arrow_dtype_mapping()
             df = pa_table.to_pandas(types_mapper=mapping.get)
         return df
     else:
diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index 67b18f527505d..67e00dde5498b 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -229,7 +229,7 @@ def read(
             if dtype_backend == "pandas":
                 from pandas.io._util import _arrow_dtype_mapping
 
-                mapping = _arrow_dtype_mapping(self.api)
+                mapping = _arrow_dtype_mapping()
                 to_pandas_kwargs["types_mapper"] = mapping.get
 
         manager = get_option("mode.data_manager")