Skip to content

Commit 222e37d

Browse files
authored
ENH: Add pandas nullable support to read_orc (#50503)
* ENH: Add pandas nullable support to read_orc * Add gh ref * Remove test * Reformat * Move import
1 parent 60dc3f5 commit 222e37d

File tree

5 files changed

+89
-36
lines changed

5 files changed

+89
-36
lines changed

doc/source/whatsnew/v2.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ The ``use_nullable_dtypes`` keyword argument has been expanded to the following
4343
* :func:`read_sql`
4444
* :func:`read_sql_query`
4545
* :func:`read_sql_table`
46+
* :func:`read_orc`
4647

4748
Additionally, a new global configuration, ``mode.dtype_backend``, can now be used in conjunction with the parameter ``use_nullable_dtypes=True`` in the following functions
4849
to select the nullable dtypes implementation.

pandas/io/_util.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
from __future__ import annotations
2+
3+
from pandas.compat._optional import import_optional_dependency
4+
5+
import pandas as pd
6+
7+
8+
def _arrow_dtype_mapping() -> dict:
    """
    Build the lookup table from pyarrow types to pandas nullable dtypes.

    pyarrow is obtained through ``import_optional_dependency`` when the
    function is called, not when this module is imported.

    Returns
    -------
    dict
        Maps pyarrow data type instances (signed and unsigned integers,
        bool, string, float32 and float64) to the matching pandas nullable
        extension dtype instance. Callers pass the bound ``.get`` of this
        mapping as ``types_mapper`` to ``pyarrow.Table.to_pandas``.
    """
    pyarrow = import_optional_dependency("pyarrow")

    # (pyarrow type factory, pandas dtype class) pairs; both sides are
    # instantiated below, in the order listed here.
    factory_pairs = (
        (pyarrow.int8, pd.Int8Dtype),
        (pyarrow.int16, pd.Int16Dtype),
        (pyarrow.int32, pd.Int32Dtype),
        (pyarrow.int64, pd.Int64Dtype),
        (pyarrow.uint8, pd.UInt8Dtype),
        (pyarrow.uint16, pd.UInt16Dtype),
        (pyarrow.uint32, pd.UInt32Dtype),
        (pyarrow.uint64, pd.UInt64Dtype),
        (pyarrow.bool_, pd.BooleanDtype),
        (pyarrow.string, pd.StringDtype),
        (pyarrow.float32, pd.Float32Dtype),
        (pyarrow.float64, pd.Float64Dtype),
    )
    return {
        make_arrow_type(): make_pandas_dtype()
        for make_arrow_type, make_pandas_dtype in factory_pairs
    }

pandas/io/orc.py

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -91,18 +91,20 @@ def read_orc(
9191
pa_table = orc_file.read(columns=columns, **kwargs)
9292
if use_nullable_dtypes:
9393
dtype_backend = get_option("mode.dtype_backend")
94-
if dtype_backend != "pyarrow":
95-
raise NotImplementedError(
96-
f"mode.dtype_backend set to {dtype_backend} is not implemented."
94+
if dtype_backend == "pyarrow":
95+
df = DataFrame(
96+
{
97+
col_name: ArrowExtensionArray(pa_col)
98+
for col_name, pa_col in zip(
99+
pa_table.column_names, pa_table.itercolumns()
100+
)
101+
}
97102
)
98-
df = DataFrame(
99-
{
100-
col_name: ArrowExtensionArray(pa_col)
101-
for col_name, pa_col in zip(
102-
pa_table.column_names, pa_table.itercolumns()
103-
)
104-
}
105-
)
103+
else:
104+
from pandas.io._util import _arrow_dtype_mapping
105+
106+
mapping = _arrow_dtype_mapping()
107+
df = pa_table.to_pandas(types_mapper=mapping.get)
106108
return df
107109
else:
108110
return pa_table.to_pandas()

pandas/io/parquet.py

Lines changed: 4 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -225,24 +225,13 @@ def read(
225225
dtype_backend = get_option("mode.dtype_backend")
226226
to_pandas_kwargs = {}
227227
if use_nullable_dtypes:
228-
import pandas as pd
229228

230229
if dtype_backend == "pandas":
231-
mapping = {
232-
self.api.int8(): pd.Int8Dtype(),
233-
self.api.int16(): pd.Int16Dtype(),
234-
self.api.int32(): pd.Int32Dtype(),
235-
self.api.int64(): pd.Int64Dtype(),
236-
self.api.uint8(): pd.UInt8Dtype(),
237-
self.api.uint16(): pd.UInt16Dtype(),
238-
self.api.uint32(): pd.UInt32Dtype(),
239-
self.api.uint64(): pd.UInt64Dtype(),
240-
self.api.bool_(): pd.BooleanDtype(),
241-
self.api.string(): pd.StringDtype(),
242-
self.api.float32(): pd.Float32Dtype(),
243-
self.api.float64(): pd.Float64Dtype(),
244-
}
230+
from pandas.io._util import _arrow_dtype_mapping
231+
232+
mapping = _arrow_dtype_mapping()
245233
to_pandas_kwargs["types_mapper"] = mapping.get
234+
246235
manager = get_option("mode.data_manager")
247236
if manager == "array":
248237
to_pandas_kwargs["split_blocks"] = True # type: ignore[assignment]

pandas/tests/io/test_orc.py

Lines changed: 48 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import pandas as pd
1212
from pandas import read_orc
1313
import pandas._testing as tm
14+
from pandas.core.arrays import StringArray
1415

1516
pytest.importorskip("pyarrow.orc")
1617

@@ -305,16 +306,6 @@ def test_orc_writer_dtypes_not_supported(df_not_supported):
305306
df_not_supported.to_orc()
306307

307308

308-
def test_orc_use_nullable_dtypes_pandas_backend_not_supported(dirpath):
309-
input_file = os.path.join(dirpath, "TestOrcFile.emptyFile.orc")
310-
with pytest.raises(
311-
NotImplementedError,
312-
match="mode.dtype_backend set to pandas is not implemented.",
313-
):
314-
with pd.option_context("mode.dtype_backend", "pandas"):
315-
read_orc(input_file, use_nullable_dtypes=True)
316-
317-
318309
@td.skip_if_no("pyarrow", min_version="7.0.0")
319310
def test_orc_use_nullable_dtypes_pyarrow_backend():
320311
df = pd.DataFrame(
@@ -336,13 +327,60 @@ def test_orc_use_nullable_dtypes_pyarrow_backend():
336327
],
337328
}
338329
)
330+
339331
bytes_data = df.copy().to_orc()
340332
with pd.option_context("mode.dtype_backend", "pyarrow"):
341333
result = read_orc(BytesIO(bytes_data), use_nullable_dtypes=True)
334+
342335
expected = pd.DataFrame(
343336
{
344337
col: pd.arrays.ArrowExtensionArray(pa.array(df[col], from_pandas=True))
345338
for col in df.columns
346339
}
347340
)
341+
342+
tm.assert_frame_equal(result, expected)
343+
344+
345+
@td.skip_if_no("pyarrow", min_version="7.0.0")
def test_orc_use_nullable_dtypes_pandas_backend():
    """
    Round-trip a frame through ORC and read it back with nullable dtypes.

    With ``mode.dtype_backend`` set to ``"pandas"`` and
    ``use_nullable_dtypes=True``, ``read_orc`` is expected to return
    string, ``Int64``, ``Float64`` and ``boolean`` extension columns, with
    missing entries represented as ``pd.NA``.
    """
    # GH#50503

    def _string_column(values):
        # StringArray is built from an object-dtype ndarray.
        return StringArray(np.array(values, dtype=np.object_))

    source_columns = {
        "string": list("abc"),
        "string_with_nan": ["a", np.nan, "c"],
        "string_with_none": ["a", None, "c"],
        "int": list(range(1, 4)),
        "int_with_nan": pd.Series([1, pd.NA, 3], dtype="Int64"),
        "na_only": pd.Series([pd.NA, pd.NA, pd.NA], dtype="Int64"),
        "float": np.arange(4.0, 7.0, dtype="float64"),
        "float_with_nan": [2.0, np.nan, 3.0],
        "bool": [True, False, True],
        "bool_with_na": [True, False, None],
    }
    source = pd.DataFrame(source_columns)

    # Serialize a copy to ORC bytes, then read those bytes back under the
    # "pandas" dtype backend.
    orc_payload = source.copy().to_orc()
    with pd.option_context("mode.dtype_backend", "pandas"):
        result = read_orc(BytesIO(orc_payload), use_nullable_dtypes=True)

    expected_columns = {
        "string": _string_column(["a", "b", "c"]),
        "string_with_nan": _string_column(["a", pd.NA, "c"]),
        "string_with_none": _string_column(["a", pd.NA, "c"]),
        "int": pd.Series([1, 2, 3], dtype="Int64"),
        "int_with_nan": pd.Series([1, pd.NA, 3], dtype="Int64"),
        "na_only": pd.Series([pd.NA, pd.NA, pd.NA], dtype="Int64"),
        "float": pd.Series([4.0, 5.0, 6.0], dtype="Float64"),
        "float_with_nan": pd.Series([2.0, pd.NA, 3.0], dtype="Float64"),
        "bool": pd.Series([True, False, True], dtype="boolean"),
        "bool_with_na": pd.Series([True, False, pd.NA], dtype="boolean"),
    }
    expected = pd.DataFrame(expected_columns)

    tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)