Skip to content

Commit 21a071f

Browse files
kkraus14 authored and jreback committed
add orc reader
1 parent 797732a commit 21a071f

11 files changed

+423
-0
lines changed

pandas/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,7 @@
165165
# misc
166166
read_clipboard,
167167
read_parquet,
168+
read_orc,
168169
read_feather,
169170
read_gbq,
170171
read_html,

pandas/core/config_init.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -568,6 +568,22 @@ def use_inf_as_na_cb(key):
568568
validator=is_one_of_factory(["auto", "pyarrow", "fastparquet"]),
569569
)
570570

571+
572+
# Set up the io.orc specific configuration.
573+
orc_engine_doc = """
: string
    The default orc reader/writer engine. Available options:
    'auto', 'pyarrow', the default is 'auto'
"""

# Register the ``engine`` option under the ``io.orc`` prefix with a default
# of "auto"; values are checked against the list handed to is_one_of_factory.
with cf.config_prefix("io.orc"):
    cf.register_option(
        "engine", "auto", orc_engine_doc,
        validator=is_one_of_factory(["auto", "pyarrow"]),
    )
586+
571587
# --------
572588
# Plotting
573589
# ---------

pandas/io/api.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from pandas.io.gbq import read_gbq
1111
from pandas.io.html import read_html
1212
from pandas.io.json import read_json
13+
from pandas.io.orc import read_orc
1314
from pandas.io.packers import read_msgpack, to_msgpack
1415
from pandas.io.parquet import read_parquet
1516
from pandas.io.parsers import read_csv, read_fwf, read_table

pandas/io/orc.py

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
""" orc compat """
2+
3+
from warnings import catch_warnings
4+
5+
from pandas.compat._optional import import_optional_dependency
6+
from pandas.errors import AbstractMethodError
7+
8+
from pandas import DataFrame, get_option
9+
10+
from pandas.io.common import get_filepath_or_buffer, is_gcs_url, is_s3_url
11+
12+
13+
def get_engine(engine):
    """
    Resolve an engine name to an ORC implementation object.

    Parameters
    ----------
    engine : str
        ``"auto"`` or ``"pyarrow"``. ``"auto"`` is first replaced by the
        value of the ``io.orc.engine`` option; if that value is also
        ``"auto"``, pyarrow is attempted.

    Returns
    -------
    PyArrowImpl

    Raises
    ------
    ImportError
        If the engine is still ``"auto"`` after consulting the option and
        constructing ``PyArrowImpl`` raises ``ImportError``.
    ValueError
        If the resolved engine is anything other than ``"pyarrow"``.
    """
    # Defer to the configured option only when the caller asked for "auto".
    resolved = get_option("io.orc.engine") if engine == "auto" else engine

    if resolved == "auto":
        # pyarrow is the only backend to probe.
        try:
            return PyArrowImpl()
        except ImportError:
            pass

        raise ImportError(
            "Unable to find a usable engine; "
            "tried using: 'pyarrow'.\n"
            "pyarrow is required for orc "
            "support"
        )

    if resolved != "pyarrow":
        raise ValueError("engine must be 'pyarrow'")

    return PyArrowImpl()
38+
39+
40+
class BaseImpl:
    """
    Base class for ORC engine implementations.

    Subclasses override ``read`` (and ``write``); the versions here raise
    ``AbstractMethodError``.
    """

    # Engine module; None here, assigned by a concrete implementation.
    api = None

    @staticmethod
    def validate_dataframe(df):
        """
        Check that ``df`` is a DataFrame with string column and index names.

        Raises
        ------
        ValueError
            If ``df`` is not a DataFrame, if the inferred type of its columns
            is not one of "string", "unicode" or "empty", or if any non-None
            index level name is not a ``str``.
        """
        if not isinstance(df, DataFrame):
            raise ValueError("to_orc only supports IO with DataFrames")

        # Column labels must all be strings (an empty column set is allowed).
        column_kind = df.columns.inferred_type
        if column_kind not in {"string", "unicode", "empty"}:
            raise ValueError("ORC must have string column names")

        # Unnamed index levels (None) are fine; named ones must be strings.
        for level_name in df.index.names:
            if level_name is not None and not isinstance(level_name, str):
                raise ValueError("Index level names must be strings")

    def write(self, df, path, compression, **kwargs):
        """Write ``df`` to ``path``; must be overridden by a subclass."""
        raise AbstractMethodError(self)

    def read(self, path, columns=None, **kwargs):
        """Read ``path`` into a DataFrame; must be overridden by a subclass."""
        raise AbstractMethodError(self)
66+
67+
68+
class PyArrowImpl(BaseImpl):
    """ORC engine implementation that delegates to ``pyarrow``."""

    def __init__(self):
        # Called for its side effect: surfaces pandas' optional-dependency
        # error (with the extra hint) when pyarrow is unavailable.
        import_optional_dependency(
            "pyarrow", extra="pyarrow is required for orc support."
        )
        # Importing the submodule makes ``pyarrow.orc`` reachable through
        # the package object and binds ``pyarrow`` locally.
        import pyarrow.orc

        self.api = pyarrow

    def read(self, path, columns=None, **kwargs):
        """
        Read an ORC file into a DataFrame.

        Parameters
        ----------
        path : object
            Passed to ``get_filepath_or_buffer``; the first element of its
            result is opened with ``pyarrow.input_stream``.
        columns : list, default None
            Forwarded to ``ORCFile.read``.
        **kwargs
            Forwarded to ``ORCFile.read``.

        Returns
        -------
        DataFrame
            The result of calling ``to_pandas()`` on the object returned by
            ``ORCFile.read``.
        """
        source, _encoding, _compression, _should_close = get_filepath_or_buffer(
            path
        )

        stream = self.api.input_stream(source)
        table = self.api.orc.ORCFile(stream).read(columns=columns, **kwargs)
        return table.to_pandas()
88+
89+
90+
def read_orc(path, engine="auto", columns=None, **kwargs):
    """
    Load an ORC object from the file path, returning a DataFrame.

    .. versionadded:: 1.0.0

    Parameters
    ----------
    path : str, path object or file-like object
        Any valid string path is acceptable. The string could be a URL. Valid
        URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be:
        ``file://localhost/path/to/table.orc``.

        If you want to pass in a path object, pandas accepts any
        ``os.PathLike``.

        By file-like object, we refer to objects with a ``read()`` method,
        such as a file handle opened in binary mode (e.g. via the builtin
        ``open`` function) or ``BytesIO``.
    engine : {'auto', 'pyarrow'}, default 'auto'
        ORC library to use. If 'auto', then the option ``io.orc.engine`` is
        used. The default ``io.orc.engine`` behavior is to try 'pyarrow'.
    columns : list, default=None
        If not None, only these columns will be read from the file.
    **kwargs
        Any additional kwargs are passed to the engine.

    Returns
    -------
    DataFrame

    Raises
    ------
    ImportError
        If ``engine`` resolves to 'auto' and pyarrow cannot be imported.
    ValueError
        If ``engine`` resolves to anything other than 'pyarrow'.
    """
    # Engine resolution (including the io.orc.engine option lookup) is
    # delegated to get_engine; the implementation object does the read.
    impl = get_engine(engine)
    return impl.read(path, columns=columns, **kwargs)
Binary file not shown.
523 Bytes
Binary file not shown.
1.67 KB
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.

0 commit comments

Comments
 (0)