-
-
Notifications
You must be signed in to change notification settings - Fork 18.6k
[EHN] pandas.DataFrame.to_orc #44554
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
9a7b29a
d11026f
0146ac3
0571602
d970b58
8b12e9f
65e6b7a
2114616
e4b40ef
a7aa3e0
1ab9b6c
96969d5
2a54b8c
1caec9e
6f0a538
ae65214
045c411
c00ed0f
fe275d7
9d3e0df
971f31c
52b68a0
76437ba
c5d5852
b5cd022
7ad3df9
a73bb70
20aefe7
e7e81fe
6b659f7
18e5429
21cba6e
c7bf39f
e43c6dd
afa0a8a
cd585e6
b509c3c
1001002
55cab6e
89283e0
989468a
a7fca36
7fc338c
91d1556
a28c5a8
162e5bb
b230583
e16edab
e4770b8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -159,6 +159,7 @@ ORC | |
:toctree: api/ | ||
|
||
read_orc | ||
DataFrame.to_orc | ||
|
||
SAS | ||
~~~ | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,14 +1,28 @@ | ||
""" orc compat """ | ||
from __future__ import annotations | ||
|
||
from typing import TYPE_CHECKING | ||
import io | ||
from types import ModuleType | ||
from typing import ( | ||
TYPE_CHECKING, | ||
Any, | ||
Literal, | ||
) | ||
|
||
from pandas._typing import ( | ||
FilePath, | ||
ReadBuffer, | ||
WriteBuffer, | ||
) | ||
from pandas.compat._optional import import_optional_dependency | ||
|
||
from pandas.core.dtypes.common import ( | ||
is_categorical_dtype, | ||
is_interval_dtype, | ||
is_period_dtype, | ||
is_unsigned_integer_dtype, | ||
) | ||
|
||
from pandas.io.common import get_handle | ||
|
||
if TYPE_CHECKING: | ||
|
@@ -52,3 +66,111 @@ def read_orc( | |
with get_handle(path, "rb", is_text=False) as handles: | ||
orc_file = orc.ORCFile(handles.handle) | ||
return orc_file.read(columns=columns, **kwargs).to_pandas() | ||
|
||
|
||
def to_orc(
    df: DataFrame,
    path: FilePath | WriteBuffer[bytes] | None = None,
    *,
    engine: Literal["pyarrow"] = "pyarrow",
    index: bool | None = None,
    engine_kwargs: dict[str, Any] | None = None,
) -> bytes | None:
    """
    Write a DataFrame to the ORC format.

    .. versionadded:: 1.5.0

    Parameters
    ----------
    df : DataFrame
        The dataframe to be written to ORC. Raises NotImplementedError
        if dtype of one or more columns is category, unsigned integers,
        intervals, periods or sparse.
    path : str, file-like object or None, default None
        Destination of the ORC data. A string is treated as a file path
        and opened for binary writing. By file-like object, we refer to
        objects with a write() method, such as a file handle
        (e.g. via builtin open function). If path is None,
        the data is written to an in-memory buffer and returned as a
        bytes object.
    engine : str, default 'pyarrow'
        ORC library to use. Pyarrow must be >= 7.0.0.
    index : bool, optional
        If ``True``, include the dataframe's index(es) in the file output. If
        ``False``, they will not be written to the file.
        If ``None``, the index is written only when its first level is
        named (``df.index.names[0] is not None``); an unnamed index is
        dropped.
    engine_kwargs : dict[str, Any] or None, default None
        Additional keyword arguments passed to :func:`pyarrow.orc.write_table`.

    Returns
    -------
    bytes if no path argument is provided else None

    Raises
    ------
    NotImplementedError
        Dtype of one or more columns is category, unsigned integers, interval,
        period or sparse.
    ValueError
        engine is not pyarrow.

    Notes
    -----
    * Before using this function you should read the
      :ref:`user guide about ORC <io.orc>` and
      :ref:`install optional dependencies <install.warn_orc>`.
    * This function requires `pyarrow <https://arrow.apache.org/docs/python/>`_
      library.
    * For supported dtypes please refer to `supported ORC features in Arrow
      <https://arrow.apache.org/docs/cpp/orc.html#data-types>`__.
    * Currently timezones in datetime columns are not preserved when a
      dataframe is converted into ORC files.
    """
    # Reject a bad ``engine`` before doing any work on the frame so the
    # caller always sees the argument error first.
    if engine != "pyarrow":
        raise ValueError("engine must be 'pyarrow'")

    if index is None:
        # Only a named index is considered worth persisting by default.
        index = df.index.names[0] is not None
    if engine_kwargs is None:
        engine_kwargs = {}

    # If unsupported dtypes are found raise NotImplementedError.
    # Per the review discussion on this change, pyarrow segfaults (rather
    # than raising) on these dtypes, so they must be filtered out here
    # before pyarrow is invoked. In Pyarrow 9.0.0 this check will no longer
    # be needed.
    if any(
        is_categorical_dtype(dtype)
        or is_interval_dtype(dtype)
        or is_period_dtype(dtype)
        or is_unsigned_integer_dtype(dtype)
        for dtype in df.dtypes
    ):
        raise NotImplementedError(
            "The dtype of one or more columns is not supported yet."
        )

    # Bind the imported modules to fresh locals instead of rebinding the
    # ``engine`` parameter, whose declared type is a string literal.
    pa = import_optional_dependency(engine, min_version="7.0.0")
    orc = import_optional_dependency("pyarrow.orc")

    return_bytes = path is None
    if return_bytes:
        path = io.BytesIO()
    assert path is not None  # For mypy
    with get_handle(path, "wb", is_text=False) as handles:
        try:
            orc.write_table(
                pa.Table.from_pandas(df, preserve_index=index),
                handles.handle,
                **engine_kwargs,
            )
        except TypeError as e:
            # Dtypes not caught by the pre-check above (sparse is documented
            # as unsupported) are reported the same way as the others.
            raise NotImplementedError(
                "The dtype of one or more columns is not supported yet."
            ) from e

    if return_bytes:
        assert isinstance(path, io.BytesIO)  # For mypy
        return path.getvalue()
    return None
Uh oh!
There was an error while loading. Please reload this page.