From fe849207cd177334f0d5533d163915b1ffe97eff Mon Sep 17 00:00:00 2001 From: NickFillot <40593450+NickFillot@users.noreply.github.com> Date: Sun, 3 Oct 2021 16:23:02 +0200 Subject: [PATCH 1/3] [ENH] to_orc pandas.io.orc.to_orc method definition --- pandas/io/orc.py | 83 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 81 insertions(+), 2 deletions(-) diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 6bdb4df806b5c..8900f30f6a440 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -1,15 +1,19 @@ """ orc compat """ from __future__ import annotations +import os from typing import TYPE_CHECKING +from tempfile import gettempdir from pandas._typing import FilePathOrBuffer from pandas.compat._optional import import_optional_dependency from pandas.io.common import get_handle -if TYPE_CHECKING: - from pandas import DataFrame +from pandas.core import generic +from pandas.util._decorators import doc + +from pandas import DataFrame def read_orc( @@ -55,3 +59,78 @@ def read_orc( with get_handle(path, "rb", is_text=False) as handles: orc_file = orc.ORCFile(handles.handle) return orc_file.read(columns=columns, **kwargs).to_pandas() + + +def to_orc( + df: DataFrame, + path: FilePathOrBuffer = None, + engine: str = 'pyarrow', + index: bool = None, + **kwargs +) -> bytes: + """ + Write a DataFrame to the orc/arrow format. + Parameters + ---------- + df : DataFrame + path : str or file-like object, default None + If a string, it will be used as Root Directory path + when writing a partitioned dataset. By file-like object, + we refer to objects with a write() method, such as a file handle + (e.g. via builtin open function) or io.BytesIO. The engine + fastparquet does not accept file-like objects. If path is None, + a bytes object is returned. 
+ engine : {'pyarrow'}, default 'pyarrow' + ORC library to use, or the library itself, checked with 'pyarrow' name + and version > 4.0.0 + index : bool, default None + If ``True``, include the dataframe's index(es) in the file output. If + ``False``, they will not be written to the file. + If ``None``, similar to ``infer`` the dataframe's index(es) + will be saved. However, instead of being saved as values, + the RangeIndex will be stored as a range in the metadata so it + doesn't require much space and is faster. Other indexes will + be included as columns in the file output. + kwargs + Additional keyword arguments passed to the engine + Returns + ------- + bytes if no path argument is provided else None + """ + if index is None: + index = df.index.names[0] is not None + + if isinstance(engine, str): + engine = import_optional_dependency(engine, min_version='4.0.0') + else: + try: + assert engine.__name__ == 'pyarrow', "engine must be 'pyarrow' module" + assert hasattr(engine, 'orc'), "'pyarrow' module must have version > 4.0.0 with orc module" + except Exception as e: + raise ValueError("Wrong engine passed, %s" % ( + e, + )) + + if path is None: + # to bytes: tmp path, pyarrow auto closes buffers + path = os.path.join(gettempdir(), os.urandom(12).hex()) + try: + engine.orc.write_table( + engine.Table.from_pandas(df, preserve_index=index), + path, **kwargs + ) + with open(path, 'rb') as path: + return path.read() + except BaseException as e: + raise e + finally: + try: + os.remove(path) + except Exception as e: + pass + else: + engine.orc.write_table( + engine.Table.from_pandas(df, preserve_index=index), + path, **kwargs + ) + return From 6cc7030cb23511aeaf803a69fde89040e9fc6ae4 Mon Sep 17 00:00:00 2001 From: NickFillot <40593450+NickFillot@users.noreply.github.com> Date: Sun, 3 Oct 2021 16:34:37 +0200 Subject: [PATCH 2/3] pandas.DataFrame.to_orc set to_orc to pandas.DataFrame --- pandas/core/frame.py | 74 ++++++++++++++++++++++++++++++++++++++++++++ 1 file 
changed, 74 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index aad7213c93a1d..e52ef00c348d6 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2734,7 +2734,81 @@ def to_parquet( storage_options=storage_options, **kwargs, ) + + def to_orc( + self, + path: FilePathOrBuffer = None, + engine: str = 'pyarrow', + index: bool = None, + **kwargs + ) -> bytes: + """ + Write a DataFrame to the orc/arrow format. + Parameters + ---------- + df : DataFrame + path : str or file-like object, default None + If a string, it will be used as Root Directory path + when writing a partitioned dataset. By file-like object, + we refer to objects with a write() method, such as a file handle + (e.g. via builtin open function) or io.BytesIO. The engine + fastparquet does not accept file-like objects. If path is None, + a bytes object is returned. + engine : {'pyarrow'}, default 'pyarrow' + ORC library to use, or the library itself, checked with 'pyarrow' name + and version > 4.0.0 + index : bool, default None + If ``True``, include the dataframe's index(es) in the file output. If + ``False``, they will not be written to the file. + If ``None``, similar to ``infer`` the dataframe's index(es) + will be saved. However, instead of being saved as values, + the RangeIndex will be stored as a range in the metadata so it + doesn't require much space and is faster. Other indexes will + be included as columns in the file output. + kwargs + Additional keyword arguments passed to the engine + Returns + ------- + bytes if no path argument is provided else None + See Also + -------- + read_orc : Read an ORC file. + DataFrame.to_parquet : Write a parquet file. + DataFrame.to_csv : Write a csv file. + DataFrame.to_sql : Write to a sql table. + DataFrame.to_hdf : Write to hdf. + + Notes + ----- + This function requires the `pyarrow <https://arrow.apache.org/docs/python/>`_ library. 
+ + Examples + -------- + >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [3, 4]}) + >>> df.to_orc('df.orc', compression='gzip') # doctest: +SKIP + >>> pd.read_orc('df.orc') # doctest: +SKIP + col1 col2 + 0 1 3 + 1 2 4 + + If you want to get a buffer to the orc content you can write it to io.BytesIO + >>> import io + >>> b = io.BytesIO(df.to_orc()) + >>> b.seek(0) + 0 + >>> content = b.read() + """ + from pandas.io.orc import to_orc + + return to_orc( + self, + path, + engine, + index=index, + **kwargs + ) + + @Substitution( header_type="bool", header="Whether to print column labels, default True", From 2d1515eb1ca49eb08539add28db2467a75911905 Mon Sep 17 00:00:00 2001 From: NickFillot <40593450+NickFillot@users.noreply.github.com> Date: Sun, 3 Oct 2021 16:47:11 +0200 Subject: [PATCH 3/3] Cleaning --- pandas/io/orc.py | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 8900f30f6a440..d444d38aa2486 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -2,6 +2,8 @@ from __future__ import annotations import os +import pandas._testing as tm + from typing import TYPE_CHECKING from tempfile import gettempdir @@ -10,10 +12,8 @@ from pandas.io.common import get_handle -from pandas.core import generic -from pandas.util._decorators import doc - -from pandas import DataFrame +if TYPE_CHECKING: + from pandas import DataFrame def read_orc( @@ -105,29 +105,19 @@ def to_orc( else: try: assert engine.__name__ == 'pyarrow', "engine must be 'pyarrow' module" - assert hasattr(engine, 'orc'), "'pyarrow' module must have version > 4.0.0 with orc module" + assert hasattr(engine, 'orc'), "'pyarrow' module must have orc module" except Exception as e: - raise ValueError("Wrong engine passed, %s" % ( - e, - )) + raise ValueError("Wrong engine passed, %s" % e) if path is None: # to bytes: tmp path, pyarrow auto closes buffers - path = os.path.join(gettempdir(), os.urandom(12).hex()) - try: + with 
tm.ensure_clean(os.path.join(gettempdir(), os.urandom(12).hex())) as path: engine.orc.write_table( engine.Table.from_pandas(df, preserve_index=index), path, **kwargs ) with open(path, 'rb') as path: return path.read() - except BaseException as e: - raise e - finally: - try: - os.remove(path) - except Exception as e: - pass else: engine.orc.write_table( engine.Table.from_pandas(df, preserve_index=index),