diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 7e8403c94ceef..933709ce2cde8 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -183,7 +183,7 @@ MultiIndex I/O ^^^ -- +- :meth:`DataFrame.to_orc` now raises ``ValueError`` when a non-default :class:`Index` is given (:issue:`51828`) - Period diff --git a/pandas/io/orc.py b/pandas/io/orc.py index b1bb4cef73c33..72423473e019f 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -21,6 +21,7 @@ ) import pandas as pd +from pandas.core.indexes.api import default_index from pandas.io.common import ( get_handle, @@ -190,6 +191,21 @@ def to_orc( if engine_kwargs is None: engine_kwargs = {} + # validate index + # -------------- + + # validate that we have only a default index + # raise on anything else as we don't serialize the index + + if not df.index.equals(default_index(len(df))): + raise ValueError( + "orc does not support serializing a non-default index for the index; " + "you can .reset_index() to make the index into column(s)" + ) + + if df.index.name is not None: + raise ValueError("orc does not serialize index meta-data on a default index") + # If unsupported dtypes are found raise NotImplementedError # In Pyarrow 8.0.0 this check will no longer be needed if pa_version_under8p0: diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index 35df047915255..dccdfdc897dc1 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -391,3 +391,21 @@ def test_orc_uri_path(): uri = pathlib.Path(path).as_uri() result = read_orc(uri) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "index", + [ + pd.RangeIndex(start=2, stop=5, step=1), + pd.RangeIndex(start=0, stop=3, step=1, name="non-default"), + pd.Index([1, 2, 3]), + ], +) +def test_to_orc_non_default_index(index): + df = pd.DataFrame({"a": [1, 2, 3]}, index=index) + msg = ( + "orc does not support serializing a non-default index|" + "orc does not 
serialize index meta-data" + ) + with pytest.raises(ValueError, match=msg): + df.to_orc()