From af9dc6f33a1d686fa69345164c1bbec641f1dd8a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 8 Nov 2023 18:21:40 +0100 Subject: [PATCH 1/6] Parquet/Feather IO: disable PyExtensionType autoload --- pandas/io/_util.py | 31 +++++++++++++++++++++++++++++++ pandas/io/feather_format.py | 9 ++++++++- pandas/io/parquet.py | 7 ++++++- 3 files changed, 45 insertions(+), 2 deletions(-) diff --git a/pandas/io/_util.py b/pandas/io/_util.py index 3b2ae5daffdba..f816b624e83c3 100644 --- a/pandas/io/_util.py +++ b/pandas/io/_util.py @@ -32,3 +32,34 @@ def arrow_string_types_mapper() -> Callable: pa.string(): pd.StringDtype(storage="pyarrow_numpy"), pa.large_string(): pd.StringDtype(storage="pyarrow_numpy"), }.get + + +def patch_pyarrow(): + import pyarrow as pa + + if getattr(pa, "_hotfix_installed", False): + return + + class ForbiddenExtensionType(pa.ExtensionType): + def __arrow_ext_serialize__(self): + return b"" + + @classmethod + def __arrow_ext_deserialize__(cls, storage_type, serialized): + import io + import pickletools + + out = io.StringIO() + pickletools.dis(serialized, out) + raise RuntimeError( + "forbidden deserialization of 'arrow.py_extension_type': " + "storage_type = %s, serialized = %r, " + "pickle disassembly:\n%s" % (storage_type, serialized, out.getvalue()) + ) + + pa.unregister_extension_type("arrow.py_extension_type") + pa.register_extension_type( + ForbiddenExtensionType(pa.null(), "arrow.py_extension_type") + ) + + pa._hotfix_installed = True diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index c463f6e4d2759..ffa01c597eb45 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -17,7 +17,10 @@ from pandas.core.api import DataFrame from pandas.core.shared_docs import _shared_docs -from pandas.io._util import arrow_string_types_mapper +from pandas.io._util import ( + arrow_string_types_mapper, + patch_pyarrow, +) from pandas.io.common import get_handle if TYPE_CHECKING: @@ -60,6 +63,8 @@ def 
to_feather( import_optional_dependency("pyarrow") from pyarrow import feather + patch_pyarrow() + if not isinstance(df, DataFrame): raise ValueError("feather only support IO with DataFrames") @@ -117,6 +122,8 @@ def read_feather( import_optional_dependency("pyarrow") from pyarrow import feather + patch_pyarrow() + check_dtype_backend(dtype_backend) with get_handle( diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 0785f14c6839d..261205d47141d 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -29,7 +29,10 @@ ) from pandas.core.shared_docs import _shared_docs -from pandas.io._util import arrow_string_types_mapper +from pandas.io._util import ( + arrow_string_types_mapper, + patch_pyarrow, +) from pandas.io.common import ( IOHandles, get_handle, @@ -168,6 +171,8 @@ def __init__(self) -> None: # import utils to register the pyarrow extension types import pandas.core.arrays.arrow.extension_types # pyright: ignore[reportUnusedImport] # noqa: F401 + patch_pyarrow() + self.api = pyarrow def write( From 737794965126c3e1dd5858870db43b3384d04113 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 9 Nov 2023 14:19:47 +0100 Subject: [PATCH 2/6] don't install hotfix for pyarrow >= 14.0.1 --- pandas/compat/__init__.py | 1 + pandas/compat/pyarrow.py | 1 + pandas/io/_util.py | 4 ++++ 3 files changed, 6 insertions(+) diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index ea8cfb7cc144b..747634172511e 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -184,6 +184,7 @@ def get_bz2_file() -> type[pandas.compat.compressors.BZ2File]: "pa_version_under11p0", "pa_version_under13p0", "pa_version_under14p0", + "pa_version_under14p1", "IS64", "ISMUSL", "PY310", diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py index d125904ba83f8..083be86a8e1ab 100644 --- a/pandas/compat/pyarrow.py +++ b/pandas/compat/pyarrow.py @@ -13,6 +13,7 @@ pa_version_under12p0 = _palv < Version("12.0.0") pa_version_under13p0 = 
_palv < Version("13.0.0") pa_version_under14p0 = _palv < Version("14.0.0") + pa_version_under14p1 = _palv < Version("14.0.1") except ImportError: pa_version_under10p1 = True pa_version_under11p0 = True diff --git a/pandas/io/_util.py b/pandas/io/_util.py index f816b624e83c3..8f87573db2b49 100644 --- a/pandas/io/_util.py +++ b/pandas/io/_util.py @@ -2,6 +2,7 @@ from typing import Callable +from pandas.compat import pa_version_under14p1 from pandas.compat._optional import import_optional_dependency import pandas as pd @@ -35,6 +36,9 @@ def arrow_string_types_mapper() -> Callable: def patch_pyarrow(): + if pa_version_under14p1: + return + import pyarrow as pa if getattr(pa, "_hotfix_installed", False): From de19d14e2e2e075e6524b2ea200ea9fd37a98a28 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 9 Nov 2023 14:42:44 +0100 Subject: [PATCH 3/6] move patching to extension type definitions --- pandas/compat/__init__.py | 1 + pandas/core/arrays/arrow/extension_types.py | 39 +++++++++++++++++++++ pandas/io/_util.py | 35 ------------------ pandas/io/feather_format.py | 10 ++---- pandas/io/parquet.py | 7 +--- 5 files changed, 44 insertions(+), 48 deletions(-) diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 747634172511e..738442fab8c70 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -29,6 +29,7 @@ pa_version_under11p0, pa_version_under13p0, pa_version_under14p0, + pa_version_under14p1, ) if TYPE_CHECKING: diff --git a/pandas/core/arrays/arrow/extension_types.py b/pandas/core/arrays/arrow/extension_types.py index 7814a77a1cdc5..02ce3de0ae76e 100644 --- a/pandas/core/arrays/arrow/extension_types.py +++ b/pandas/core/arrays/arrow/extension_types.py @@ -5,6 +5,8 @@ import pyarrow +from pandas.compat import pa_version_under14p1 + from pandas.core.dtypes.dtypes import ( IntervalDtype, PeriodDtype, @@ -112,3 +114,40 @@ def to_pandas_dtype(self) -> IntervalDtype: # register the type with a dummy instance _interval_type = 
ArrowIntervalType(pyarrow.int64(), "left") pyarrow.register_extension_type(_interval_type) + + +def patch_pyarrow(): + # starting from pyarrow 14.0.1, it has its own mechanism + if not pa_version_under14p1: + return + + # if https://github.com/pitrou/pyarrow-hotfix was installed and enabled + if getattr(pyarrow, "_hotfix_installed", False): + return + + class ForbiddenExtensionType(pyarrow.ExtensionType): + def __arrow_ext_serialize__(self): + return b"" + + @classmethod + def __arrow_ext_deserialize__(cls, storage_type, serialized): + import io + import pickletools + + out = io.StringIO() + pickletools.dis(serialized, out) + raise RuntimeError( + "forbidden deserialization of 'arrow.py_extension_type': " + "storage_type = %s, serialized = %r, " + "pickle disassembly:\n%s" % (storage_type, serialized, out.getvalue()) + ) + + pyarrow.unregister_extension_type("arrow.py_extension_type") + pyarrow.register_extension_type( + ForbiddenExtensionType(pyarrow.null(), "arrow.py_extension_type") + ) + + pyarrow._hotfix_installed = True + + +patch_pyarrow() diff --git a/pandas/io/_util.py b/pandas/io/_util.py index 8f87573db2b49..3b2ae5daffdba 100644 --- a/pandas/io/_util.py +++ b/pandas/io/_util.py @@ -2,7 +2,6 @@ from typing import Callable -from pandas.compat import pa_version_under14p1 from pandas.compat._optional import import_optional_dependency import pandas as pd @@ -33,37 +32,3 @@ def arrow_string_types_mapper() -> Callable: pa.string(): pd.StringDtype(storage="pyarrow_numpy"), pa.large_string(): pd.StringDtype(storage="pyarrow_numpy"), }.get - - -def patch_pyarrow(): - if pa_version_under14p1: - return - - import pyarrow as pa - - if getattr(pa, "_hotfix_installed", False): - return - - class ForbiddenExtensionType(pa.ExtensionType): - def __arrow_ext_serialize__(self): - return b"" - - @classmethod - def __arrow_ext_deserialize__(cls, storage_type, serialized): - import io - import pickletools - - out = io.StringIO() - pickletools.dis(serialized, out) - raise 
RuntimeError( - "forbidden deserialization of 'arrow.py_extension_type': " - "storage_type = %s, serialized = %r, " - "pickle disassembly:\n%s" % (storage_type, serialized, out.getvalue()) - ) - - pa.unregister_extension_type("arrow.py_extension_type") - pa.register_extension_type( - ForbiddenExtensionType(pa.null(), "arrow.py_extension_type") - ) - - pa._hotfix_installed = True diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index ffa01c597eb45..c451cd6c139ed 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -17,10 +17,7 @@ from pandas.core.api import DataFrame from pandas.core.shared_docs import _shared_docs -from pandas.io._util import ( - arrow_string_types_mapper, - patch_pyarrow, -) +from pandas.io._util import arrow_string_types_mapper from pandas.io.common import get_handle if TYPE_CHECKING: @@ -63,8 +60,6 @@ def to_feather( import_optional_dependency("pyarrow") from pyarrow import feather - patch_pyarrow() - if not isinstance(df, DataFrame): raise ValueError("feather only support IO with DataFrames") @@ -122,7 +117,8 @@ def read_feather( import_optional_dependency("pyarrow") from pyarrow import feather - patch_pyarrow() + # import utils to register the pyarrow extension types + import pandas.core.arrays.arrow.extension_types # pyright: ignore[reportUnusedImport] # noqa: F401 check_dtype_backend(dtype_backend) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 261205d47141d..0785f14c6839d 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -29,10 +29,7 @@ ) from pandas.core.shared_docs import _shared_docs -from pandas.io._util import ( - arrow_string_types_mapper, - patch_pyarrow, -) +from pandas.io._util import arrow_string_types_mapper from pandas.io.common import ( IOHandles, get_handle, @@ -171,8 +168,6 @@ def __init__(self) -> None: # import utils to register the pyarrow extension types import pandas.core.arrays.arrow.extension_types # pyright: ignore[reportUnusedImport] # noqa: 
F401 - patch_pyarrow() - self.api = pyarrow def write( From 312931f3fb1cd50ce04ca3a2c5b18121902c1ff3 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 9 Nov 2023 15:06:06 +0100 Subject: [PATCH 4/6] expand error message --- pandas/core/arrays/arrow/extension_types.py | 27 ++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/arrow/extension_types.py b/pandas/core/arrays/arrow/extension_types.py index 02ce3de0ae76e..72bfd6f2212f8 100644 --- a/pandas/core/arrays/arrow/extension_types.py +++ b/pandas/core/arrays/arrow/extension_types.py @@ -116,6 +116,25 @@ def to_pandas_dtype(self) -> IntervalDtype: pyarrow.register_extension_type(_interval_type) +_ERROR_MSG = """\ +Disallowed deserialization of 'arrow.py_extension_type': +storage_type = {storage_type} +serialized = {serialized} +pickle disassembly:\n{pickle_disassembly} + +Reading of untrusted Parquet or Feather files with a PyExtensionType column +allows arbitrary code execution. +If you trust this file, you can enable reading the extension type by one of: + +- upgrading to pyarrow >= 14.0.1, and call `pa.PyExtensionType.set_auto_load(True)` +- install pyarrow-hotfix (`pip install pyarrow-hotfix`) and disable it by running + `import pyarrow_hotfix; pyarrow_hotfix.uninstall()` + +We strongly recommend updating your Parquet/Feather files to use extension types +derived from `pyarrow.ExtensionType` instead, and register this type explicitly. 
+""" + + def patch_pyarrow(): # starting from pyarrow 14.0.1, it has its own mechanism if not pa_version_under14p1: @@ -137,9 +156,11 @@ def __arrow_ext_deserialize__(cls, storage_type, serialized): out = io.StringIO() pickletools.dis(serialized, out) raise RuntimeError( - "forbidden deserialization of 'arrow.py_extension_type': " - "storage_type = %s, serialized = %r, " - "pickle disassembly:\n%s" % (storage_type, serialized, out.getvalue()) + _ERROR_MSG.format( + storage_type=storage_type, + serialized=serialized, + pickle_disassembly=out.getvalue(), + ) ) pyarrow.unregister_extension_type("arrow.py_extension_type") From b7481af5045ba018c19af7dc991c158df146c4db Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 9 Nov 2023 16:09:50 +0100 Subject: [PATCH 5/6] fix compat for pyarrow not installed --- pandas/compat/pyarrow.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py index 083be86a8e1ab..8dcb2669aa663 100644 --- a/pandas/compat/pyarrow.py +++ b/pandas/compat/pyarrow.py @@ -20,3 +20,4 @@ pa_version_under12p0 = True pa_version_under13p0 = True pa_version_under14p0 = True + pa_version_under14p1 = True From ece6eee2fae6553424b9b6f49c09112399261aa1 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 9 Nov 2023 17:12:54 +0100 Subject: [PATCH 6/6] add whatsnew --- doc/source/whatsnew/v2.1.3.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.1.3.rst b/doc/source/whatsnew/v2.1.3.rst index 31ab01f171b4a..ac70623bdcd11 100644 --- a/doc/source/whatsnew/v2.1.3.rst +++ b/doc/source/whatsnew/v2.1.3.rst @@ -23,6 +23,7 @@ Bug fixes ~~~~~~~~~ - Bug in :meth:`DatetimeIndex.diff` raising ``TypeError`` (:issue:`55080`) - Bug in :meth:`Index.isin` raising for Arrow backed string and ``None`` value (:issue:`55821`) +- Fix :func:`read_parquet` and :func:`read_feather` for `CVE-2023-47248 <https://www.cve.org/CVERecord?id=CVE-2023-47248>`__ (:issue:`55894`) ..
--------------------------------------------------------------------------- .. _whatsnew_213.other: