diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 5d36c52da9f0d..d37aaaf1fb040 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -241,6 +241,10 @@ Other enhancements - Calling a binary-input NumPy ufunc on multiple ``DataFrame`` objects now aligns, matching the behavior of binary operations and ufuncs on ``Series`` (:issue:`23743`). - Where possible :meth:`RangeIndex.difference` and :meth:`RangeIndex.symmetric_difference` will return :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`36564`) - :meth:`DataFrame.to_parquet` now supports :class:`MultiIndex` for columns in parquet format (:issue:`34777`) +- :func:`read_parquet` gained a ``use_nullable_dtypes=True`` option to use + nullable dtypes that use ``pd.NA`` as missing value indicator where possible + for the resulting DataFrame (default is False, and only applicable for + ``engine="pyarrow"``) (:issue:`31242`) - Added :meth:`.Rolling.sem` and :meth:`Expanding.sem` to compute the standard error of the mean (:issue:`26476`) - :meth:`.Rolling.var` and :meth:`.Rolling.std` use Kahan summation and Welford's Method to avoid numerical issues (:issue:`37051`) - :meth:`DataFrame.corr` and :meth:`DataFrame.cov` use Welford's Method to avoid numerical issues (:issue:`37448`) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index a19b132a7891d..8b1184df92eaf 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -1,5 +1,6 @@ """ parquet compat """ +from distutils.version import LooseVersion import io import os from typing import Any, AnyStr, Dict, List, Optional, Tuple @@ -177,10 +178,39 @@ def write( handles.close() def read( - self, path, columns=None, storage_options: StorageOptions = None, **kwargs + self, + path, + columns=None, + use_nullable_dtypes=False, + storage_options: StorageOptions = None, + **kwargs, ): kwargs["use_pandas_metadata"] = True + to_pandas_kwargs = {} + if use_nullable_dtypes: + if 
LooseVersion(self.api.__version__) >= "0.16": + import pandas as pd + + mapping = { + self.api.int8(): pd.Int8Dtype(), + self.api.int16(): pd.Int16Dtype(), + self.api.int32(): pd.Int32Dtype(), + self.api.int64(): pd.Int64Dtype(), + self.api.uint8(): pd.UInt8Dtype(), + self.api.uint16(): pd.UInt16Dtype(), + self.api.uint32(): pd.UInt32Dtype(), + self.api.uint64(): pd.UInt64Dtype(), + self.api.bool_(): pd.BooleanDtype(), + self.api.string(): pd.StringDtype(), + } + to_pandas_kwargs["types_mapper"] = mapping.get + else: + raise ValueError( + "'use_nullable_dtypes=True' is only supported for pyarrow >= 0.16 " + f"({self.api.__version__} is installed)" + ) + path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle( path, kwargs.pop("filesystem", None), @@ -190,7 +220,7 @@ def read( try: return self.api.parquet.read_table( path_or_handle, columns=columns, **kwargs - ).to_pandas() + ).to_pandas(**to_pandas_kwargs) finally: if handles is not None: handles.close() @@ -258,6 +288,12 @@ def write( def read( self, path, columns=None, storage_options: StorageOptions = None, **kwargs ): + use_nullable_dtypes = kwargs.pop("use_nullable_dtypes", False) + if use_nullable_dtypes: + raise ValueError( + "The 'use_nullable_dtypes' argument is not supported for the " + "fastparquet engine" + ) + path = stringify_path(path) parquet_kwargs = {} handles = None @@ -368,7 +404,13 @@ def to_parquet( return None -def read_parquet(path, engine: str = "auto", columns=None, **kwargs): +def read_parquet( + path, + engine: str = "auto", + columns=None, + use_nullable_dtypes: bool = False, + **kwargs, +): """ Load a parquet object from the file path, returning a DataFrame. @@ -397,6 +439,15 @@ def read_parquet(path, engine: str = "auto", columns=None, **kwargs): 'pyarrow' is unavailable. columns : list, default=None If not None, only these columns will be read from the file. 
+ use_nullable_dtypes : bool, default False + If True, use dtypes that use ``pd.NA`` as missing value indicator + for the resulting DataFrame (only applicable for ``engine="pyarrow"``). + As new dtypes are added that support ``pd.NA`` in the future, the + output with this option will change to use those dtypes. + Note: this is an experimental option, and behaviour (e.g. additional + supported dtypes) may change without notice. + + .. versionadded:: 1.2.0 **kwargs Any additional kwargs are passed to the engine. @@ -405,4 +456,6 @@ def read_parquet(path, engine: str = "auto", columns=None, **kwargs): DataFrame """ impl = get_engine(engine) - return impl.read(path, columns=columns, **kwargs) + return impl.read( + path, columns=columns, use_nullable_dtypes=use_nullable_dtypes, **kwargs + ) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 3b83eed69c723..7e1d7fb17c8ed 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -828,6 +828,35 @@ def test_additional_extension_types(self, pa): ) check_round_trip(df, pa) + @td.skip_if_no("pyarrow", min_version="0.16") + def test_use_nullable_dtypes(self, pa): + import pyarrow.parquet as pq + + table = pyarrow.table( + { + "a": pyarrow.array([1, 2, 3, None], "int64"), + "b": pyarrow.array([1, 2, 3, None], "uint8"), + "c": pyarrow.array(["a", "b", "c", None]), + "d": pyarrow.array([True, False, True, None]), + } + ) + with tm.ensure_clean() as path: + # write manually with pyarrow to write integers + pq.write_table(table, path) + result1 = read_parquet(path) + result2 = read_parquet(path, use_nullable_dtypes=True) + + assert result1["a"].dtype == np.dtype("float64") + expected = pd.DataFrame( + { + "a": pd.array([1, 2, 3, None], dtype="Int64"), + "b": pd.array([1, 2, 3, None], dtype="UInt8"), + "c": pd.array(["a", "b", "c", None], dtype="string"), + "d": pd.array([True, False, True, None], dtype="boolean"), + } + ) + tm.assert_frame_equal(result2, expected) + 
@td.skip_if_no("pyarrow", min_version="0.14") def test_timestamp_nanoseconds(self, pa): # with version 2.0, pyarrow defaults to writing the nanoseconds, so @@ -1001,3 +1030,11 @@ def test_timezone_aware_index(self, fp, timezone_aware_date_list): expected = df.copy() expected.index.name = "index" check_round_trip(df, fp, expected=expected) + + def test_use_nullable_dtypes_not_supported(self, fp): + df = pd.DataFrame({"a": [1, 2]}) + + with tm.ensure_clean() as path: + df.to_parquet(path) + with pytest.raises(ValueError, match="not supported for the fastparquet"): + read_parquet(path, engine="fastparquet", use_nullable_dtypes=True)