From b3053fd2bd107c8a7ec450b4ed1a81bc87b12ccb Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 23 Jan 2020 11:28:38 +0100 Subject: [PATCH 1/6] ENH: add use_nullable_dtypes option in read_parquet --- pandas/io/parquet.py | 51 +++++++++++++++++++++++++++++---- pandas/tests/io/test_parquet.py | 25 ++++++++++++++++ 2 files changed, 70 insertions(+), 6 deletions(-) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 98f2eb3929b59..46fab5104a25d 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -1,5 +1,6 @@ """ parquet compat """ +from distutils.version import LooseVersion from typing import Any, Dict, Optional from warnings import catch_warnings @@ -116,13 +117,32 @@ def write( **kwargs, ) - def read(self, path, columns=None, **kwargs): + def read(self, path, columns=None, use_nullable_dtypes=False, **kwargs): path, _, _, should_close = get_filepath_or_buffer(path) kwargs["use_pandas_metadata"] = True - result = self.api.parquet.read_table( - path, columns=columns, **kwargs - ).to_pandas() + to_pandas_kwargs = {} + if use_nullable_dtypes: + if LooseVersion(self.api.__version__) > "0.15.1.dev": + import pandas as pd + + mapping = { + self.api.int8(): pd.Int8Dtype(), + self.api.int16(): pd.Int16Dtype(), + self.api.int32(): pd.Int32Dtype(), + self.api.int64(): pd.Int64Dtype(), + self.api.uint8(): pd.UInt8Dtype(), + self.api.uint16(): pd.UInt16Dtype(), + self.api.uint32(): pd.UInt32Dtype(), + self.api.uint64(): pd.UInt64Dtype(), + self.api.bool_(): pd.BooleanDtype(), + self.api.string(): pd.StringDtype(), + } + to_pandas_kwargs["types_mapper"] = mapping.get + + result = self.api.parquet.read_table(path, columns=columns, **kwargs).to_pandas( + **to_pandas_kwargs + ) if should_close: path.close() @@ -184,6 +204,12 @@ def write( ) def read(self, path, columns=None, **kwargs): + use_nullable_dtypes = kwargs.pop("use_nullable_dtypes", False) + if use_nullable_dtypes: + raise ValueError( + "The 'use_nullable_dtypes' argument is not 
supported for the " + "fastparquet engine" + ) if is_s3_url(path): from pandas.io.s3 import get_file_and_filesystem @@ -263,7 +289,13 @@ def to_parquet( ) -def read_parquet(path, engine: str = "auto", columns=None, **kwargs): +def read_parquet( + path, + engine: str = "auto", + columns=None, + use_nullable_dtypes: bool = False, + **kwargs, +): """ Load a parquet object from the file path, returning a DataFrame. @@ -296,6 +328,11 @@ def read_parquet(path, engine: str = "auto", columns=None, **kwargs): If not None, only these columns will be read from the file. .. versionadded:: 0.21.1 + use_nullable_dtypes : bool, default False + If True, use dtypes that use ``pd.NA`` as missing value indicator + for the resulting DataFrame (only applicable for ``engine="pyarrow"``). + As new dtypes are added that support ``pd.NA`` in the future, the + output with this option will change to use those dtypes. **kwargs Any additional kwargs are passed to the engine. @@ -305,4 +342,6 @@ def read_parquet(path, engine: str = "auto", columns=None, **kwargs): """ impl = get_engine(engine) - return impl.read(path, columns=columns, **kwargs) + return impl.read( + path, columns=columns, use_nullable_dtypes=use_nullable_dtypes, **kwargs + ) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index d51c712ed5abd..1467f6e88d132 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -564,6 +564,31 @@ def test_additional_extension_types(self, pa): ) check_round_trip(df, pa) + @td.skip_if_no("pyarrow", min_version="0.15.1.dev") + def test_use_nullable_dtypes(self, pa): + import pyarrow.parquet as pq + + table = pyarrow.table( + { + "a": pyarrow.array([1, 2, 3, None], "int64"), + "b": pyarrow.array(["a", "b", "c", None]), + } + ) + with tm.ensure_clean() as path: + # write manually with pyarrow to write integers + pq.write_table(table, path) + result1 = read_parquet(path) + result2 = read_parquet(path, use_nullable_dtypes=True) + + assert 
result1["a"].dtype == np.dtype("float64") + expected = pd.DataFrame( + { + "a": pd.array([1, 2, 3, None], dtype="Int64"), + "b": pd.array(["a", "b", "c", None], dtype="string"), + } + ) + tm.assert_frame_equal(result2, expected) + class TestParquetFastParquet(Base): @td.skip_if_no("fastparquet", min_version="0.3.2") From f617a7e57c9b777fb88194f9920fb178319c6652 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 4 Sep 2020 13:45:01 +0200 Subject: [PATCH 2/6] add message for old versions + test also uint/bool --- pandas/io/parquet.py | 5 +++++ pandas/tests/io/test_parquet.py | 8 ++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 6c91d927592bd..bb9524746e88b 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -175,6 +175,11 @@ def read( self.api.string(): pd.StringDtype(), } to_pandas_kwargs["types_mapper"] = mapping.get + else: + raise ValueError( + "'use_nullable_dtypes=True' is only supported for pyarrow >= 0.16" + f" ({self.api.__version__} is installed" + ) result = self.api.parquet.read_table( path, columns=columns, filesystem=fs, **kwargs diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index a2dce173dbf6e..ad9df89a3a03f 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -712,7 +712,9 @@ def test_use_nullable_dtypes(self, pa): table = pyarrow.table( { "a": pyarrow.array([1, 2, 3, None], "int64"), - "b": pyarrow.array(["a", "b", "c", None]), + "b": pyarrow.array([1, 2, 3, None], "uint8"), + "c": pyarrow.array(["a", "b", "c", None]), + "d": pyarrow.array([True, False, True, None]), } ) with tm.ensure_clean() as path: @@ -725,7 +727,9 @@ def test_use_nullable_dtypes(self, pa): expected = pd.DataFrame( { "a": pd.array([1, 2, 3, None], dtype="Int64"), - "b": pd.array(["a", "b", "c", None], dtype="string"), + "b": pd.array([1, 2, 3, None], dtype="UInt8"), + "c": pd.array(["a", "b", "c", None], 
dtype="string"), + "d": pd.array([True, False, True, None], dtype="boolean"), } ) tm.assert_frame_equal(result2, expected) From 60a1c0e0431699df48b12b2389d1172b4e8b63d8 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 29 Sep 2020 09:15:00 +0200 Subject: [PATCH 3/6] lint --- pandas/io/parquet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 6bb2c0ea10321..2698fa4ee1ca2 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -177,8 +177,8 @@ def read( to_pandas_kwargs["types_mapper"] = mapping.get else: raise ValueError( - "'use_nullable_dtypes=True' is only supported for pyarrow >= 0.16" - f" ({self.api.__version__} is installed" + "'use_nullable_dtypes=True' is only supported for pyarrow >= 0.16 " + f"({self.api.__version__} is installed)" ) result = self.api.parquet.read_table( From 18c93b56290f23c89c3b00821cc682514b4078fe Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 29 Sep 2020 09:57:06 +0200 Subject: [PATCH 4/6] add whatsnew note --- doc/source/whatsnew/v1.2.0.rst | 5 +++++ pandas/io/parquet.py | 2 -- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 031c74b1cc367..bfa00a376619c 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -120,6 +120,11 @@ Other enhancements - ``Styler`` now allows direct CSS class name addition to individual data cells (:issue:`36159`) - :meth:`Rolling.mean()` and :meth:`Rolling.sum()` use Kahan summation to calculate the mean to avoid numerical problems (:issue:`10319`, :issue:`11645`, :issue:`13254`, :issue:`32761`, :issue:`36031`) - :meth:`DatetimeIndex.searchsorted`, :meth:`TimedeltaIndex.searchsorted`, :meth:`PeriodIndex.searchsorted`, and :meth:`Series.searchsorted` with datetimelike dtypes will now try to cast string arguments (listlike and scalar) to the matching datetimelike type (:issue:`36346`) +- 
:func:`read_parquet` gained a ``use_nullable_dtypes=True`` option to use + nullable dtypes that use ``pd.NA`` as missing value indicator where possible + for the resulting DataFrame (default is False, and only applicable for + ``engine="pyarrow"``) (:issue:`31242`) + .. _whatsnew_120.api_breaking.python: diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 2698fa4ee1ca2..c1559677efe9b 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -384,8 +384,6 @@ def read_parquet( 'pyarrow' is unavailable. columns : list, default=None If not None, only these columns will be read from the file. - - .. versionadded:: 0.21.1 use_nullable_dtypes : bool, default False If True, use dtypes that use ``pd.NA`` as missing value indicator for the resulting DataFrame (only applicable for ``engine="pyarrow"``). From 46932f4b2e7ed8ca38eed732b0eb164334b4a66b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 28 Nov 2020 20:42:41 +0100 Subject: [PATCH 5/6] update version --- pandas/io/parquet.py | 2 +- pandas/tests/io/test_parquet.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 28e89a7d8ddef..8b1184df92eaf 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -189,7 +189,7 @@ def read( to_pandas_kwargs = {} if use_nullable_dtypes: - if LooseVersion(self.api.__version__) > "0.15.1.dev": + if LooseVersion(self.api.__version__) >= "0.16": import pandas as pd mapping = { diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 38603fe5c2132..d13679f252b37 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -828,7 +828,7 @@ def test_additional_extension_types(self, pa): ) check_round_trip(df, pa) - @td.skip_if_no("pyarrow", min_version="0.15.1.dev") + @td.skip_if_no("pyarrow", min_version="0.16") def test_use_nullable_dtypes(self, pa): import pyarrow.parquet as pq From 1375bad8bcf7967c2aed0a3aad3a7343554cdaec Mon Sep 
17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 28 Nov 2020 20:49:42 +0100 Subject: [PATCH 6/6] add fastparquet test ensuring an error --- pandas/tests/io/test_parquet.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index d13679f252b37..7e1d7fb17c8ed 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1030,3 +1030,11 @@ def test_timezone_aware_index(self, fp, timezone_aware_date_list): expected = df.copy() expected.index.name = "index" check_round_trip(df, fp, expected=expected) + + def test_use_nullable_dtypes_not_supported(self, fp): + df = pd.DataFrame({"a": [1, 2]}) + + with tm.ensure_clean() as path: + df.to_parquet(path) + with pytest.raises(ValueError, match="not supported for the fastparquet"): + read_parquet(path, engine="fastparquet", use_nullable_dtypes=True)