From b3053fd2bd107c8a7ec450b4ed1a81bc87b12ccb Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Thu, 23 Jan 2020 11:28:38 +0100
Subject: [PATCH 1/6] ENH: add use_nullable_dtypes option in read_parquet

---
 pandas/io/parquet.py            | 51 +++++++++++++++++++++++++++++----
 pandas/tests/io/test_parquet.py | 25 ++++++++++++++++
 2 files changed, 70 insertions(+), 6 deletions(-)

diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index 98f2eb3929b59..46fab5104a25d 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -1,5 +1,6 @@
 """ parquet compat """
 
+from distutils.version import LooseVersion
 from typing import Any, Dict, Optional
 from warnings import catch_warnings
 
@@ -116,13 +117,32 @@ def write(
                 **kwargs,
             )
 
-    def read(self, path, columns=None, **kwargs):
+    def read(self, path, columns=None, use_nullable_dtypes=False, **kwargs):
         path, _, _, should_close = get_filepath_or_buffer(path)
 
         kwargs["use_pandas_metadata"] = True
-        result = self.api.parquet.read_table(
-            path, columns=columns, **kwargs
-        ).to_pandas()
+        to_pandas_kwargs = {}
+        if use_nullable_dtypes:
+            if LooseVersion(self.api.__version__) > "0.15.1.dev":
+                import pandas as pd
+
+                mapping = {
+                    self.api.int8(): pd.Int8Dtype(),
+                    self.api.int16(): pd.Int16Dtype(),
+                    self.api.int32(): pd.Int32Dtype(),
+                    self.api.int64(): pd.Int64Dtype(),
+                    self.api.uint8(): pd.UInt8Dtype(),
+                    self.api.uint16(): pd.UInt16Dtype(),
+                    self.api.uint32(): pd.UInt32Dtype(),
+                    self.api.uint64(): pd.UInt64Dtype(),
+                    self.api.bool_(): pd.BooleanDtype(),
+                    self.api.string(): pd.StringDtype(),
+                }
+                to_pandas_kwargs["types_mapper"] = mapping.get
+
+        result = self.api.parquet.read_table(path, columns=columns, **kwargs).to_pandas(
+            **to_pandas_kwargs
+        )
         if should_close:
             path.close()
 
@@ -184,6 +204,12 @@ def write(
             )
 
     def read(self, path, columns=None, **kwargs):
+        use_nullable_dtypes = kwargs.pop("use_nullable_dtypes", False)
+        if use_nullable_dtypes:
+            raise ValueError(
+                "The 'use_nullable_dtypes' argument is not supported for the "
+                "fastparquet engine"
+            )
         if is_s3_url(path):
             from pandas.io.s3 import get_file_and_filesystem
 
@@ -263,7 +289,13 @@ def to_parquet(
     )
 
 
-def read_parquet(path, engine: str = "auto", columns=None, **kwargs):
+def read_parquet(
+    path,
+    engine: str = "auto",
+    columns=None,
+    use_nullable_dtypes: bool = False,
+    **kwargs,
+):
     """
     Load a parquet object from the file path, returning a DataFrame.
 
@@ -296,6 +328,11 @@ def read_parquet(path, engine: str = "auto", columns=None, **kwargs):
         If not None, only these columns will be read from the file.
 
         .. versionadded:: 0.21.1
+    use_nullable_dtypes : bool, default False
+        If True, use dtypes that use ``pd.NA`` as missing value indicator
+        for the resulting DataFrame (only applicable for ``engine="pyarrow"``).
+        As new dtypes are added that support ``pd.NA`` in the future, the
+        output with this option will change to use those dtypes.
     **kwargs
         Any additional kwargs are passed to the engine.
 
@@ -305,4 +342,6 @@ def read_parquet(path, engine: str = "auto", columns=None, **kwargs):
     """
 
     impl = get_engine(engine)
-    return impl.read(path, columns=columns, **kwargs)
+    return impl.read(
+        path, columns=columns, use_nullable_dtypes=use_nullable_dtypes, **kwargs
+    )
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index d51c712ed5abd..1467f6e88d132 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -564,6 +564,31 @@ def test_additional_extension_types(self, pa):
         )
         check_round_trip(df, pa)
 
+    @td.skip_if_no("pyarrow", min_version="0.15.1.dev")
+    def test_use_nullable_dtypes(self, pa):
+        import pyarrow.parquet as pq
+
+        table = pyarrow.table(
+            {
+                "a": pyarrow.array([1, 2, 3, None], "int64"),
+                "b": pyarrow.array(["a", "b", "c", None]),
+            }
+        )
+        with tm.ensure_clean() as path:
+            # write manually with pyarrow to write integers
+            pq.write_table(table, path)
+            result1 = read_parquet(path)
+            result2 = read_parquet(path, use_nullable_dtypes=True)
+
+        assert result1["a"].dtype == np.dtype("float64")
+        expected = pd.DataFrame(
+            {
+                "a": pd.array([1, 2, 3, None], dtype="Int64"),
+                "b": pd.array(["a", "b", "c", None], dtype="string"),
+            }
+        )
+        tm.assert_frame_equal(result2, expected)
+
 
 class TestParquetFastParquet(Base):
     @td.skip_if_no("fastparquet", min_version="0.3.2")

From f617a7e57c9b777fb88194f9920fb178319c6652 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Fri, 4 Sep 2020 13:45:01 +0200
Subject: [PATCH 2/6] add message for old versions + test also uint/bool

---
 pandas/io/parquet.py            | 5 +++++
 pandas/tests/io/test_parquet.py | 8 ++++++--
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index 6c91d927592bd..bb9524746e88b 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -175,6 +175,11 @@ def read(
                     self.api.string(): pd.StringDtype(),
                 }
                 to_pandas_kwargs["types_mapper"] = mapping.get
+            else:
+                raise ValueError(
+                    "'use_nullable_dtypes=True' is only supported for pyarrow >= 0.16"
+                    f" ({self.api.__version__} is installed"
+                )
 
         result = self.api.parquet.read_table(
             path, columns=columns, filesystem=fs, **kwargs
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index a2dce173dbf6e..ad9df89a3a03f 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -712,7 +712,9 @@ def test_use_nullable_dtypes(self, pa):
         table = pyarrow.table(
             {
                 "a": pyarrow.array([1, 2, 3, None], "int64"),
-                "b": pyarrow.array(["a", "b", "c", None]),
+                "b": pyarrow.array([1, 2, 3, None], "uint8"),
+                "c": pyarrow.array(["a", "b", "c", None]),
+                "d": pyarrow.array([True, False, True, None]),
             }
         )
         with tm.ensure_clean() as path:
@@ -725,7 +727,9 @@ def test_use_nullable_dtypes(self, pa):
         expected = pd.DataFrame(
             {
                 "a": pd.array([1, 2, 3, None], dtype="Int64"),
-                "b": pd.array(["a", "b", "c", None], dtype="string"),
+                "b": pd.array([1, 2, 3, None], dtype="UInt8"),
+                "c": pd.array(["a", "b", "c", None], dtype="string"),
+                "d": pd.array([True, False, True, None], dtype="boolean"),
             }
         )
         tm.assert_frame_equal(result2, expected)

From 60a1c0e0431699df48b12b2389d1172b4e8b63d8 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Tue, 29 Sep 2020 09:15:00 +0200
Subject: [PATCH 3/6] lint

---
 pandas/io/parquet.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index 6bb2c0ea10321..2698fa4ee1ca2 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -177,8 +177,8 @@ def read(
                 to_pandas_kwargs["types_mapper"] = mapping.get
             else:
                 raise ValueError(
-                    "'use_nullable_dtypes=True' is only supported for pyarrow >= 0.16"
-                    f" ({self.api.__version__} is installed"
+                    "'use_nullable_dtypes=True' is only supported for pyarrow >= 0.16 "
+                    f"({self.api.__version__} is installed)"
                 )
 
         result = self.api.parquet.read_table(

From 18c93b56290f23c89c3b00821cc682514b4078fe Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Tue, 29 Sep 2020 09:57:06 +0200
Subject: [PATCH 4/6] add whatsnew note

---
 doc/source/whatsnew/v1.2.0.rst | 5 +++++
 pandas/io/parquet.py           | 2 --
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index 031c74b1cc367..bfa00a376619c 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -120,6 +120,11 @@ Other enhancements
 - ``Styler`` now allows direct CSS class name addition to individual data cells (:issue:`36159`)
 - :meth:`Rolling.mean()` and :meth:`Rolling.sum()` use Kahan summation to calculate the mean to avoid numerical problems (:issue:`10319`, :issue:`11645`, :issue:`13254`, :issue:`32761`, :issue:`36031`)
 - :meth:`DatetimeIndex.searchsorted`, :meth:`TimedeltaIndex.searchsorted`, :meth:`PeriodIndex.searchsorted`, and :meth:`Series.searchsorted` with datetimelike dtypes will now try to cast string arguments (listlike and scalar) to the matching datetimelike type (:issue:`36346`)
+- :func:`read_parquet` gained a ``use_nullable_dtypes=True`` option to use
+  nullable dtypes that use ``pd.NA`` as missing value indicator where possible
+  for the resulting DataFrame (default is False, and only applicable for
+  ``engine="pyarrow"``) (:issue:`31242`)
+
 
 .. _whatsnew_120.api_breaking.python:
 
diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index 2698fa4ee1ca2..c1559677efe9b 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -384,8 +384,6 @@ def read_parquet(
         'pyarrow' is unavailable.
     columns : list, default=None
         If not None, only these columns will be read from the file.
-
-        .. versionadded:: 0.21.1
     use_nullable_dtypes : bool, default False
         If True, use dtypes that use ``pd.NA`` as missing value indicator
         for the resulting DataFrame (only applicable for ``engine="pyarrow"``).

From 46932f4b2e7ed8ca38eed732b0eb164334b4a66b Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Sat, 28 Nov 2020 20:42:41 +0100
Subject: [PATCH 5/6] update version

---
 pandas/io/parquet.py            | 2 +-
 pandas/tests/io/test_parquet.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index 28e89a7d8ddef..8b1184df92eaf 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -189,7 +189,7 @@ def read(
 
         to_pandas_kwargs = {}
         if use_nullable_dtypes:
-            if LooseVersion(self.api.__version__) > "0.15.1.dev":
+            if LooseVersion(self.api.__version__) >= "0.16":
                 import pandas as pd
 
                 mapping = {
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index 38603fe5c2132..d13679f252b37 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -828,7 +828,7 @@ def test_additional_extension_types(self, pa):
         )
         check_round_trip(df, pa)
 
-    @td.skip_if_no("pyarrow", min_version="0.15.1.dev")
+    @td.skip_if_no("pyarrow", min_version="0.16")
     def test_use_nullable_dtypes(self, pa):
         import pyarrow.parquet as pq
 

From 1375bad8bcf7967c2aed0a3aad3a7343554cdaec Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Sat, 28 Nov 2020 20:49:42 +0100
Subject: [PATCH 6/6] add fastparquet test ensuring an error

---
 pandas/tests/io/test_parquet.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index d13679f252b37..7e1d7fb17c8ed 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -1030,3 +1030,11 @@ def test_timezone_aware_index(self, fp, timezone_aware_date_list):
         expected = df.copy()
         expected.index.name = "index"
         check_round_trip(df, fp, expected=expected)
+
+    def test_use_nullable_dtypes_not_supported(self, fp):
+        df = pd.DataFrame({"a": [1, 2]})
+
+        with tm.ensure_clean() as path:
+            df.to_parquet(path)
+            with pytest.raises(ValueError, match="not supported for the fastparquet"):
+                read_parquet(path, engine="fastparquet", use_nullable_dtypes=True)