From 39574c2f96415d3870a79f46dca4e2d4d28bec0e Mon Sep 17 00:00:00 2001 From: Ian Eaves Date: Fri, 13 Dec 2019 17:38:29 -0600 Subject: [PATCH 1/9] schema as optional arg --- pandas/io/parquet.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index a044cfcdf6a01..7f9349a3645d2 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -96,6 +96,10 @@ def write( from_pandas_kwargs = {} else: from_pandas_kwargs = {"preserve_index": index} + + if 'schema' in kwargs: + from_pandas_kwargs['schema'] = kwargs.pop('schema') + table = self.api.Table.from_pandas(df, **from_pandas_kwargs) if partition_cols is not None: self.api.parquet.write_to_dataset( From 1ffd4fd4284896af4edcc10f83213ec75eb6c5a1 Mon Sep 17 00:00:00 2001 From: Ian Eaves Date: Fri, 13 Dec 2019 17:40:17 -0600 Subject: [PATCH 2/9] black --- pandas/io/parquet.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 7f9349a3645d2..0621c02ce68cb 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -96,9 +96,9 @@ def write( from_pandas_kwargs = {} else: from_pandas_kwargs = {"preserve_index": index} - - if 'schema' in kwargs: - from_pandas_kwargs['schema'] = kwargs.pop('schema') + + if "schema" in kwargs: + from_pandas_kwargs["schema"] = kwargs.pop("schema") table = self.api.Table.from_pandas(df, **from_pandas_kwargs) if partition_cols is not None: From 8b5eba3b6904446194c9aa250d107e8c6e7e5baf Mon Sep 17 00:00:00 2001 From: Ian Eaves Date: Sat, 14 Dec 2019 19:29:58 -0600 Subject: [PATCH 3/9] test --- pandas/tests/io/test_parquet.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index a98c93c250070..05be843b0fa2c 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -504,6 +504,13 @@ def test_empty_dataframe(self, pa): df = pd.DataFrame() check_round_trip(df, pa) + def 
test_write_with_schema(self, pa): + import pyarrow + df = pd.DataFrame({'x': [0, 1]}) + schema = pyarrow.schema([pyarrow.field('x', type=pyarrow.bool_())]) + out_df = df.astype(bool) + check_round_trip(df, pa, write_kwargs={'schema': schema}, expected=out_df) + @pytest.mark.skip(reason="broken test") @td.skip_if_no("pyarrow", min_version="0.15.0") def test_additional_extension_arrays(self, pa): From 96c2b74a782d5e2ea8ab351e66d8a4198c74245f Mon Sep 17 00:00:00 2001 From: Ian Eaves Date: Sat, 14 Dec 2019 19:38:39 -0600 Subject: [PATCH 4/9] whatsnew --- doc/source/whatsnew/v1.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index cd11a15e74b1f..469f445b901cd 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -792,7 +792,7 @@ I/O - Bug in :func:`read_json` where default encoding was not set to ``utf-8`` (:issue:`29565`) - Bug in :class:`PythonParser` where str and bytes were being mixed when dealing with the decimal field (:issue:`29650`) - :meth:`read_gbq` now accepts ``progress_bar_type`` to display progress bar while the data downloads. (:issue:`29857`) -- +- :meth: `to_parquet` now appropriately handles the ``schema`` argument for user defined schemas in the pyarrow engine. 
(:issue: `30270`) Plotting ^^^^^^^^ From 0c3f6518a88eac122571bc6ec810993a5c5030a1 Mon Sep 17 00:00:00 2001 From: Ian Eaves Date: Sat, 14 Dec 2019 19:58:50 -0600 Subject: [PATCH 5/9] black --- pandas/tests/io/test_parquet.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 05be843b0fa2c..45e5131d059d2 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -506,10 +506,11 @@ def test_empty_dataframe(self, pa): def test_write_with_schema(self, pa): import pyarrow - df = pd.DataFrame({'x': [0, 1]}) - schema = pyarrow.schema([pyarrow.field('x', type=pyarrow.bool_())]) + + df = pd.DataFrame({"x": [0, 1]}) + schema = pyarrow.schema([pyarrow.field("x", type=pyarrow.bool_())]) out_df = df.astype(bool) - check_round_trip(df, pa, write_kwargs={'schema': schema}, expected=out_df) + check_round_trip(df, pa, write_kwargs={"schema": schema}, expected=out_df) @pytest.mark.skip(reason="broken test") @td.skip_if_no("pyarrow", min_version="0.15.0") From 96e411ddba8f032734f9d91bf6462aac06f18535 Mon Sep 17 00:00:00 2001 From: Ian Eaves Date: Sun, 15 Dec 2019 17:06:08 -0600 Subject: [PATCH 6/9] whatsnew & simplification --- doc/source/whatsnew/v1.0.0.rst | 3 +-- pandas/io/parquet.py | 5 +---- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 469f445b901cd..04dbc39f4ba36 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -792,7 +792,6 @@ I/O - Bug in :func:`read_json` where default encoding was not set to ``utf-8`` (:issue:`29565`) - Bug in :class:`PythonParser` where str and bytes were being mixed when dealing with the decimal field (:issue:`29650`) - :meth:`read_gbq` now accepts ``progress_bar_type`` to display progress bar while the data downloads. 
(:issue:`29857`) -- :meth: `to_parquet` now appropriately handles the ``schema`` argument for user defined schemas in the pyarrow engine. (:issue: `30270`) Plotting ^^^^^^^^ @@ -873,7 +872,7 @@ Other years after 2030 (now goes up to 2200) (:issue:`27790`) - Fixed :class:`IntegerArray` returning ``NA`` rather than ``inf`` for operations dividing by 0 (:issue:`27398`) - Bug in :meth:`Series.count` raises if use_inf_as_na is enabled (:issue:`29478`) - +- :meth: `to_parquet` now appropriately handles the ``schema`` argument for user defined schemas in the pyarrow engine. (:issue: `30270`) .. _whatsnew_1000.contributors: diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 0621c02ce68cb..7e275228d0e9f 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -91,15 +91,12 @@ def write( self.validate_dataframe(df) path, _, _, _ = get_filepath_or_buffer(path, mode="wb") - from_pandas_kwargs: Dict[str, Any] + from_pandas_kwargs: Dict[str, Any] = {schema: kwargs.pop("schema", None)} if index is None: from_pandas_kwargs = {} else: from_pandas_kwargs = {"preserve_index": index} - if "schema" in kwargs: - from_pandas_kwargs["schema"] = kwargs.pop("schema") - table = self.api.Table.from_pandas(df, **from_pandas_kwargs) if partition_cols is not None: self.api.parquet.write_to_dataset( From 7a1660160736dbf365b2b03f28c3f0a74dd17c36 Mon Sep 17 00:00:00 2001 From: Ian Eaves Date: Sun, 15 Dec 2019 18:36:58 -0600 Subject: [PATCH 7/9] quotes --- pandas/io/parquet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 7e275228d0e9f..6350455cf12e2 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -91,7 +91,7 @@ def write( self.validate_dataframe(df) path, _, _, _ = get_filepath_or_buffer(path, mode="wb") - from_pandas_kwargs: Dict[str, Any] = {schema: kwargs.pop("schema", None)} + from_pandas_kwargs: Dict[str, Any] = {"schema": kwargs.pop("schema", None)} if index is None: from_pandas_kwargs = 
{} else: From defcb9f8d33e11d40b5c1952a6330b4d49cc4f5f Mon Sep 17 00:00:00 2001 From: Ian Eaves Date: Sun, 15 Dec 2019 19:19:55 -0600 Subject: [PATCH 8/9] was not paying attention --- pandas/io/parquet.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 6350455cf12e2..be1e97ec49df5 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -92,10 +92,8 @@ def write( path, _, _, _ = get_filepath_or_buffer(path, mode="wb") from_pandas_kwargs: Dict[str, Any] = {"schema": kwargs.pop("schema", None)} - if index is None: - from_pandas_kwargs = {} - else: - from_pandas_kwargs = {"preserve_index": index} + if index is not None: + from_pandas_kwargs["preserve_index"] = index table = self.api.Table.from_pandas(df, **from_pandas_kwargs) if partition_cols is not None: From 9cb2332ef54d3010e88994349e1ff21dba3d4736 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 18 Dec 2019 08:55:14 +0100 Subject: [PATCH 9/9] move to enhancements --- doc/source/whatsnew/v1.0.0.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index c10d3c91a7821..17818118e587d 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -205,6 +205,8 @@ Other enhancements (:meth:`~DataFrame.to_parquet` / :func:`read_parquet`) using the `'pyarrow'` engine now preserve those data types with pyarrow >= 1.0.0 (:issue:`20612`). - The ``partition_cols`` argument in :meth:`DataFrame.to_parquet` now accepts a string (:issue:`27117`) +- :func:`to_parquet` now appropriately handles the ``schema`` argument for user defined schemas in the pyarrow engine. 
(:issue:`30270`) + Build Changes ^^^^^^^^^^^^^ @@ -879,7 +881,7 @@ Other - Fixed :class:`IntegerArray` returning ``inf`` rather than ``NaN`` for operations dividing by 0 (:issue:`27398`) - Fixed ``pow`` operations for :class:`IntegerArray` when the other value is ``0`` or ``1`` (:issue:`29997`) - Bug in :meth:`Series.count` raises if use_inf_as_na is enabled (:issue:`29478`) -- :meth: `to_parquet` now appropriately handles the ``schema`` argument for user defined schemas in the pyarrow engine. (:issue: `30270`) + .. _whatsnew_1000.contributors: