From 613d51b11234e5c426abf1326067d0a6aafd33d1 Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Wed, 19 Aug 2020 23:13:51 +0100 Subject: [PATCH 1/2] Fix arrow tests Fix tests for pyarrow 1.0.0 Revert "Add new core members" This reverts commit 7ef7c12 --- ci/deps/azure-windows-38.yaml | 1 + pandas/tests/io/test_parquet.py | 19 ++++++++++++++----- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/ci/deps/azure-windows-38.yaml b/ci/deps/azure-windows-38.yaml index 1f383164b5328..805a976c92bc0 100644 --- a/ci/deps/azure-windows-38.yaml +++ b/ci/deps/azure-windows-38.yaml @@ -26,6 +26,7 @@ dependencies: - pytables - python-dateutil - pytz + - s3fs>=0.4.0 - scipy - xlrd - xlsxwriter diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 82157f3d722a9..9a3f68bfc09cb 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -557,13 +557,22 @@ def test_s3_roundtrip(self, df_compat, s3_resource, pa): @pytest.mark.parametrize("partition_col", [["A"], []]) def test_s3_roundtrip_for_dir(self, df_compat, s3_resource, pa, partition_col): # GH #26388 - # https://github.com/apache/arrow/blob/master/python/pyarrow/tests/test_parquet.py#L2716 - # As per pyarrow partitioned columns become 'categorical' dtypes + expected_df = df_compat.copy() + + # read_table uses the new Arrow Datasets API since pyarrow 1.0.0 + # Previous behaviour was pyarrow partitioned columns become 'categorical' dtypes # and are added to back of dataframe on read - expected_df = df_compat.copy() - if partition_col: - expected_df[partition_col] = expected_df[partition_col].astype("category") + legacy_read_table = LooseVersion(pyarrow.__version__) < LooseVersion("1.0.0") + if partition_col and legacy_read_table: + partition_col_type = "category" + else: + partition_col_type = "int32" + + expected_df[partition_col] = expected_df[partition_col].astype( + partition_col_type + ) + check_round_trip( df_compat, pa, From 2484e197363eb7a44ca525367ac9dfb6d0429b16 Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Sat, 22 Aug 2020 18:15:15 +0100 Subject: [PATCH 2/2] Remove typo --- pandas/tests/io/test_parquet.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 9d5a7ff98065d..4e0c16c71a6a8 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -702,7 +702,6 @@ def test_timestamp_nanoseconds(self, pa): # with version 2.0, pyarrow defaults to writing the nanoseconds, so # this should work without error df = pd.DataFrame({"a": pd.date_range("2017-01-01", freq="1n", periods=10)}) - df = pd.DataFrame({"a": pd.date_range("2017-01-01", freq="1n", periods=10)}) check_round_trip(df, pa, write_kwargs={"version": "2.0"}) @td.skip_if_no("pyarrow", min_version="0.17")