From 89e6f9a447b4fc0a6643d9dc590a7b28515b75be Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 23 Feb 2023 16:14:34 -0800 Subject: [PATCH 01/24] ENH: Add filesystem to read_parquet/to_parquet --- pandas/io/parquet.py | 54 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 43 insertions(+), 11 deletions(-) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index aec31f40f8570..9f31cc1f8a48a 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -86,11 +86,17 @@ def _get_path_or_handle( """File handling for PyArrow.""" path_or_handle = stringify_path(path) if is_fsspec_url(path_or_handle) and fs is None: + pa = import_optional_dependency("pyarrow") + pa_fs = import_optional_dependency("pyarrow.fs") fsspec = import_optional_dependency("fsspec") - fs, path_or_handle = fsspec.core.url_to_fs( - path_or_handle, **(storage_options or {}) - ) + try: + fs, path_or_handle = pa_fs.FileSystem.from_uri(path) + fs = fsspec.implementations.arrow.ArrowFSWrapper(fs) + except (TypeError, pa.ArrowInvalid): + fs, path_or_handle = fsspec.core.url_to_fs( + path_or_handle, **(storage_options or {}) + ) elif storage_options and (not is_url(path_or_handle) or mode != "rb"): # can't write to a remote url # without making use of fsspec at the moment @@ -169,6 +175,7 @@ def write( index: bool | None = None, storage_options: StorageOptions = None, partition_cols: list[str] | None = None, + filesystem=None, **kwargs, ) -> None: self.validate_dataframe(df) @@ -179,9 +186,9 @@ def write( table = self.api.Table.from_pandas(df, **from_pandas_kwargs) - path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle( + path_or_handle, handles, filesystem = _get_path_or_handle( path, - kwargs.pop("filesystem", None), + filesystem, storage_options=storage_options, mode="wb", is_dir=partition_cols is not None, @@ -203,12 +210,17 @@ def write( path_or_handle, compression=compression, partition_cols=partition_cols, + filesystem=filesystem, **kwargs, ) else: # write to single output file self.api.parquet.write_table( - table, path_or_handle, compression=compression, **kwargs + table, + path_or_handle, + compression=compression, + filesystem=filesystem, + **kwargs, ) finally: if handles is not None: @@ -220,6 +232,7 @@ def read( columns=None, use_nullable_dtypes: bool = False, storage_options: StorageOptions = None, + filesystem=None, **kwargs, ) -> DataFrame: kwargs["use_pandas_metadata"] = True @@ -237,15 +250,15 @@ def read( if manager == "array": to_pandas_kwargs["split_blocks"] = True # type: ignore[assignment] - path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle( + path_or_handle, handles, filesystem = _get_path_or_handle( path, - kwargs.pop("filesystem", None), + filesystem, storage_options=storage_options, mode="rb", ) try: pa_table = self.api.parquet.read_table( - path_or_handle, columns=columns, **kwargs + path_or_handle, columns=columns, filesystem=filesystem, **kwargs ) if dtype_backend == "pandas": result = pa_table.to_pandas(**to_pandas_kwargs) @@ -323,7 +336,12 @@ def write( ) def read( - self, path, columns=None, storage_options: StorageOptions = None, **kwargs + self, + path, + columns=None, + storage_options: StorageOptions = None, + filesystem=None, + **kwargs, ) -> DataFrame: parquet_kwargs: dict[str, Any] = {} use_nullable_dtypes = kwargs.pop("use_nullable_dtypes", False) @@ -335,6 +353,10 @@ def read( "The 'use_nullable_dtypes' argument is not supported for the " "fastparquet engine" ) + if filesystem is not None: + raise NotImplementedError( + "filesystem is not implemented for the fastparquet engine." + ) path = stringify_path(path) handles = None if is_fsspec_url(path): @@ -455,6 +477,7 @@ def read_parquet( columns: list[str] | None = None, storage_options: StorageOptions = None, use_nullable_dtypes: bool | lib.NoDefault = lib.no_default, + filesystem: Any = None, **kwargs, ) -> DataFrame: """ @@ -477,6 +500,12 @@ def read_parquet( ``io.parquet.engine`` is used. The default ``io.parquet.engine`` behavior is to try 'pyarrow', falling back to 'fastparquet' if 'pyarrow' is unavailable. + + When using the ``'pyarrow'`` engine and no storage options are provided + and a filesystem is implemented by both ``pyarrow.fs`` and ``fsspec`` + (e.g. "s3://"), then the ``pyarrow.fs`` filesystem is attempted first. + Use the filesystem keyword with an instantiated fsspec filesystem + if you wish to use its implementation. columns : list, default=None If not None, only these columns will be read from the file. @@ -505,6 +534,9 @@ def read_parquet( .. versionadded:: 2.0.0 + filesystem: fsspec or pyarrow filesystem, default None + Filesystem object to use when reading the parquet file. Only implemented + for ``engine-"pyarrow"``. **kwargs Any additional kwargs are passed to the engine. @@ -525,5 +557,5 @@ def read_parquet( columns=columns, storage_options=storage_options, use_nullable_dtypes=use_nullable_dtypes, - **kwargs, + filesystem=filesystem**kwargs, ) From 06a2b1833952dc2ea6af5de14a2f4e9848201a66 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 23 Feb 2023 16:16:27 -0800 Subject: [PATCH 02/24] Add to to_parquet --- pandas/io/parquet.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 9f31cc1f8a48a..ad0b576f3d515 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -396,6 +396,7 @@ def to_parquet( index: bool | None = None, storage_options: StorageOptions = None, partition_cols: list[str] | None = None, + filepath: Any = None, **kwargs, ) -> bytes | None: """ @@ -440,6 +441,12 @@ def to_parquet( .. versionadded:: 1.2.0 + filesystem: fsspec or pyarrow filesystem, default None + Filesystem object to use when reading the parquet file. Only implemented + for ``engine-"pyarrow"``. + + .. versionadded:: 2.1.0 + kwargs Additional keyword arguments passed to the engine @@ -537,6 +544,9 @@ def read_parquet( filesystem: fsspec or pyarrow filesystem, default None Filesystem object to use when reading the parquet file. Only implemented for ``engine-"pyarrow"``. + + .. versionadded:: 2.1.0 + **kwargs Any additional kwargs are passed to the engine. From 0b07437c59e02f59f84360510a45189495612581 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 23 Feb 2023 16:37:15 -0800 Subject: [PATCH 03/24] Bump fsspec --- ci/deps/actions-310.yaml | 2 +- ci/deps/actions-311.yaml | 2 +- ci/deps/actions-38-downstream_compat.yaml | 2 +- ci/deps/actions-38-minimum_versions.yaml | 2 +- ci/deps/actions-38.yaml | 2 +- ci/deps/actions-39.yaml | 2 +- ci/deps/circle-38-arm64.yaml | 2 +- doc/source/getting_started/install.rst | 2 +- doc/source/whatsnew/v2.1.0.rst | 2 +- environment.yml | 2 +- pandas/compat/_optional.py | 2 +- pyproject.toml | 4 ++-- requirements-dev.txt | 2 +- 13 files changed, 14 insertions(+), 14 deletions(-) diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index 64f9a3fd1ffbc..8742d3da6298c 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -26,7 +26,7 @@ dependencies: - bottleneck>=1.3.2 - brotlipy>=0.7.0 - fastparquet>=0.6.3 - - fsspec>=2021.07.0 + - fsspec>=2022.03.0 - html5lib>=1.1 - hypothesis>=6.34.2 - gcsfs>=2021.07.0 diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index d474df1e75655..0ab1a3e298fce 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -26,7 +26,7 @@ dependencies: - bottleneck>=1.3.2 - brotlipy>=0.7.0 - fastparquet>=0.6.3 - - fsspec>=2021.07.0 + - fsspec>=2022.03.0 - html5lib>=1.1 - hypothesis>=6.34.2 - gcsfs>=2021.07.0 diff --git a/ci/deps/actions-38-downstream_compat.yaml b/ci/deps/actions-38-downstream_compat.yaml index a9cd4c93dd604..d60dcd68dd085 100644 --- a/ci/deps/actions-38-downstream_compat.yaml +++ b/ci/deps/actions-38-downstream_compat.yaml @@ -27,7 +27,7 @@ dependencies: - brotlipy>=0.7.0 - bottleneck>=1.3.2 - fastparquet>=0.6.3 - - fsspec>=2021.07.0 + - fsspec>=2022.03.0 - html5lib>=1.1 - hypothesis>=6.34.2 - gcsfs>=2021.07.0 diff --git a/ci/deps/actions-38-minimum_versions.yaml b/ci/deps/actions-38-minimum_versions.yaml index 6877d7f14f66a..b7ff711bad04b 100644 --- a/ci/deps/actions-38-minimum_versions.yaml +++ b/ci/deps/actions-38-minimum_versions.yaml @@ -28,7 +28,7 @@ dependencies: - bottleneck=1.3.2 - brotlipy=0.7.0 - fastparquet=0.6.3 - - fsspec=2021.07.0 + - fsspec=2022.03.0 - html5lib=1.1 - hypothesis=6.34.2 - gcsfs=2021.07.0 diff --git a/ci/deps/actions-38.yaml b/ci/deps/actions-38.yaml index ccde0f57f7bc4..35b4d1f66548b 100644 --- a/ci/deps/actions-38.yaml +++ b/ci/deps/actions-38.yaml @@ -26,7 +26,7 @@ dependencies: - bottleneck>=1.3.2 - brotlipy>=0.7.0 - fastparquet>=0.6.3 - - fsspec>=2021.07.0 + - fsspec>=2022.03.0 - html5lib>=1.1 - hypothesis>=6.34.2 - gcsfs>=2021.07.0 diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index aeb887d7ec1ab..8c2e3803a4522 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -26,7 +26,7 @@ dependencies: - bottleneck>=1.3.2 - brotlipy>=0.7.0 - fastparquet>=0.6.3 - - fsspec>=2021.07.0 + - fsspec>=2022.03.0 - html5lib>=1.1 - hypothesis>=6.34.2 - gcsfs>=2021.07.0 diff --git a/ci/deps/circle-38-arm64.yaml b/ci/deps/circle-38-arm64.yaml index 0d1a5f765b5ce..c6c5dfd6f00ff 100644 --- a/ci/deps/circle-38-arm64.yaml +++ b/ci/deps/circle-38-arm64.yaml @@ -26,7 +26,7 @@ dependencies: - bottleneck>=1.3.2 - brotlipy>=0.7.0 - fastparquet>=0.6.3 - - fsspec>=2021.07.0 + - fsspec>=2022.03.0 - html5lib>=1.1 - hypothesis>=6.34.2 - gcsfs>=2021.07.0 diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 28a57720a89a5..e08c4c01f8a56 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -469,7 +469,7 @@ Installable with ``pip install "pandas[fss, aws, gcp]"`` ========================= ================== =============== ============================================================= Dependency Minimum Version pip extra Notes ========================= ================== =============== ============================================================= -fsspec 2021.7.0 fss, gcp, aws Handling files aside from simple local and HTTP (required +fsspec 2022.3.0 fss, gcp, aws Handling files aside from simple local and HTTP (required dependency of s3fs, gcsfs). gcsfs 2021.7.0 gcp Google Cloud Storage access pandas-gbq 0.15.0 gcp Google Big Query access diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index f6a6c81bfe25d..811775d320dcd 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -75,7 +75,7 @@ Optional libraries below the lowest tested version may still work, but are not c +-----------------+-----------------+---------+ | Package | Minimum Version | Changed | +=================+=================+=========+ -| | | X | +| fsspec | 2022.3.0 | X | +-----------------+-----------------+---------+ See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more. diff --git a/environment.yml b/environment.yml index e748d20d6d6f0..608c022c4d404 100644 --- a/environment.yml +++ b/environment.yml @@ -28,7 +28,7 @@ dependencies: - brotlipy>=0.7.0 - bottleneck>=1.3.2 - fastparquet>=0.6.3 - - fsspec>=2021.07.0 + - fsspec>=2022.03.0 - html5lib>=1.1 - hypothesis>=6.34.2 - gcsfs>=2021.07.0 diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 01ac462eeb659..8ffc43a4dc9fb 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -17,7 +17,7 @@ "bottleneck": "1.3.2", "brotli": "0.7.0", "fastparquet": "0.6.3", - "fsspec": "2021.07.0", + "fsspec": "2022.03.0", "html5lib": "1.1", "hypothesis": "6.34.2", "gcsfs": "2021.07.0", diff --git a/pyproject.toml b/pyproject.toml index c3a7cb013ca6c..d1fb291f41b3f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,7 +59,7 @@ test = ['hypothesis>=6.34.2', 'pytest>=7.0.0', 'pytest-xdist>=2.2.0', 'pytest-as performance = ['bottleneck>=1.3.2', 'numba>=0.53.1', 'numexpr>=2.7.1'] timezone = ['tzdata>=2022.1'] computation = ['scipy>=1.7.1', 'xarray>=0.21.0'] -fss = ['fsspec>=2021.07.0'] +fss = ['fsspec>=2022.03.0'] aws = ['s3fs>=2021.08.0'] gcp = ['gcsfs>=2021.07.0', 'pandas-gbq>=0.15.0'] excel = ['odfpy>=1.4.1', 'openpyxl>=3.0.7', 'pyxlsb>=1.0.8', 'xlrd>=2.0.1', 'xlsxwriter>=1.4.3'] @@ -84,7 +84,7 @@ all = ['beautifulsoup4>=4.9.3', 'bottleneck>=1.3.2', 'brotlipy>=0.7.0', 'fastparquet>=0.6.3', - 'fsspec>=2021.07.0', + 'fsspec>=2022.03.0', 'gcsfs>=2021.07.0', 'html5lib>=1.1', 'hypothesis>=6.34.2', diff --git a/requirements-dev.txt b/requirements-dev.txt index 0329588de17fd..8297b0254ef21 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -17,7 +17,7 @@ blosc brotlipy>=0.7.0 bottleneck>=1.3.2 fastparquet>=0.6.3 -fsspec>=2021.07.0 +fsspec>=2022.03.0 html5lib>=1.1 hypothesis>=6.34.2 gcsfs>=2021.07.0 From f57a19525959529eb92a17d15f88e78f813fe3e1 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 23 Feb 2023 18:18:01 -0800 Subject: [PATCH 04/24] fix import --- pandas/io/parquet.py | 15 ++++++++++++--- pandas/tests/io/test_parquet.py | 13 +++++++++++++ 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index ad0b576f3d515..682af96418784 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -91,8 +91,9 @@ def _get_path_or_handle( fsspec = import_optional_dependency("fsspec") try: + fs_arrow = import_optional_dependency("fsspec.implementations.arrow") fs, path_or_handle = pa_fs.FileSystem.from_uri(path) - fs = fsspec.implementations.arrow.ArrowFSWrapper(fs) + fs = fs_arrow.ArrowFSWrapper(fs) except (TypeError, pa.ArrowInvalid): fs, path_or_handle = fsspec.core.url_to_fs( path_or_handle, **(storage_options or {}) @@ -296,6 +297,7 @@ def write( index=None, partition_cols=None, storage_options: StorageOptions = None, + filesystem=None, **kwargs, ) -> None: self.validate_dataframe(df) @@ -311,6 +313,11 @@ def write( if partition_cols is not None: kwargs["file_scheme"] = "hive" + if filesystem is not None: + raise NotImplementedError( + "filesystem is not implemented for the fastparquet engine." + ) + # cannot use get_handle as write() does not accept file buffers path = stringify_path(path) if is_fsspec_url(path): @@ -396,7 +403,7 @@ def to_parquet( index: bool | None = None, storage_options: StorageOptions = None, partition_cols: list[str] | None = None, - filepath: Any = None, + filesystem: Any = None, **kwargs, ) -> bytes | None: """ @@ -467,6 +474,7 @@ def to_parquet( index=index, partition_cols=partition_cols, storage_options=storage_options, + filesystem=filesystem, **kwargs, ) @@ -567,5 +575,6 @@ def read_parquet( columns=columns, storage_options=storage_options, use_nullable_dtypes=use_nullable_dtypes, - filesystem=filesystem**kwargs, + filesystem=filesystem, + **kwargs, ) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 353dc4f1cbd8a..8a404bea824d8 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1225,3 +1225,16 @@ def test_bytes_file_name(self, engine): result = read_parquet(path, engine=engine) tm.assert_frame_equal(result, df) + + def test_filesystem_notimplemented(self): + df = pd.DataFrame(data={"A": [0, 1], "B": [1, 0]}) + with pytest.raises(NotImplementedError, match="filesystem is not implemented"): + with tm.ensure_clean() as path: + df.to_parquet(path, engine="fastparquet", filesystem="foo") + + with tm.ensure_clean() as path: + pathlib.Path(path).write_bytes(b"foo") + with pytest.raises( + NotImplementedError, match="filesystem is not implemented" + ): + read_parquet(path, engine="fastparquet", filesystem="foo") From 6503a7e51bdd504f92a6bf5a2e327d3ed96012ca Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 23 Feb 2023 20:22:03 -0800 Subject: [PATCH 05/24] Mock gcs to local for parquet --- pandas/tests/io/test_gcs.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index a609c1b5fc03d..3261801ee8674 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -43,9 +43,24 @@ def ls(self, path, **kwargs): return gcs_buffer +@pytest.fixture +def mock_pa_filesystem(monkeypatch): + pa_fs = pytest.importorskip("pyarrow.fs") + + class MockFileSystem(pa_fs.FileSystem): + @staticmethod + def from_uri(path): + to_local = path.replace("gs", "file") + return pa_fs.LocalFileSystem.from_uri(to_local), to_local + + with monkeypatch.context() as m: + m.setattr(pa_fs, "FileSystem", MockFileSystem) + yield + + @td.skip_if_no("gcsfs") @pytest.mark.parametrize("format", ["csv", "json", "parquet", "excel", "markdown"]) -def test_to_read_gcs(gcs_buffer, format): +def test_to_read_gcs(gcs_buffer, format, mock_pa_filesystem): """ Test that many to/read functions support GCS. From dd2760444b0402155c406b8d3744aa710137b5f7 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 24 Feb 2023 13:13:17 -0800 Subject: [PATCH 06/24] Fix condidition, add whatsnew --- doc/source/whatsnew/v2.1.0.rst | 2 +- pandas/io/parquet.py | 23 ++++++++++++++++------- pandas/tests/io/test_gcs.py | 2 +- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 2cc766186787c..03e5a3ccc363a 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -103,7 +103,7 @@ Performance improvements - Performance improvement in :meth:`DataFrame.first_valid_index` and :meth:`DataFrame.last_valid_index` for extension array dtypes (:issue:`51549`) - Performance improvement in :meth:`DataFrame.clip` and :meth:`Series.clip` (:issue:`51472`) - Performance improvement in :func:`read_parquet` on string columns when using ``use_nullable_dtypes=True`` (:issue:`47345`) -- +- Performance improvement in :meth:`read_parquet` and :meth:`DataFrame.to_parquet` when reading a remote file with ``engine="pyarrow"`` (:issue:`51609`) .. --------------------------------------------------------------------------- .. _whatsnew_210.bug_fixes: diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 682af96418784..0cadfa60c32cf 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -86,15 +86,18 @@ def _get_path_or_handle( """File handling for PyArrow.""" path_or_handle = stringify_path(path) if is_fsspec_url(path_or_handle) and fs is None: - pa = import_optional_dependency("pyarrow") - pa_fs = import_optional_dependency("pyarrow.fs") fsspec = import_optional_dependency("fsspec") + if storage_options is None: + pa = import_optional_dependency("pyarrow") + pa_fs = import_optional_dependency("pyarrow.fs") - try: - fs_arrow = import_optional_dependency("fsspec.implementations.arrow") - fs, path_or_handle = pa_fs.FileSystem.from_uri(path) - fs = fs_arrow.ArrowFSWrapper(fs) - except (TypeError, pa.ArrowInvalid): + try: + fs_arrow = import_optional_dependency("fsspec.implementations.arrow") + fs, path_or_handle = pa_fs.FileSystem.from_uri(path) + fs = fs_arrow.ArrowFSWrapper(fs) + except (TypeError, pa.ArrowInvalid): + pass + if fs is None: fs, path_or_handle = fsspec.core.url_to_fs( path_or_handle, **(storage_options or {}) ) @@ -426,6 +429,12 @@ def to_parquet( ``io.parquet.engine`` is used. The default ``io.parquet.engine`` behavior is to try 'pyarrow', falling back to 'fastparquet' if 'pyarrow' is unavailable. + + When using the ``'pyarrow'`` engine and no storage options are provided + and a filesystem is implemented by both ``pyarrow.fs`` and ``fsspec`` + (e.g. "s3://"), then the ``pyarrow.fs`` filesystem is attempted first. + Use the filesystem keyword with an instantiated fsspec filesystem + if you wish to use its implementation. compression : {{'snappy', 'gzip', 'brotli', 'lz4', 'zstd', None}}, default 'snappy'. Name of the compression to use. Use ``None`` for no compression. The supported compression methods actually diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index 3261801ee8674..882314e764654 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -20,7 +20,7 @@ @pytest.fixture -def gcs_buffer(monkeypatch): +def gcs_buffer(): """Emulate GCS using a binary buffer.""" import fsspec From 4ce7da83c00273fdaa7de2fbfef6d27a1ca4ec80 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 24 Feb 2023 14:04:03 -0800 Subject: [PATCH 07/24] address tests, bump gcsfs --- ci/deps/actions-310.yaml | 2 +- ci/deps/actions-311.yaml | 2 +- ci/deps/actions-38-downstream_compat.yaml | 2 +- ci/deps/actions-38-minimum_versions.yaml | 2 +- ci/deps/actions-38.yaml | 2 +- ci/deps/actions-39.yaml | 2 +- ci/deps/circle-38-arm64.yaml | 2 +- doc/source/getting_started/install.rst | 2 +- doc/source/whatsnew/v2.1.0.rst | 3 ++- environment.yml | 2 +- pandas/compat/_optional.py | 2 +- pandas/tests/io/test_parquet.py | 1 + pyproject.toml | 4 ++-- requirements-dev.txt | 2 +- 14 files changed, 16 insertions(+), 14 deletions(-) diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index 8742d3da6298c..189d962f4abc6 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -29,7 +29,7 @@ dependencies: - fsspec>=2022.03.0 - html5lib>=1.1 - hypothesis>=6.34.2 - - gcsfs>=2021.07.0 + - gcsfs>=2022.03.0 - jinja2>=3.0.0 - lxml>=4.6.3 - matplotlib>=3.6.1, <3.7.0 diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index 0ab1a3e298fce..6d5fe6af569a4 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -29,7 +29,7 @@ dependencies: - fsspec>=2022.03.0 - html5lib>=1.1 - hypothesis>=6.34.2 - - gcsfs>=2021.07.0 + - gcsfs>=2022.03.0 - jinja2>=3.0.0 - lxml>=4.6.3 - matplotlib>=3.6.1, <3.7.0 diff --git a/ci/deps/actions-38-downstream_compat.yaml b/ci/deps/actions-38-downstream_compat.yaml index d60dcd68dd085..49a604483203f 100644 --- a/ci/deps/actions-38-downstream_compat.yaml +++ b/ci/deps/actions-38-downstream_compat.yaml @@ -30,7 +30,7 @@ dependencies: - fsspec>=2022.03.0 - html5lib>=1.1 - hypothesis>=6.34.2 - - gcsfs>=2021.07.0 + - gcsfs>=2022.03.0 - jinja2>=3.0.0 - lxml>=4.6.3 - matplotlib>=3.6.1, <3.7.0 diff --git a/ci/deps/actions-38-minimum_versions.yaml b/ci/deps/actions-38-minimum_versions.yaml index b7ff711bad04b..f37cb2367accb 100644 --- a/ci/deps/actions-38-minimum_versions.yaml +++ b/ci/deps/actions-38-minimum_versions.yaml @@ -31,7 +31,7 @@ dependencies: - fsspec=2022.03.0 - html5lib=1.1 - hypothesis=6.34.2 - - gcsfs=2021.07.0 + - gcsfs=2022.03.0 - jinja2=3.0.0 - lxml=4.6.3 - matplotlib=3.6.1 diff --git a/ci/deps/actions-38.yaml b/ci/deps/actions-38.yaml index 35b4d1f66548b..4229afd3bc861 100644 --- a/ci/deps/actions-38.yaml +++ b/ci/deps/actions-38.yaml @@ -29,7 +29,7 @@ dependencies: - fsspec>=2022.03.0 - html5lib>=1.1 - hypothesis>=6.34.2 - - gcsfs>=2021.07.0 + - gcsfs>=2022.03.0 - jinja2>=3.0.0 - lxml>=4.6.3 - matplotlib>=3.6.1, <3.7.0 diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index 8c2e3803a4522..8030930f5a321 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -29,7 +29,7 @@ dependencies: - fsspec>=2022.03.0 - html5lib>=1.1 - hypothesis>=6.34.2 - - gcsfs>=2021.07.0 + - gcsfs>=2022.03.0 - jinja2>=3.0.0 - lxml>=4.6.3 - matplotlib>=3.6.1, <3.7.0 diff --git a/ci/deps/circle-38-arm64.yaml b/ci/deps/circle-38-arm64.yaml index c6c5dfd6f00ff..b896528be14a4 100644 --- a/ci/deps/circle-38-arm64.yaml +++ b/ci/deps/circle-38-arm64.yaml @@ -29,7 +29,7 @@ dependencies: - fsspec>=2022.03.0 - html5lib>=1.1 - hypothesis>=6.34.2 - - gcsfs>=2021.07.0 + - gcsfs>=2022.03.0 - jinja2>=3.0.0 - lxml>=4.6.3 - matplotlib>=3.6.1, <3.7.0 diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index e08c4c01f8a56..e6889a4a43679 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -471,7 +471,7 @@ Dependency Minimum Version pip extra Notes ========================= ================== =============== ============================================================= fsspec 2022.3.0 fss, gcp, aws Handling files aside from simple local and HTTP (required dependency of s3fs, gcsfs). -gcsfs 2021.7.0 gcp Google Cloud Storage access +gcsfs 2022.3.0 gcp Google Cloud Storage access pandas-gbq 0.15.0 gcp Google Big Query access s3fs 2021.08.0 aws Amazon S3 access ========================= ================== =============== ============================================================= diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 03e5a3ccc363a..9a5d33ae19865 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -77,7 +77,8 @@ Optional libraries below the lowest tested version may still work, but are not c +=================+=================+=========+ | fsspec | 2022.3.0 | X | +-----------------+-----------------+---------+ - +| gcsfs | 2022.3.0 | X | ++-----------------+-----------------+---------+ See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more. .. _whatsnew_210.api_breaking.other: diff --git a/environment.yml b/environment.yml index 608c022c4d404..39948c26a6f24 100644 --- a/environment.yml +++ b/environment.yml @@ -31,7 +31,7 @@ dependencies: - fsspec>=2022.03.0 - html5lib>=1.1 - hypothesis>=6.34.2 - - gcsfs>=2021.07.0 + - gcsfs>=2022.03.0 - ipython - jinja2>=3.0.0 - lxml>=4.6.3 diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 8ffc43a4dc9fb..4d6b51e167228 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -20,7 +20,7 @@ "fsspec": "2022.03.0", "html5lib": "1.1", "hypothesis": "6.34.2", - "gcsfs": "2021.07.0", + "gcsfs": "2022.03.0", "jinja2": "3.0.0", "lxml.etree": "4.6.3", "matplotlib": "3.6.1", diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 8a404bea824d8..264726d204ca7 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1227,6 +1227,7 @@ def test_bytes_file_name(self, engine): tm.assert_frame_equal(result, df) def test_filesystem_notimplemented(self): + pytest.importorskip("fastparquet") df = pd.DataFrame(data={"A": [0, 1], "B": [1, 0]}) with pytest.raises(NotImplementedError, match="filesystem is not implemented"): with tm.ensure_clean() as path: diff --git a/pyproject.toml b/pyproject.toml index d1fb291f41b3f..08a98431bb01b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,7 +61,7 @@ timezone = ['tzdata>=2022.1'] computation = ['scipy>=1.7.1', 'xarray>=0.21.0'] fss = ['fsspec>=2022.03.0'] aws = ['s3fs>=2021.08.0'] -gcp = ['gcsfs>=2021.07.0', 'pandas-gbq>=0.15.0'] +gcp = ['gcsfs>=2022.03.0', 'pandas-gbq>=0.15.0'] excel = ['odfpy>=1.4.1', 'openpyxl>=3.0.7', 'pyxlsb>=1.0.8', 'xlrd>=2.0.1', 'xlsxwriter>=1.4.3'] parquet = ['pyarrow>=7.0.0'] feather = ['pyarrow>=7.0.0'] @@ -85,7 +85,7 @@ all = ['beautifulsoup4>=4.9.3', 'brotlipy>=0.7.0', 'fastparquet>=0.6.3', 'fsspec>=2022.03.0', - 'gcsfs>=2021.07.0', + 'gcsfs>=2022.03.0', 'html5lib>=1.1', 'hypothesis>=6.34.2', 'jinja2>=3.0.0', diff --git a/requirements-dev.txt b/requirements-dev.txt index 8297b0254ef21..e8a787000985a 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -20,7 +20,7 @@ fastparquet>=0.6.3 fsspec>=2022.03.0 html5lib>=1.1 hypothesis>=6.34.2 -gcsfs>=2021.07.0 +gcsfs>=2022.03.0 ipython jinja2>=3.0.0 lxml>=4.6.3 From e1f8912cfa62aef248deeb31b5a26d8ca37157cb Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 24 Feb 2023 14:47:01 -0800 Subject: [PATCH 08/24] bump s3fs --- ci/deps/actions-310.yaml | 2 +- ci/deps/actions-311.yaml | 2 +- ci/deps/actions-38-downstream_compat.yaml | 2 +- ci/deps/actions-38-minimum_versions.yaml | 2 +- ci/deps/actions-38.yaml | 2 +- ci/deps/actions-39.yaml | 2 +- ci/deps/circle-38-arm64.yaml | 2 +- doc/source/getting_started/install.rst | 2 +- doc/source/whatsnew/v2.1.0.rst | 2 ++ environment.yml | 2 +- pandas/compat/_optional.py | 2 +- pyproject.toml | 4 ++-- requirements-dev.txt | 2 +- 13 files changed, 15 insertions(+), 13 deletions(-) diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index 189d962f4abc6..fc3e5b656e15b 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -45,7 +45,7 @@ dependencies: - pyreadstat>=1.1.2 - python-snappy>=0.6.0 - pyxlsb>=1.0.8 - - s3fs>=2021.08.0 + - s3fs>=2022.03.0 - scipy>=1.7.1 - sqlalchemy>=1.4.16 - tabulate>=0.8.9 diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index 6d5fe6af569a4..21742cc880bce 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -45,7 +45,7 @@ dependencies: - pyreadstat>=1.1.2 - python-snappy>=0.6.0 - pyxlsb>=1.0.8 - - s3fs>=2021.08.0 + - s3fs>=2022.03.0 - scipy>=1.7.1 - sqlalchemy>=1.4.16 - tabulate>=0.8.9 diff --git a/ci/deps/actions-38-downstream_compat.yaml b/ci/deps/actions-38-downstream_compat.yaml index 49a604483203f..fc286bd18b37c 100644 --- a/ci/deps/actions-38-downstream_compat.yaml +++ b/ci/deps/actions-38-downstream_compat.yaml @@ -45,7 +45,7 @@ dependencies: - pytables>=3.6.1 - python-snappy>=0.6.0 - pyxlsb>=1.0.8 - - s3fs>=2021.08.0 + - s3fs>=2022.03.0 - scipy>=1.7.1 - sqlalchemy>=1.4.16 - tabulate>=0.8.9 diff --git a/ci/deps/actions-38-minimum_versions.yaml b/ci/deps/actions-38-minimum_versions.yaml index f37cb2367accb..d2b16fc482ebf 100644 --- a/ci/deps/actions-38-minimum_versions.yaml +++ b/ci/deps/actions-38-minimum_versions.yaml @@ -48,7 +48,7 @@ dependencies: - pytables=3.6.1 - python-snappy=0.6.0 - pyxlsb=1.0.8 - - s3fs=2021.08.0 + - s3fs=2022.03.0 - scipy=1.7.1 - sqlalchemy=1.4.16 - tabulate=0.8.9 diff --git a/ci/deps/actions-38.yaml b/ci/deps/actions-38.yaml index 4229afd3bc861..841feb3ce75f5 100644 --- a/ci/deps/actions-38.yaml +++ b/ci/deps/actions-38.yaml @@ -45,7 +45,7 @@ dependencies: - pytables>=3.6.1 - python-snappy>=0.6.0 - pyxlsb>=1.0.8 - - s3fs>=2021.08.0 + - s3fs>=2022.03.0 - scipy>=1.7.1 - sqlalchemy>=1.4.16 - tabulate>=0.8.9 diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index 8030930f5a321..a633a36731036 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -45,7 +45,7 @@ dependencies: - pytables>=3.6.1 - python-snappy>=0.6.0 - pyxlsb>=1.0.8 - - s3fs>=2021.08.0 + - s3fs>=2022.03.0 - scipy>=1.7.1 - sqlalchemy>=1.4.16 - tabulate>=0.8.9 diff --git a/ci/deps/circle-38-arm64.yaml b/ci/deps/circle-38-arm64.yaml index b896528be14a4..67dcb89a6f082 100644 --- a/ci/deps/circle-38-arm64.yaml +++ b/ci/deps/circle-38-arm64.yaml @@ -46,7 +46,7 @@ dependencies: - pytables>=3.6.1 - python-snappy>=0.6.0 - pyxlsb>=1.0.8 - - s3fs>=2021.08.0 + - s3fs>=2022.03.0 - scipy>=1.7.1 - sqlalchemy>=1.4.16 - tabulate>=0.8.9 diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index e6889a4a43679..cb530299fdf30 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -473,7 +473,7 @@ fsspec 2022.3.0 fss, gcp, aws Handling files asid dependency of s3fs, gcsfs). gcsfs 2022.3.0 gcp Google Cloud Storage access pandas-gbq 0.15.0 gcp Google Big Query access -s3fs 2021.08.0 aws Amazon S3 access +s3fs 2022.3.0 aws Amazon S3 access ========================= ================== =============== ============================================================= Clipboard diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 9a5d33ae19865..64b1ecdc93473 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -79,6 +79,8 @@ Optional libraries below the lowest tested version may still work, but are not c +-----------------+-----------------+---------+ | gcsfs | 2022.3.0 | X | +-----------------+-----------------+---------+ +| s3fs | 2022.3.0 | X | ++-----------------+-----------------+---------+ See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more. .. _whatsnew_210.api_breaking.other: diff --git a/environment.yml b/environment.yml index 39948c26a6f24..78c64750416b2 100644 --- a/environment.yml +++ b/environment.yml @@ -48,7 +48,7 @@ dependencies: - pytables>=3.6.1 - python-snappy>=0.6.0 - pyxlsb>=1.0.8 - - s3fs>=2021.08.0 + - s3fs>=2022.03.0 - scipy>=1.7.1 - sqlalchemy>=1.4.16 - tabulate>=0.8.9 diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 4d6b51e167228..8e1fd742c9ae3 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -35,7 +35,7 @@ "pyreadstat": "1.1.2", "pytest": "7.0.0", "pyxlsb": "1.0.8", - "s3fs": "2021.08.0", + "s3fs": "2022.03.0", "scipy": "1.7.1", "snappy": "0.6.0", "sqlalchemy": "1.4.16", diff --git a/pyproject.toml b/pyproject.toml index 08a98431bb01b..868d456c87497 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,7 +60,7 @@ performance = ['bottleneck>=1.3.2', 'numba>=0.53.1', 'numexpr>=2.7.1'] timezone = ['tzdata>=2022.1'] computation = ['scipy>=1.7.1', 'xarray>=0.21.0'] fss = ['fsspec>=2022.03.0'] -aws = ['s3fs>=2021.08.0'] +aws = ['s3fs>=2022.03.0'] gcp = ['gcsfs>=2022.03.0', 'pandas-gbq>=0.15.0'] excel = ['odfpy>=1.4.1', 'openpyxl>=3.0.7', 'pyxlsb>=1.0.8', 'xlrd>=2.0.1', 'xlsxwriter>=1.4.3'] parquet = ['pyarrow>=7.0.0'] @@ -108,7 +108,7 @@ all = ['beautifulsoup4>=4.9.3', 'pyxlsb>=1.0.8', 'qtpy>=2.2.0', 'scipy>=1.7.1', - 's3fs>=2021.08.0', + 's3fs>=2022.03.0', 'SQLAlchemy>=1.4.16', 'tables>=3.6.1', 'tabulate>=0.8.9', diff --git a/requirements-dev.txt b/requirements-dev.txt index e8a787000985a..9d131e8a22b65 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -37,7 +37,7 @@ pyreadstat>=1.1.2 tables>=3.6.1 python-snappy>=0.6.0 pyxlsb>=1.0.8 -s3fs>=2021.08.0 +s3fs>=2022.03.0 scipy>=1.7.1 SQLAlchemy>=1.4.16 tabulate>=0.8.9 From c5166b5da005a6abcb0c441aad036afca5dbf527 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Sat, 25 Feb 2023 13:07:00 -0800 Subject: [PATCH 09/24] Fix doc issues --- doc/source/whatsnew/v2.1.0.rst | 1 + pandas/io/parquet.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 8086f27d5de58..d3809fb24478b 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -81,6 +81,7 @@ Optional libraries below the lowest tested version may still work, but are not c +-----------------+-----------------+---------+ | s3fs | 2022.3.0 | X | +-----------------+-----------------+---------+ + See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more. .. _whatsnew_210.api_breaking.other: diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 0cadfa60c32cf..fe81ff907a6f8 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -457,7 +457,7 @@ def to_parquet( .. versionadded:: 1.2.0 - filesystem: fsspec or pyarrow filesystem, default None + filesystem : fsspec or pyarrow filesystem, default None Filesystem object to use when reading the parquet file. Only implemented for ``engine-"pyarrow"``. @@ -558,7 +558,7 @@ def read_parquet( .. versionadded:: 2.0.0 - filesystem: fsspec or pyarrow filesystem, default None + filesystem : fsspec or pyarrow filesystem, default None Filesystem object to use when reading the parquet file. Only implemented for ``engine-"pyarrow"``. From 7ec7d75b6cbd5adb8081b7c7525e171eb1b0e012 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Sat, 25 Feb 2023 13:19:31 -0800 Subject: [PATCH 10/24] Try without fsspec wrapper --- pandas/io/parquet.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index fe81ff907a6f8..48eb74208c572 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -92,9 +92,7 @@ def _get_path_or_handle( pa_fs = import_optional_dependency("pyarrow.fs") try: - fs_arrow = import_optional_dependency("fsspec.implementations.arrow") fs, path_or_handle = pa_fs.FileSystem.from_uri(path) - fs = fs_arrow.ArrowFSWrapper(fs) except (TypeError, pa.ArrowInvalid): pass if fs is None: From c1161d3505a67019efcbc46da3c7d87c3534e439 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 9 Mar 2023 16:29:06 -0800 Subject: [PATCH 11/24] Revert "Try without fsspec wrapper" This reverts commit 7ec7d75b6cbd5adb8081b7c7525e171eb1b0e012. --- pandas/io/parquet.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 09e9fc5880c66..8ca663123e348 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -95,7 +95,9 @@ def _get_path_or_handle( pa_fs = import_optional_dependency("pyarrow.fs") try: + fs_arrow = import_optional_dependency("fsspec.implementations.arrow") fs, path_or_handle = pa_fs.FileSystem.from_uri(path) + fs = fs_arrow.ArrowFSWrapper(fs) except (TypeError, pa.ArrowInvalid): pass if fs is None: From ef7095aa45ff9eaad31f37026384b3b204821772 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 9 Mar 2023 16:49:29 -0800 Subject: [PATCH 12/24] Returns a tuple --- pandas/tests/io/test_gcs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index 882314e764654..80135b407e262 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -51,7 +51,7 @@ class MockFileSystem(pa_fs.FileSystem): @staticmethod def from_uri(path): to_local = path.replace("gs", "file") - return pa_fs.LocalFileSystem.from_uri(to_local), to_local + return pa_fs.LocalFileSystem.from_uri(to_local)[0], to_local with monkeypatch.context() as m: m.setattr(pa_fs, "FileSystem", MockFileSystem) From 08f3a30340f866b9115e5e4c0fbef8a8abb86d5b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 10 Mar 2023 17:03:39 -0800 Subject: [PATCH 13/24] Don't wrap in fsspec, undo deps bump --- ci/deps/actions-310.yaml | 6 +++--- ci/deps/actions-311.yaml | 6 +++--- ci/deps/actions-38-downstream_compat.yaml | 6 +++--- ci/deps/actions-38-minimum_versions.yaml | 6 +++--- ci/deps/actions-38.yaml | 6 +++--- ci/deps/actions-39.yaml | 6 +++--- ci/deps/circle-38-arm64.yaml | 6 +++--- environment.yml | 6 +++--- pandas/compat/_optional.py | 6 +++--- pandas/io/parquet.py | 2 -- pyproject.toml | 12 ++++++------ requirements-dev.txt | 6 +++--- 12 files changed, 36 insertions(+), 38 deletions(-) diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index 52cb92957237c..53a10ed10e4ff 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -26,10 +26,10 @@ dependencies: - bottleneck>=1.3.2 - brotlipy>=0.7.0 - fastparquet>=0.6.3 - - fsspec>=2022.03.0 + - fsspec>=2021.07.0 - html5lib>=1.1 - hypothesis>=6.34.2 - - gcsfs>=2022.03.0 + - gcsfs>=2021.07.0 - jinja2>=3.0.0 - lxml>=4.6.3 - matplotlib>=3.6.1, <3.7.0 @@ -45,7 +45,7 @@ dependencies: - pyreadstat>=1.1.2 - python-snappy>=0.6.0 - pyxlsb>=1.0.8 - - s3fs>=2022.03.0 + - s3fs>=2021.08.0 - scipy>=1.7.1 - sqlalchemy>=1.4.16 - tabulate>=0.8.9 diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index 8142f79ad80a4..9b4652db0446c 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -26,10 +26,10 @@ dependencies: - bottleneck>=1.3.2 - brotlipy>=0.7.0 - fastparquet>=0.6.3 - - fsspec>=2022.03.0 + - fsspec>=2021.07.0 - html5lib>=1.1 - hypothesis>=6.34.2 - - gcsfs>=2022.03.0 + - gcsfs>=2021.07.0 - jinja2>=3.0.0 - lxml>=4.6.3 - matplotlib>=3.6.1, <3.7.0 @@ -45,7 +45,7 @@ dependencies: - pyreadstat>=1.1.2 - python-snappy>=0.6.0 - pyxlsb>=1.0.8 - - s3fs>=2022.03.0 + - s3fs>=2021.08.0 - scipy>=1.7.1 - sqlalchemy>=1.4.16 - tabulate>=0.8.9 diff --git a/ci/deps/actions-38-downstream_compat.yaml b/ci/deps/actions-38-downstream_compat.yaml index 34fbe13c8b180..85bc05c4bf65c 100644 --- a/ci/deps/actions-38-downstream_compat.yaml +++ b/ci/deps/actions-38-downstream_compat.yaml @@ -27,10 +27,10 @@ dependencies: - brotlipy>=0.7.0 - bottleneck>=1.3.2 - fastparquet>=0.6.3 - - fsspec>=2022.03.0 + - fsspec>=2021.07.0 - html5lib>=1.1 - hypothesis>=6.34.2 - - gcsfs>=2022.03.0 + - gcsfs>=2021.07.0 - jinja2>=3.0.0 - lxml>=4.6.3 - matplotlib>=3.6.1, <3.7.0 @@ -45,7 +45,7 @@ dependencies: - pytables>=3.6.1 - python-snappy>=0.6.0 - pyxlsb>=1.0.8 - - s3fs>=2022.03.0 + - s3fs>=2021.08.0 - scipy>=1.7.1 - sqlalchemy>=1.4.16 - tabulate>=0.8.9 diff --git a/ci/deps/actions-38-minimum_versions.yaml b/ci/deps/actions-38-minimum_versions.yaml index 8ed18e731d046..e5c32dad04c01 100644 --- a/ci/deps/actions-38-minimum_versions.yaml +++ b/ci/deps/actions-38-minimum_versions.yaml @@ -28,10 +28,10 @@ dependencies: - bottleneck=1.3.2 - brotlipy=0.7.0 - fastparquet=0.6.3 - - fsspec=2022.03.0 + - fsspec=2021.07.0 - html5lib=1.1 - hypothesis=6.34.2 - - gcsfs=2022.03.0 + - gcsfs=2021.07.0 - jinja2=3.0.0 - lxml=4.6.3 - matplotlib=3.6.1 @@ -48,7 +48,7 @@ dependencies: - pytables=3.6.1 - python-snappy=0.6.0 - pyxlsb=1.0.8 - - s3fs=2022.03.0 + - s3fs=2021.08.0 - scipy=1.7.1 - sqlalchemy=1.4.16 - tabulate=0.8.9 diff --git a/ci/deps/actions-38.yaml b/ci/deps/actions-38.yaml index 6d3ab60082d0c..c65f194830417 100644 --- a/ci/deps/actions-38.yaml +++ b/ci/deps/actions-38.yaml @@ -26,10 +26,10 @@ dependencies: - bottleneck>=1.3.2 - brotlipy>=0.7.0 - fastparquet>=0.6.3 - - fsspec>=2022.03.0 + - fsspec>=2021.07.0 - html5lib>=1.1 - hypothesis>=6.34.2 - - gcsfs>=2022.03.0 + - gcsfs>=2021.07.0 - jinja2>=3.0.0 - lxml>=4.6.3 - matplotlib>=3.6.1, <3.7.0 @@ -45,7 +45,7 @@ dependencies: - pytables>=3.6.1 - python-snappy>=0.6.0 - pyxlsb>=1.0.8 - - s3fs>=2022.03.0 + - s3fs>=2021.08.0 - scipy>=1.7.1 - sqlalchemy>=1.4.16 - tabulate>=0.8.9 diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index 4c7263ad61616..0a5039126e49f 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -26,10 +26,10 @@ dependencies: - bottleneck>=1.3.2 - brotlipy>=0.7.0 - fastparquet>=0.6.3 - - fsspec>=2022.03.0 + - fsspec>=2021.07.0 - html5lib>=1.1 - hypothesis>=6.34.2 - - gcsfs>=2022.03.0 + - gcsfs>=2021.07.0 - jinja2>=3.0.0 - lxml>=4.6.3 - matplotlib>=3.6.1, <3.7.0 @@ -45,7 +45,7 @@ dependencies: - pytables>=3.6.1 - python-snappy>=0.6.0 - pyxlsb>=1.0.8 - - s3fs>=2022.03.0 + - s3fs>=2021.08.0 - scipy>=1.7.1 - sqlalchemy>=1.4.16 - tabulate>=0.8.9 diff --git a/ci/deps/circle-38-arm64.yaml b/ci/deps/circle-38-arm64.yaml index 8ee0fbf4b0a11..c3d89e735ae37 100644 --- a/ci/deps/circle-38-arm64.yaml +++ b/ci/deps/circle-38-arm64.yaml @@ -26,10 +26,10 @@ dependencies: - bottleneck>=1.3.2 - brotlipy>=0.7.0 - fastparquet>=0.6.3 - - fsspec>=2022.03.0 + - fsspec>=2021.07.0 - html5lib>=1.1 - hypothesis>=6.34.2 - - gcsfs>=2022.03.0 + - gcsfs>=2021.07.0 - jinja2>=3.0.0 - lxml>=4.6.3 - matplotlib>=3.6.1, <3.7.0 @@ -46,7 +46,7 @@ dependencies: - pytables>=3.6.1 - python-snappy>=0.6.0 - pyxlsb>=1.0.8 - - s3fs>=2022.03.0 + - s3fs>=2021.08.0 - scipy>=1.7.1 - sqlalchemy>=1.4.16 - tabulate>=0.8.9 diff --git a/environment.yml b/environment.yml index 3df63ae5f849d..2a36247b23eb3 100644 --- a/environment.yml +++ b/environment.yml @@ -28,10 +28,10 @@ dependencies: - brotlipy>=0.7.0 - bottleneck>=1.3.2 - fastparquet>=0.6.3 - - fsspec>=2022.03.0 + - fsspec>=2021.07.0 - html5lib>=1.1 - hypothesis>=6.34.2 - - gcsfs>=2022.03.0 + - gcsfs>=2021.07.0 - ipython - jinja2>=3.0.0 - lxml>=4.6.3 @@ -48,7 +48,7 @@ dependencies: - pytables>=3.6.1 - python-snappy>=0.6.0 - pyxlsb>=1.0.8 - - s3fs>=2022.03.0 + - s3fs>=2021.08.0 - scipy>=1.7.1 - sqlalchemy>=1.4.16 - tabulate>=0.8.9 diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 7b92a11624981..bcfd4ea790e64 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -20,10 +20,10 @@ "bottleneck": "1.3.2", "brotli": "0.7.0", "fastparquet": "0.6.3", - "fsspec": "2022.03.0", + "fsspec": "2021.07.0", "html5lib": "1.1", "hypothesis": "6.34.2", - "gcsfs": "2022.03.0", + "gcsfs": "2021.07.0", "jinja2": "3.0.0", "lxml.etree": "4.6.3", "matplotlib": "3.6.1", @@ -38,7 +38,7 @@ "pyreadstat": "1.1.2", "pytest": "7.0.0", "pyxlsb": "1.0.8", - "s3fs": "2022.03.0", + "s3fs": "2021.08.0", "scipy": "1.7.1", "snappy": "0.6.0", "sqlalchemy": "1.4.16", diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 8ca663123e348..09e9fc5880c66 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -95,9 +95,7 @@ def _get_path_or_handle( pa_fs = import_optional_dependency("pyarrow.fs") try: - fs_arrow = import_optional_dependency("fsspec.implementations.arrow") fs, path_or_handle = pa_fs.FileSystem.from_uri(path) - fs = fs_arrow.ArrowFSWrapper(fs) except (TypeError, pa.ArrowInvalid): pass if fs is None: diff --git a/pyproject.toml b/pyproject.toml index cf0b552a8c9ab..c5c9cd702a380 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,9 +59,9 @@ test = ['hypothesis>=6.34.2', 'pytest>=7.0.0', 'pytest-xdist>=2.2.0', 'pytest-as performance = ['bottleneck>=1.3.2', 'numba>=0.53.1', 'numexpr>=2.7.1'] timezone = ['tzdata>=2022.1'] computation = ['scipy>=1.7.1', 'xarray>=0.21.0'] -fss = ['fsspec>=2022.03.0'] -aws = ['s3fs>=2022.03.0'] -gcp = ['gcsfs>=2022.03.0', 'pandas-gbq>=0.15.0'] +fss = ['fsspec>=2021.07.0'] +aws = ['s3fs>=2021.08.0'] +gcp = ['gcsfs>=2021.07.0', 'pandas-gbq>=0.15.0'] excel = ['odfpy>=1.4.1', 'openpyxl>=3.0.7', 'pyxlsb>=1.0.8', 'xlrd>=2.0.1', 'xlsxwriter>=1.4.3'] parquet = ['pyarrow>=7.0.0'] feather = ['pyarrow>=7.0.0'] @@ -84,8 +84,8 @@ all = ['beautifulsoup4>=4.9.3', 'bottleneck>=1.3.2', 'brotlipy>=0.7.0', 'fastparquet>=0.6.3', - 'fsspec>=2022.03.0', - 'gcsfs>=2022.03.0', + 'fsspec>=2021.07.0', + 'gcsfs>=2021.07.0', 'html5lib>=1.1', 'hypothesis>=6.34.2', 'jinja2>=3.0.0', @@ -108,7 +108,7 @@ all = ['beautifulsoup4>=4.9.3', 'pyxlsb>=1.0.8', 'qtpy>=2.2.0', 'scipy>=1.7.1', - 's3fs>=2022.03.0', + 's3fs>=2021.08.0', 'SQLAlchemy>=1.4.16', 'tables>=3.6.1', 'tabulate>=0.8.9', diff --git a/requirements-dev.txt b/requirements-dev.txt index 105b2765f5547..74a55c289fbe0 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -17,10 +17,10 @@ blosc brotlipy>=0.7.0 bottleneck>=1.3.2 fastparquet>=0.6.3 -fsspec>=2022.03.0 +fsspec>=2021.07.0 html5lib>=1.1 hypothesis>=6.34.2 -gcsfs>=2022.03.0 +gcsfs>=2021.07.0 ipython jinja2>=3.0.0 lxml>=4.6.3 @@ -37,7 +37,7 @@ pyreadstat>=1.1.2 tables>=3.6.1 python-snappy>=0.6.0 pyxlsb>=1.0.8 -s3fs>=2022.03.0 +s3fs>=2021.08.0 scipy>=1.7.1 SQLAlchemy>=1.4.16 tabulate>=0.8.9 From 7703f0fdd4b86d18b9fb50396bc6f87ea8eb0859 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 10 Mar 2023 17:04:56 -0800 Subject: [PATCH 14/24] Fix whatsnew --- doc/source/whatsnew/v2.1.0.rst | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 058110caca72a..ea208c49fdd48 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -77,11 +77,7 @@ Optional libraries below the lowest tested version may still work, but are not c +-----------------+-----------------+---------+ | Package | Minimum Version | Changed | +=================+=================+=========+ -| fsspec | 2022.3.0 | X | -+-----------------+-----------------+---------+ -| gcsfs | 2022.3.0 | X | -+-----------------+-----------------+---------+ -| s3fs | 2022.3.0 | X | +| | | X | +-----------------+-----------------+---------+ See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more. @@ -114,8 +110,8 @@ Performance improvements - Performance improvement in :meth:`DataFrame.clip` and :meth:`Series.clip` (:issue:`51472`) - Performance improvement in :meth:`DataFrame.first_valid_index` and :meth:`DataFrame.last_valid_index` for extension array dtypes (:issue:`51549`) - Performance improvement in :meth:`DataFrame.where` when ``cond`` is backed by an extension dtype (:issue:`51574`) -- Performance improvement in :meth:`read_parquet` and :meth:`DataFrame.to_parquet` when reading a remote file with ``engine="pyarrow"`` (:issue:`51609`) -- Performance improvement in :meth:`read_orc` when reading a remote URI file path. (:issue:`51609`) +- Performance improvement in :func:`read_orc` when reading a remote URI file path. (:issue:`51609`) +- Performance improvement in :func:`read_parquet` and :meth:`DataFrame.to_parquet` when reading a remote file with ``engine="pyarrow"`` (:issue:`51609`) - Performance improvement in :meth:`MultiIndex.sortlevel` when ``ascending`` is a list (:issue:`51612`) - Performance improvement in :meth:`~arrays.ArrowExtensionArray.isna` when array has zero nulls or is all nulls (:issue:`51630`) - Performance improvement when parsing strings to ``boolean[pyarrow]`` dtype (:issue:`51730`) From e99d234af0ed45b559c6a5cbe85db09c18a67253 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 10 Mar 2023 17:45:34 -0800 Subject: [PATCH 15/24] Add validations for filesystem --- pandas/io/parquet.py | 23 +++++++++++++-- pandas/tests/io/test_parquet.py | 50 +++++++++++++++++++++++++++++++-- 2 files changed, 69 insertions(+), 4 deletions(-) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 09e9fc5880c66..878cb7a79de1f 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -88,6 +88,25 @@ def _get_path_or_handle( ]: """File handling for PyArrow.""" path_or_handle = stringify_path(path) + if fs is not None: + pa_fs = import_optional_dependency("pyarrow.fs", errors="ignore") + fsspec = import_optional_dependency("fsspec", errors="ignore") + if pa_fs is None and fsspec is None: + raise ValueError( + f"filesystem must be a pyarrow or fsspec FileSystem, " + f"not a {type(fs).__name__}" + ) + elif (pa_fs is not None and not isinstance(fs, pa_fs.FileSystem)) and ( + fsspec is not None and not isinstance(fs, fsspec.spec.AbstractFileSystem) + ): + raise ValueError( + f"filesystem must be a pyarrow or fsspec FileSystem, " + f"not a {type(fs).__name__}" + ) + elif pa_fs is not None and isinstance(fs, pa_fs.FileSystem) and storage_options: + raise NotImplementedError( + "storage_options not supported with a pyarrow Filesystem." + ) if is_fsspec_url(path_or_handle) and fs is None: fsspec = import_optional_dependency("fsspec") if storage_options is None: @@ -456,7 +475,7 @@ def to_parquet( filesystem : fsspec or pyarrow filesystem, default None Filesystem object to use when reading the parquet file. Only implemented - for ``engine-"pyarrow"``. + for ``engine="pyarrow"``. .. versionadded:: 2.1.0 @@ -557,7 +576,7 @@ def read_parquet( filesystem : fsspec or pyarrow filesystem, default None Filesystem object to use when reading the parquet file. Only implemented - for ``engine-"pyarrow"``. + for ``engine="pyarrow"``. .. versionadded:: 2.1.0 diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index de73216555151..1071401ff8a7e 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1236,8 +1236,10 @@ def test_bytes_file_name(self, engine): def test_filesystem_notimplemented(self): pytest.importorskip("fastparquet") df = pd.DataFrame(data={"A": [0, 1], "B": [1, 0]}) - with pytest.raises(NotImplementedError, match="filesystem is not implemented"): - with tm.ensure_clean() as path: + with tm.ensure_clean() as path: + with pytest.raises( + NotImplementedError, match="filesystem is not implemented" + ): df.to_parquet(path, engine="fastparquet", filesystem="foo") with tm.ensure_clean() as path: @@ -1246,3 +1248,47 @@ def test_filesystem_notimplemented(self): NotImplementedError, match="filesystem is not implemented" ): read_parquet(path, engine="fastparquet", filesystem="foo") + + def test_invalid_filesystem(self): + pytest.importorskip("pyarrow") + df = pd.DataFrame(data={"A": [0, 1], "B": [1, 0]}) + with tm.ensure_clean() as path: + with pytest.raises( + ValueError, match="filesystem must be a pyarrow or fsspec FileSystem" + ): + df.to_parquet(path, engine="pyarrow", filesystem="foo") + + with tm.ensure_clean() as path: + pathlib.Path(path).write_bytes(b"foo") + with pytest.raises( + ValueError, match="filesystem must be a pyarrow or fsspec FileSystem" + ): + read_parquet(path, engine="pyarrow", filesystem="foo") + + def test_unsupported_pa_filesystem_storage_options(self): + pa_fs = pytest.importorskip("pyarrow.fs") + df = pd.DataFrame(data={"A": [0, 1], "B": [1, 0]}) + with tm.ensure_clean() as path: + with pytest.raises( + NotImplementedError, + match="storage_options not supported with a pyarrow Filesystem.", + ): + df.to_parquet( + path, + engine="pyarrow", + filesystem=pa_fs.LocalFileSystem(), + storage_options={"foo": "bar"}, + ) + + with tm.ensure_clean() as path: + pathlib.Path(path).write_bytes(b"foo") + with pytest.raises( + NotImplementedError, + match="storage_options not supported with a pyarrow Filesystem.", + ): + read_parquet( + path, + engine="pyarrow", + filesystem=pa_fs.LocalFileSystem(), + storage_options={"foo": "bar"}, + ) From f4ef416910663a08b1f4e490e97c712c68a15b0c Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 10 Mar 2023 17:51:10 -0800 Subject: [PATCH 16/24] Validate that mock filesystem is used --- pandas/tests/io/test_gcs.py | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index 80135b407e262..e474c02f89088 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -43,24 +43,9 @@ def ls(self, path, **kwargs): return gcs_buffer -@pytest.fixture -def mock_pa_filesystem(monkeypatch): - pa_fs = pytest.importorskip("pyarrow.fs") - - class MockFileSystem(pa_fs.FileSystem): - @staticmethod - def from_uri(path): - to_local = path.replace("gs", "file") - return pa_fs.LocalFileSystem.from_uri(to_local)[0], to_local - - with monkeypatch.context() as m: - m.setattr(pa_fs, "FileSystem", MockFileSystem) - yield - - @td.skip_if_no("gcsfs") @pytest.mark.parametrize("format", ["csv", "json", "parquet", "excel", "markdown"]) -def test_to_read_gcs(gcs_buffer, format, mock_pa_filesystem): +def test_to_read_gcs(gcs_buffer, format, monkeypatch, capsys): """ Test that many to/read functions support GCS. @@ -89,9 +74,21 @@ def test_to_read_gcs(gcs_buffer, format, mock_pa_filesystem): df1.to_json(path) df2 = read_json(path, convert_dates=["dt"]) elif format == "parquet": - pytest.importorskip("pyarrow") - df1.to_parquet(path) - df2 = read_parquet(path) + pa_fs = pytest.importorskip("pyarrow.fs") + + class MockFileSystem(pa_fs.FileSystem): + @staticmethod + def from_uri(path): + print("Using pyarrow filesystem") + to_local = path.replace("gs", "file") + return pa_fs.LocalFileSystem.from_uri(to_local)[0], to_local + + with monkeypatch.context() as m: + m.setattr(pa_fs, "FileSystem", MockFileSystem) + df1.to_parquet(path) + df2 = read_parquet(path) + captured = capsys.readouterr() + assert captured.out == "Using pyarrow filesystem\nUsing pyarrow filesystem\n" elif format == "markdown": pytest.importorskip("tabulate") df1.to_markdown(path) From 3f0e751ef7ef8a31079d8578282f6f11d422c294 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 10 Mar 2023 17:53:28 -0800 Subject: [PATCH 17/24] Undo install.rst --- doc/source/getting_started/install.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index cb530299fdf30..28a57720a89a5 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -469,11 +469,11 @@ Installable with ``pip install "pandas[fss, aws, gcp]"`` ========================= ================== =============== ============================================================= Dependency Minimum Version pip extra Notes ========================= ================== =============== ============================================================= -fsspec 2022.3.0 fss, gcp, aws Handling files aside from simple local and HTTP (required +fsspec 2021.7.0 fss, gcp, aws Handling files aside from simple local and HTTP (required dependency of s3fs, gcsfs). -gcsfs 2022.3.0 gcp Google Cloud Storage access +gcsfs 2021.7.0 gcp Google Cloud Storage access pandas-gbq 0.15.0 gcp Google Big Query access -s3fs 2022.3.0 aws Amazon S3 access +s3fs 2021.08.0 aws Amazon S3 access ========================= ================== =============== ============================================================= Clipboard From 970a08f57c03a3a860a0fa00fc0c759a22224762 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 10 Mar 2023 21:41:20 -0800 Subject: [PATCH 18/24] Try this --- pandas/tests/io/test_gcs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index e474c02f89088..c9c9eb9783eef 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -81,7 +81,7 @@ class MockFileSystem(pa_fs.FileSystem): def from_uri(path): print("Using pyarrow filesystem") to_local = path.replace("gs", "file") - return pa_fs.LocalFileSystem.from_uri(to_local)[0], to_local + return pa_fs.LocalFileSystem.from_uri(to_local) with monkeypatch.context() as m: m.setattr(pa_fs, "FileSystem", MockFileSystem) From 29bce3b96a68154d9132b21e7c9ba09affb95175 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Sat, 11 Mar 2023 13:19:27 -0800 Subject: [PATCH 19/24] Make global again? --- pandas/tests/io/test_gcs.py | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index c9c9eb9783eef..0cb8f0bfb5041 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -43,9 +43,25 @@ def ls(self, path, **kwargs): return gcs_buffer +@pytest.fixture +def mock_pa_filesystem(monkeypatch): + pa_fs = pytest.importorskip("pyarrow.fs") + + class MockFileSystem(pa_fs.FileSystem): + @staticmethod + def from_uri(path): + print("Using pyarrow filesystem") + to_local = path.replace("gs", "file") + return pa_fs.LocalFileSystem.from_uri(to_local) + + with monkeypatch.context() as m: + m.setattr(pa_fs, "FileSystem", MockFileSystem) + yield + + @td.skip_if_no("gcsfs") @pytest.mark.parametrize("format", ["csv", "json", "parquet", "excel", "markdown"]) -def test_to_read_gcs(gcs_buffer, format, monkeypatch, capsys): +def test_to_read_gcs(gcs_buffer, format, mock_pa_filesystem, capsys): """ Test that many to/read functions support GCS. @@ -74,19 +90,9 @@ def test_to_read_gcs(gcs_buffer, format, monkeypatch, capsys): df1.to_json(path) df2 = read_json(path, convert_dates=["dt"]) elif format == "parquet": - pa_fs = pytest.importorskip("pyarrow.fs") - - class MockFileSystem(pa_fs.FileSystem): - @staticmethod - def from_uri(path): - print("Using pyarrow filesystem") - to_local = path.replace("gs", "file") - return pa_fs.LocalFileSystem.from_uri(to_local) - - with monkeypatch.context() as m: - m.setattr(pa_fs, "FileSystem", MockFileSystem) - df1.to_parquet(path) - df2 = read_parquet(path) + pytest.importorskip("pyarrow") + df1.to_parquet(path) + df2 = read_parquet(path) captured = capsys.readouterr() assert captured.out == "Using pyarrow filesystem\nUsing pyarrow filesystem\n" elif format == "markdown": From 0017ae5a86f34747dc92dbb248aa3a81c60f4ce5 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Sun, 12 Mar 2023 21:26:20 -0700 Subject: [PATCH 20/24] Try this --- pandas/tests/io/test_gcs.py | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index 0cb8f0bfb5041..64b6c13d27dd6 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -43,25 +43,9 @@ def ls(self, path, **kwargs): return gcs_buffer -@pytest.fixture -def mock_pa_filesystem(monkeypatch): - pa_fs = pytest.importorskip("pyarrow.fs") - - class MockFileSystem(pa_fs.FileSystem): - @staticmethod - def from_uri(path): - print("Using pyarrow filesystem") - to_local = path.replace("gs", "file") - return pa_fs.LocalFileSystem.from_uri(to_local) - - with monkeypatch.context() as m: - m.setattr(pa_fs, "FileSystem", MockFileSystem) - yield - - @td.skip_if_no("gcsfs") @pytest.mark.parametrize("format", ["csv", "json", "parquet", "excel", "markdown"]) -def test_to_read_gcs(gcs_buffer, format, mock_pa_filesystem, capsys): +def test_to_read_gcs(gcs_buffer, format, monkeypatch, capsys): """ Test that many to/read functions support GCS. @@ -91,8 +75,19 @@ def test_to_read_gcs(gcs_buffer, format, mock_pa_filesystem, capsys): df2 = read_json(path, convert_dates=["dt"]) elif format == "parquet": pytest.importorskip("pyarrow") - df1.to_parquet(path) - df2 = read_parquet(path) + pa_fs = pytest.importorskip("pyarrow.fs") + + class MockFileSystem(pa_fs.FileSystem): + @staticmethod + def from_uri(path): + print("Using pyarrow filesystem") + to_local = path.replace("gs", "file") + return pa_fs.LocalFileSystem.from_uri(to_local)[0], to_local + + with monkeypatch.context() as m: + m.setattr(pa_fs, "FileSystem", MockFileSystem) + df1.to_parquet(path) + df2 = read_parquet(path) captured = capsys.readouterr() assert captured.out == "Using pyarrow filesystem\nUsing pyarrow filesystem\n" elif format == "markdown": From 31593f05f2b38e68f0cbe38ec38b47eb7bf7053d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 13 Mar 2023 11:25:35 -0700 Subject: [PATCH 21/24] Address review --- pandas/io/parquet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index db1ebee6ad1ba..05c599bad386e 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -106,10 +106,9 @@ def _get_path_or_handle( ) elif pa_fs is not None and isinstance(fs, pa_fs.FileSystem) and storage_options: raise NotImplementedError( - "storage_options not supported with a pyarrow Filesystem." + "storage_options not supported with a pyarrow FileSystem." ) if is_fsspec_url(path_or_handle) and fs is None: - fsspec = import_optional_dependency("fsspec") if storage_options is None: pa = import_optional_dependency("pyarrow") pa_fs = import_optional_dependency("pyarrow.fs") @@ -119,6 +118,7 @@ def _get_path_or_handle( except (TypeError, pa.ArrowInvalid): pass if fs is None: + fsspec = import_optional_dependency("fsspec") fs, path_or_handle = fsspec.core.url_to_fs( path_or_handle, **(storage_options or {}) ) From 646dbad4ab014a6cca8f62179cd456d807950b3f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 13 Mar 2023 12:48:48 -0700 Subject: [PATCH 22/24] Fix test --- pandas/tests/io/test_parquet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index fe11a33e0aa2a..f6ed90efee258 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1249,7 +1249,7 @@ def test_unsupported_pa_filesystem_storage_options(self): with tm.ensure_clean() as path: with pytest.raises( NotImplementedError, - match="storage_options not supported with a pyarrow Filesystem.", + match="storage_options not supported with a pyarrow FileSystem.", ): df.to_parquet( path, @@ -1262,7 +1262,7 @@ def test_unsupported_pa_filesystem_storage_options(self): pathlib.Path(path).write_bytes(b"foo") with pytest.raises( NotImplementedError, - match="storage_options not supported with a pyarrow Filesystem.", + match="storage_options not supported with a pyarrow FileSystem.", ): read_parquet( path, From 96ef2fb1a5e4cd266f7e903cbb0431ec7f26e8ea Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 13 Mar 2023 15:56:36 -0700 Subject: [PATCH 23/24] Use localfilesystem correctly --- pandas/tests/io/test_gcs.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index 64b6c13d27dd6..5f058a2247725 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -1,5 +1,6 @@ from io import BytesIO import os +import pathlib import tarfile import zipfile @@ -81,8 +82,8 @@ class MockFileSystem(pa_fs.FileSystem): @staticmethod def from_uri(path): print("Using pyarrow filesystem") - to_local = path.replace("gs", "file") - return pa_fs.LocalFileSystem.from_uri(to_local)[0], to_local + to_local = pathlib.Path(path.replace("gs://", "")).resolve().as_uri() + return pa_fs.LocalFileSystem(to_local) with monkeypatch.context() as m: m.setattr(pa_fs, "FileSystem", MockFileSystem) From ae77a2615d8122edd79b03e6bbc13e25155f1cac Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 13 Mar 2023 19:51:33 -0700 Subject: [PATCH 24/24] use absolute --- pandas/tests/io/test_gcs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index 5f058a2247725..b65a19d766976 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -82,7 +82,7 @@ class MockFileSystem(pa_fs.FileSystem): @staticmethod def from_uri(path): print("Using pyarrow filesystem") - to_local = pathlib.Path(path.replace("gs://", "")).resolve().as_uri() + to_local = pathlib.Path(path.replace("gs://", "")).absolute().as_uri() return pa_fs.LocalFileSystem(to_local) with monkeypatch.context() as m: