From 7e31a0474dea2a4f58561463979c2e46eece2b2b Mon Sep 17 00:00:00 2001 From: Brett Naul Date: Tue, 17 Apr 2018 17:37:24 -0700 Subject: [PATCH 1/3] Google Cloud Storage support using gcsfs --- ci/appveyor-27.yaml | 1 + ci/check_imports.py | 1 + ci/circle-36-locale_slow.yaml | 1 + ci/requirements-optional-conda.txt | 1 + ci/requirements-optional-pip.txt | 1 + ci/travis-27.yaml | 1 + ci/travis-36.yaml | 1 + doc/source/install.rst | 1 + doc/source/whatsnew/v0.24.0.txt | 2 +- pandas/io/common.py | 19 ++++++++-- pandas/io/excel.py | 2 +- pandas/io/gcs.py | 16 +++++++++ pandas/io/json/json.py | 6 ++-- pandas/tests/io/test_gcs.py | 57 ++++++++++++++++++++++++++++++ pandas/util/_print_versions.py | 1 + 15 files changed, 104 insertions(+), 7 deletions(-) create mode 100644 pandas/io/gcs.py create mode 100644 pandas/tests/io/test_gcs.py diff --git a/ci/appveyor-27.yaml b/ci/appveyor-27.yaml index cfc6a796bd77e..10511ac0e00ca 100644 --- a/ci/appveyor-27.yaml +++ b/ci/appveyor-27.yaml @@ -6,6 +6,7 @@ dependencies: - beautifulsoup4 - bottleneck - dateutil + - gcsfs - html5lib - jinja2=2.8 - lxml diff --git a/ci/check_imports.py b/ci/check_imports.py index d6f24ebcc4d3e..3f09290f8c375 100644 --- a/ci/check_imports.py +++ b/ci/check_imports.py @@ -5,6 +5,7 @@ blacklist = { 'bs4', + 'gcsfs', 'html5lib', 'ipython', 'jinja2' diff --git a/ci/circle-36-locale_slow.yaml b/ci/circle-36-locale_slow.yaml index cc852c1e2aeeb..f44e98e1ee09d 100644 --- a/ci/circle-36-locale_slow.yaml +++ b/ci/circle-36-locale_slow.yaml @@ -5,6 +5,7 @@ channels: dependencies: - beautifulsoup4 - cython + - gcsfs - html5lib - ipython - jinja2 diff --git a/ci/requirements-optional-conda.txt b/ci/requirements-optional-conda.txt index e8cfcdf80f2e8..9e4e8e99b5205 100644 --- a/ci/requirements-optional-conda.txt +++ b/ci/requirements-optional-conda.txt @@ -3,6 +3,7 @@ blosc bottleneck fastparquet feather-format +gcsfs html5lib ipython>=5.6.0 ipykernel diff --git a/ci/requirements-optional-pip.txt b/ci/requirements-optional-pip.txt index 877c52fa0b4fd..3cce3f5339883 100644 --- a/ci/requirements-optional-pip.txt +++ b/ci/requirements-optional-pip.txt @@ -5,6 +5,7 @@ blosc bottleneck fastparquet feather-format +gcsfs html5lib ipython>=5.6.0 ipykernel diff --git a/ci/travis-27.yaml b/ci/travis-27.yaml index 22b993a2da886..482b888b88062 100644 --- a/ci/travis-27.yaml +++ b/ci/travis-27.yaml @@ -9,6 +9,7 @@ dependencies: - fastparquet - feather-format - flake8=3.4.1 + - gcsfs - html5lib - ipython - jemalloc=4.5.0.post diff --git a/ci/travis-36.yaml b/ci/travis-36.yaml index 006276ba1a65f..ff4f1a4a86f99 100644 --- a/ci/travis-36.yaml +++ b/ci/travis-36.yaml @@ -8,6 +8,7 @@ dependencies: - dask - fastparquet - feather-format + - gcsfs - geopandas - html5lib - ipython diff --git a/doc/source/install.rst b/doc/source/install.rst index fa6b9f4fc7f4d..a8c5194124829 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -276,6 +276,7 @@ Optional Dependencies * `Jinja2 `__: Template engine for conditional HTML formatting. * `s3fs `__: necessary for Amazon S3 access (s3fs >= 0.0.7). * `blosc `__: for msgpack compression using ``blosc`` +* `gcsfs `__: necessary for Google Cloud Storage access (gcsfs >= 0.1.0). * One of `qtpy `__ (requires PyQt or PySide), `PyQt5 `__, diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index a63276efc5b7c..0fe036a2ee70f 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -18,7 +18,7 @@ Other Enhancements - :func:`Series.mode` and :func:`DataFrame.mode` now support the ``dropna`` parameter which can be used to specify whether NaN/NaT values should be considered (:issue:`17534`) - :func:`to_csv` now supports ``compression`` keyword when a file handle is passed. (:issue:`21227`) - :meth:`Index.droplevel` is now implemented also for flat indexes, for compatibility with :class:`MultiIndex` (:issue:`21115`) - +- Added support for reading from Google Cloud Storage via the ``gcsfs`` library (:issue:`19454`) .. _whatsnew_0240.api_breaking: diff --git a/pandas/io/common.py b/pandas/io/common.py index ac9077f2db50e..6d579fc8a8a09 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -88,7 +88,7 @@ def _is_url(url): """ try: return parse_url(url).scheme in _VALID_URLS - except: + except Exception: return False @@ -165,7 +165,15 @@ def is_s3_url(url): """Check for an s3, s3n, or s3a url""" try: return parse_url(url).scheme in ['s3', 's3n', 's3a'] - except: # noqa + except Exception: + return False + + +def is_gcs_url(url): + """Check for a gcs url""" + try: + return parse_url(url).scheme in ['gcs', 'gs'] + except Exception: return False @@ -208,6 +216,13 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None, compression=compression, mode=mode) + if is_gcs_url(filepath_or_buffer): + from pandas.io import gcs + return gcs.get_filepath_or_buffer(filepath_or_buffer, + encoding=encoding, + compression=compression, + mode=mode) + if isinstance(filepath_or_buffer, (compat.string_types, compat.binary_type, mmap.mmap)): diff --git a/pandas/io/excel.py b/pandas/io/excel.py index e86d33742b266..793a95ffb0ee7 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -46,7 +46,7 @@ io : string, path object (pathlib.Path or py._path.local.LocalPath), file-like object, pandas ExcelFile, or xlrd workbook. The string could be a URL. Valid URL schemes include http, ftp, s3, - and file. For file URLs, a host is expected. For instance, a local + gcs, and file. For file URLs, a host is expected. For instance, a local file could be file://localhost/path/to/workbook.xlsx sheet_name : string, int, mixed list of strings/ints, or None, default 0 diff --git a/pandas/io/gcs.py b/pandas/io/gcs.py new file mode 100644 index 0000000000000..aa1cb648f05d1 --- /dev/null +++ b/pandas/io/gcs.py @@ -0,0 +1,16 @@ +""" GCS support for remote file interactivity """ +try: + import gcsfs +except ImportError: + raise ImportError("The gcsfs library is required to handle GCS files") + + +def get_filepath_or_buffer(filepath_or_buffer, encoding=None, + compression=None, mode=None): + + if mode is None: + mode = 'rb' + + fs = gcsfs.GCSFileSystem() + filepath_or_buffer = fs.open(filepath_or_buffer, mode) + return filepath_or_buffer, None, compression, True diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 1627b2f4d3ec3..9992be521d61f 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -231,9 +231,9 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, Parameters ---------- path_or_buf : a valid JSON string or file-like, default: None - The string could be a URL. Valid URL schemes include http, ftp, s3, and - file. For file URLs, a host is expected. For instance, a local file - could be ``file://localhost/path/to/table.json`` + The string could be a URL. Valid URL schemes include http, ftp, s3, + gcs, and file. For file URLs, a host is expected. For instance, a local + file could be ``file://localhost/path/to/table.json`` orient : string, Indication of expected JSON string format. diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py new file mode 100644 index 0000000000000..a963cf0ed0257 --- /dev/null +++ b/pandas/tests/io/test_gcs.py @@ -0,0 +1,57 @@ +import numpy as np +import pytest + +from pandas import DataFrame, date_range, read_csv +from pandas.compat import StringIO +from pandas.io.common import is_gcs_url +from pandas.util import _test_decorators as td +from pandas.util.testing import assert_frame_equal + + +@pytest.fixture +def mock_patch(): + try: + from unittest.mock import patch + except ImportError: + from mock import patch + + return patch + + +def test_is_gcs_url(): + assert is_gcs_url("gcs://pandas/somethingelse.com") + assert is_gcs_url("gs://pandas/somethingelse.com") + assert not is_gcs_url("s3://pandas/somethingelse.com") + + +@td.skip_if_no('gcsfs') +def test_read_csv_gcs(mock_patch): + df1 = DataFrame({'int': [1, 3], 'float': [2.0, np.nan], 'str': ['t', 's'], + 'dt': date_range('2018-06-18', periods=2)}) + with mock_patch('gcsfs.GCSFileSystem') as MockFileSystem: + instance = MockFileSystem.return_value + instance.open.return_value = StringIO(df1.to_csv(index=False)) + df2 = read_csv('gs://test/test.csv', parse_dates=['dt']) + + assert_frame_equal(df1, df2) + + +@td.skip_if_no('gcsfs') +def test_gcs_get_filepath_or_buffer(mock_patch): + df1 = DataFrame({'int': [1, 3], 'float': [2.0, np.nan], 'str': ['t', 's'], + 'dt': date_range('2018-06-18', periods=2)}) + with mock_patch('pandas.io.gcs.get_filepath_or_buffer') as MockGetFilepath: + MockGetFilepath.return_value = (StringIO(df1.to_csv(index=False)), + None, None, False) + df2 = read_csv('gs://test/test.csv', parse_dates=['dt']) + + assert_frame_equal(df1, df2) + assert MockGetFilepath.called + + +@pytest.mark.skipif(td.safe_import('gcsfs'), + reason='Only check when gcsfs not installed') +def test_gcs_not_present_exception(): + with pytest.raises(ImportError) as e: + read_csv('gs://test/test.csv') + assert 'gcsfs library is required' in str(e.value) diff --git a/pandas/util/_print_versions.py b/pandas/util/_print_versions.py index 83c1433bf5c39..01198fc541e0c 100644 --- a/pandas/util/_print_versions.py +++ b/pandas/util/_print_versions.py @@ -96,6 +96,7 @@ def show_versions(as_json=False): ("fastparquet", lambda mod: mod.__version__), ("pandas_gbq", lambda mod: mod.__version__), ("pandas_datareader", lambda mod: mod.__version__), + ("gcsfs", lambda mod: mod.__version__), ] deps_blob = list() From e5aee91ec3d292875382941ab3a6c96f2842c1e1 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 25 Jun 2018 12:29:32 -0500 Subject: [PATCH 2/3] Fixture for mock --- pandas/conftest.py | 16 ++++++++++++++++ pandas/tests/dtypes/test_inference.py | 7 ++----- pandas/tests/io/parser/common.py | 9 +++------ pandas/tests/io/test_gcs.py | 18 ++++-------------- 4 files changed, 25 insertions(+), 25 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index d6b18db4e71f2..b4a599758417c 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1,3 +1,5 @@ +import importlib + import pytest import numpy as np @@ -249,3 +251,17 @@ def any_int_dtype(request): """ return request.param + + +@pytest.fixture +def mock(): + """ + Fixture providing the 'mock' module. + + Uses 'unittest.mock' for Python 3. Attempts to import the 3rd party 'mock' + package for Python 2, skipping if not present. + """ + if PY3: + return importlib.import_module("unittest.mock") + else: + return pytest.importorskip("mock") diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index b4f5d67530fbd..84ee100b336b0 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -128,7 +128,7 @@ def test_is_dict_like_fails(ll): assert not inference.is_dict_like(ll) -def test_is_file_like(): +def test_is_file_like(mock): class MockFile(object): pass @@ -166,10 +166,7 @@ class MockFile(object): # Iterator but no read / write attributes data = [1, 2, 3] assert not is_file(data) - - if PY3: - from unittest import mock - assert not is_file(mock.Mock()) + assert not is_file(mock.Mock()) @pytest.mark.parametrize( diff --git a/pandas/tests/io/parser/common.py b/pandas/tests/io/parser/common.py index b39122e5e7906..6e1d3575a1481 100644 --- a/pandas/tests/io/parser/common.py +++ b/pandas/tests/io/parser/common.py @@ -1546,7 +1546,7 @@ def test_file_handles(self): assert not m.closed m.close() - def test_invalid_file_buffer(self): + def test_invalid_file_buffer(self, mock): # see gh-15337 class InvalidBuffer(object): @@ -1577,11 +1577,8 @@ def seek(self, pos, whence=0): tm.assert_frame_equal(result, expected) - if PY3: - from unittest import mock - - with tm.assert_raises_regex(ValueError, msg): - self.read_csv(mock.Mock()) + with tm.assert_raises_regex(ValueError, msg): + self.read_csv(mock.Mock()) @tm.capture_stderr def test_skip_bad_lines(self): diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index a963cf0ed0257..251c93df0733d 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -8,16 +8,6 @@ from pandas.util.testing import assert_frame_equal -@pytest.fixture -def mock_patch(): - try: - from unittest.mock import patch - except ImportError: - from mock import patch - - return patch - - def test_is_gcs_url(): assert is_gcs_url("gcs://pandas/somethingelse.com") assert is_gcs_url("gs://pandas/somethingelse.com") @@ -25,10 +15,10 @@ def test_is_gcs_url(): @td.skip_if_no('gcsfs') -def test_read_csv_gcs(mock_patch): +def test_read_csv_gcs(mock): df1 = DataFrame({'int': [1, 3], 'float': [2.0, np.nan], 'str': ['t', 's'], 'dt': date_range('2018-06-18', periods=2)}) - with mock_patch('gcsfs.GCSFileSystem') as MockFileSystem: + with mock.patch('gcsfs.GCSFileSystem') as MockFileSystem: instance = MockFileSystem.return_value instance.open.return_value = StringIO(df1.to_csv(index=False)) df2 = read_csv('gs://test/test.csv', parse_dates=['dt']) @@ -37,10 +27,10 @@ def test_read_csv_gcs(mock_patch): @td.skip_if_no('gcsfs') -def test_gcs_get_filepath_or_buffer(mock_patch): +def test_gcs_get_filepath_or_buffer(mock): df1 = DataFrame({'int': [1, 3], 'float': [2.0, np.nan], 'str': ['t', 's'], 'dt': date_range('2018-06-18', periods=2)}) - with mock_patch('pandas.io.gcs.get_filepath_or_buffer') as MockGetFilepath: + with mock.patch('pandas.io.gcs.get_filepath_or_buffer') as MockGetFilepath: MockGetFilepath.return_value = (StringIO(df1.to_csv(index=False)), None, None, False) df2 = read_csv('gs://test/test.csv', parse_dates=['dt']) From 2745359f0e75ea3b6d119705e072e176afba5897 Mon Sep 17 00:00:00 2001 From: Brett Naul Date: Mon, 25 Jun 2018 12:01:33 -0700 Subject: [PATCH 3/3] Fix PEP8 unused import --- pandas/tests/dtypes/test_inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 84ee100b336b0..65527ac1b278f 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -20,7 +20,7 @@ DatetimeIndex, TimedeltaIndex, Timestamp, Panel, Period, Categorical, isna, Interval, DateOffset) -from pandas.compat import u, PY2, PY3, StringIO, lrange +from pandas.compat import u, PY2, StringIO, lrange from pandas.core.dtypes import inference from pandas.core.dtypes.common import ( is_timedelta64_dtype,