From 8844de3000493d788159612597cd7bbf0c31e2c6 Mon Sep 17 00:00:00 2001 From: Mateusz Piotrowski Date: Thu, 25 Nov 2021 18:27:40 +0100 Subject: [PATCH 1/5] Check if fsspec url matches RFC 3986 schema --- pandas/io/common.py | 4 +++- pandas/tests/io/test_common.py | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 2102e67f06d36..34cc70db5bf2a 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -18,6 +18,7 @@ import mmap import os from pathlib import Path +import re import tempfile from typing import ( IO, @@ -245,9 +246,10 @@ def is_fsspec_url(url: FilePath | BaseBuffer) -> bool: Returns true if the given URL looks like something fsspec can handle """ + rfc_3986_pattern = re.compile(r"^[A-Za-z][A-Za-z0-9+\-+.]*://") return ( isinstance(url, str) - and "://" in url + and rfc_3986_pattern.match(url) and not url.startswith(("http://", "https://")) ) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index a782f8dbbc76d..7be5a8d829de0 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -506,6 +506,7 @@ def test_is_fsspec_url(): assert not icom.is_fsspec_url("random:pandas/somethingelse.com") assert not icom.is_fsspec_url("/local/path") assert not icom.is_fsspec_url("relative/local/path") + assert not icom.is_fsspec_url("{'url': 'gs://pandas/somethingelse.com'}") @pytest.mark.parametrize("encoding", [None, "utf-8"]) From 5481c7c5d943393cea5b7daab4b2218c3771925b Mon Sep 17 00:00:00 2001 From: Mateusz Piotrowski Date: Fri, 26 Nov 2021 12:01:43 +0100 Subject: [PATCH 2/5] Add tests and whatsnew entry --- doc/source/whatsnew/v1.4.0.rst | 1 + pandas/io/common.py | 4 ++-- pandas/tests/io/json/test_pandas.py | 6 ++++++ pandas/tests/io/test_common.py | 4 ++++ 4 files changed, 13 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index ad6bf64dcdfa8..2e9c21763a03a 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -659,6 +659,7 @@ I/O - Bug in :func:`read_csv` raising ``ValueError`` when ``parse_dates`` was used with ``MultiIndex`` columns (:issue:`8991`) - Bug in :func:`read_csv` raising ``AttributeError`` when attempting to read a .csv file and infer index column dtype from an nullable integer type (:issue:`44079`) - :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` with ``compression`` set to ``'zip'`` no longer create a zip file containing a file ending with ".zip". Instead, they try to infer the inner file name more smartly. (:issue:`39465`) +- Bug in :func:`read_json` raising ``ValueError`` when attempting to parse json strings containing urls (:issue:`36271`) Period ^^^^^^ diff --git a/pandas/io/common.py b/pandas/io/common.py index 34cc70db5bf2a..723963d5b49e6 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -63,6 +63,7 @@ _VALID_URLS = set(uses_relative + uses_netloc + uses_params) _VALID_URLS.discard("") +_RFC_3986_PATTERN = re.compile(r"^[A-Za-z][A-Za-z0-9+\-+.]*://") BaseBufferT = TypeVar("BaseBufferT", bound=BaseBuffer) @@ -246,10 +247,9 @@ def is_fsspec_url(url: FilePath | BaseBuffer) -> bool: Returns true if the given URL looks like something fsspec can handle """ - rfc_3986_pattern = re.compile(r"^[A-Za-z][A-Za-z0-9+\-+.]*://") return ( isinstance(url, str) - and rfc_3986_pattern.match(url) + and _RFC_3986_PATTERN.match(url) and not url.startswith(("http://", "https://")) ) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 747770ad78684..be3af0605e2ef 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1529,6 +1529,12 @@ def test_read_timezone_information(self): expected = Series([88], index=DatetimeIndex(["2019-01-01 11:00:00"], tz="UTC")) tm.assert_series_equal(result, expected) + def test_read_json_with_fsspec_value(self): + # GH 36271 + result = read_json('{"url":{"0":"s3://example-fsspec"}}') + expected = DataFrame({"url": ["s3://example-fsspec"]}) + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( "date_format,key", [("epoch", 86400000), ("iso", "P1DT0H0M0S")] ) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 7be5a8d829de0..2cedfc322bf6e 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -506,7 +506,11 @@ def test_is_fsspec_url(): assert not icom.is_fsspec_url("random:pandas/somethingelse.com") assert not icom.is_fsspec_url("/local/path") assert not icom.is_fsspec_url("relative/local/path") + # fsspec URL in string should not be recognized + assert not icom.is_fsspec_url("this is not fsspec://url") assert not icom.is_fsspec_url("{'url': 'gs://pandas/somethingelse.com'}") + # accept everything that conforms to RFC 3986 schema + assert icom.is_fsspec_url("RFC-3986+compliant.spec://something") @pytest.mark.parametrize("encoding", [None, "utf-8"]) From 09c55c6d07691ea5ff9235f627e1559e4da0a0de Mon Sep 17 00:00:00 2001 From: Mateusz Piotrowski Date: Fri, 26 Nov 2021 14:17:02 +0100 Subject: [PATCH 3/5] Fix typing issue --- pandas/io/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 723963d5b49e6..9bde7e3dcb160 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -249,7 +249,7 @@ def is_fsspec_url(url: FilePath | BaseBuffer) -> bool: """ return ( isinstance(url, str) - and _RFC_3986_PATTERN.match(url) + and bool(_RFC_3986_PATTERN.match(url)) and not url.startswith(("http://", "https://")) ) From c46de818400c73c80212fd0136796650c33de7fd Mon Sep 17 00:00:00 2001 From: Mateusz Piotrowski Date: Mon, 27 Dec 2021 16:30:21 +0100 Subject: [PATCH 4/5] Parametrize test case --- pandas/tests/io/json/test_pandas.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 5099f4dcf966b..0b8548f98b03b 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1527,10 +1527,19 @@ def test_read_timezone_information(self): expected = Series([88], index=DatetimeIndex(["2019-01-01 11:00:00"], tz="UTC")) tm.assert_series_equal(result, expected) - def test_read_json_with_fsspec_value(self): + @pytest.mark.parametrize( + "url", + [ + "s3://example-fsspec/", + "gcs://another-fsspec/file.json", + "https://example-site.com/data", + "some-protocol://data.txt", + ], + ) + def test_read_json_with_url_value(self, url): # GH 36271 - result = read_json('{"url":{"0":"s3://example-fsspec"}}') - expected = DataFrame({"url": ["s3://example-fsspec"]}) + result = read_json(f'{{"url":{{"0":"{url}"}}}}') + expected = DataFrame({"url": [url]}) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( From 8e7adfe480796e50dd6b3fa85f419cd5dd2a6f44 Mon Sep 17 00:00:00 2001 From: Mateusz Piotrowski Date: Mon, 27 Dec 2021 16:33:06 +0100 Subject: [PATCH 5/5] Update whatsnew entry --- doc/source/whatsnew/v1.4.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 9d85e6f797a76..6f384abba3875 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -780,7 +780,7 @@ I/O - Bug in :func:`read_csv` not setting name of :class:`MultiIndex` columns correctly when ``index_col`` is not the first column (:issue:`38549`) - Bug in :func:`read_csv` silently ignoring errors when failing to create a memory-mapped file (:issue:`44766`) - Bug in :func:`read_csv` when passing a ``tempfile.SpooledTemporaryFile`` opened in binary mode (:issue:`44748`) -- Bug in :func:`read_json` raising ``ValueError`` when attempting to parse json strings containing urls (:issue:`36271`) +- Bug in :func:`read_json` raising ``ValueError`` when attempting to parse json strings containing "://" (:issue:`36271`) - Period