From 6aa72ca2b5200a948bb09be0314f823b3975cde3 Mon Sep 17 00:00:00 2001 From: Arda Kosar Date: Tue, 13 Dec 2022 02:13:57 -0500 Subject: [PATCH 01/16] =?UTF-8?q?ENH:=20Add=20engine=20keyword=20to=20read?= =?UTF-8?q?=5Fjson=20to=20enable=20reading=20from=20pyarrow=C2=A0#48893?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pandas/_typing.py | 3 + pandas/io/json/_json.py | 83 ++++++++++-- pandas/io/json/arrow_json_parser_wrapper.py | 35 +++++ pandas/tests/io/json/conftest.py | 24 ++++ pandas/tests/io/json/test_readlines.py | 136 +++++++++++++++----- 5 files changed, 234 insertions(+), 47 deletions(-) create mode 100644 pandas/io/json/arrow_json_parser_wrapper.py diff --git a/pandas/_typing.py b/pandas/_typing.py index 8d3044a978291..87979aba9ada4 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -324,6 +324,9 @@ def closed(self) -> bool: # read_csv engines CSVEngine = Literal["c", "python", "pyarrow", "python-fwf"] +# read_json engines +JSONEngine = Literal["ujson", "pyarrow"] + # read_xml parsers XMLParsers = Literal["lxml", "etree"] diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index aa1342d0f135f..fe87cb5aea2c2 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -31,6 +31,7 @@ DtypeArg, FilePath, IndexLabel, + JSONEngine, JSONSerializable, ReadBuffer, StorageOptions, @@ -72,6 +73,7 @@ build_table_schema, parse_table_schema, ) +from pandas.io.json.arrow_json_parser_wrapper import ArrowJsonParserWrapper from pandas.io.parsers.readers import validate_integer if TYPE_CHECKING: @@ -392,6 +394,7 @@ def read_json( date_unit: str | None = ..., encoding: str | None = ..., encoding_errors: str | None = ..., + engine: JSONEngine = ..., lines: bool = ..., chunksize: int, compression: CompressionOptions = ..., @@ -421,6 +424,7 @@ def read_json( compression: CompressionOptions = ..., nrows: int | None = ..., storage_options: StorageOptions = ..., + engine: JSONEngine = ..., use_nullable_dtypes: bool = ..., ) -> JsonReader[Literal["series"]]: ... @@ -446,6 +450,7 @@ def read_json( nrows: int | None = ..., storage_options: StorageOptions = ..., use_nullable_dtypes: bool = ..., + engine: JSONEngine = ..., ) -> Series: ... @@ -470,6 +475,7 @@ def read_json( nrows: int | None = ..., storage_options: StorageOptions = ..., use_nullable_dtypes: bool = ..., + engine: JSONEngine = ..., ) -> DataFrame: ... @@ -497,6 +503,7 @@ def read_json( nrows: int | None = None, storage_options: StorageOptions = None, use_nullable_dtypes: bool = False, + engine: JSONEngine = "ujson", ) -> DataFrame | Series | JsonReader: """ Convert a JSON string to pandas object. @@ -605,6 +612,9 @@ def read_json( .. versionadded:: 1.3.0 + engine : {{'ujson', 'pyarrow'}}, default "ujson" + Parser engine to use. + lines : bool, default False Read the file as a json object per line. 
@@ -760,6 +770,7 @@ def read_json( storage_options=storage_options, encoding_errors=encoding_errors, use_nullable_dtypes=use_nullable_dtypes, + engine=engine, ) if chunksize: @@ -796,6 +807,7 @@ def __init__( storage_options: StorageOptions = None, encoding_errors: str | None = "strict", use_nullable_dtypes: bool = False, + engine: JSONEngine = "ujson", ) -> None: self.orient = orient @@ -807,6 +819,7 @@ def __init__( self.precise_float = precise_float self.date_unit = date_unit self.encoding = encoding + self.engine = engine self.compression = compression self.storage_options = storage_options self.lines = lines @@ -825,9 +838,48 @@ def __init__( self.nrows = validate_integer("nrows", self.nrows, 0) if not self.lines: raise ValueError("nrows can only be passed if lines=True") + if self.engine == "pyarrow": + if not self.lines: + raise ValueError( + "currently pyarrow engine only supports " + "the line-delimited JSON format" + ) + if self.engine not in ["pyarrow", "ujson"]: + raise ValueError( + f"The engine type {self.engine} is currently not supported." + ) + + if self.engine == "pyarrow": + self._engine = self._make_engine(filepath_or_buffer) + if self.engine == "ujson": + data = self._get_data_from_filepath(filepath_or_buffer) + self.data = self._preprocess_data(data) + + def _make_engine( + self, + filepath_or_buffer: FilePath | ReadBuffer[str] | ReadBuffer[bytes], + ) -> ArrowJsonParserWrapper: + + if not isinstance(filepath_or_buffer, list): + is_text = False + mode = "rb" + self.handles = get_handle( + self._get_data_from_filepath(filepath_or_buffer), + mode=mode, + encoding=self.encoding, + is_text=is_text, + compression=self.compression, + storage_options=self.storage_options, + errors=self.encoding_errors, + ) + filepath_or_buffer = self.handles.handle - data = self._get_data_from_filepath(filepath_or_buffer) - self.data = self._preprocess_data(data) + try: + return ArrowJsonParserWrapper(filepath_or_buffer) + except Exception: + if self.handles is not None: + self.handles.close() + raise def _preprocess_data(self, data): """ @@ -912,19 +964,22 @@ def read(self) -> DataFrame | Series: """ obj: DataFrame | Series with self: - if self.lines: - if self.chunksize: - obj = concat(self) - elif self.nrows: - lines = list(islice(self.data, self.nrows)) - lines_json = self._combine_lines(lines) - obj = self._get_object_parser(lines_json) + if self.engine == "pyarrow": + obj = self._engine.read() + if self.engine == "ujson": + if self.lines: + if self.chunksize: + obj = concat(self) + elif self.nrows: + lines = list(islice(self.data, self.nrows)) + lines_json = self._combine_lines(lines) + obj = self._get_object_parser(lines_json) + else: + data = ensure_str(self.data) + data_lines = data.split("\n") + obj = self._get_object_parser(self._combine_lines(data_lines)) else: - data = ensure_str(self.data) - data_lines = data.split("\n") - obj = self._get_object_parser(self._combine_lines(data_lines)) - else: - obj = self._get_object_parser(self.data) + obj = self._get_object_parser(self.data) if self.use_nullable_dtypes: return obj.convert_dtypes(infer_objects=False) else: diff --git a/pandas/io/json/arrow_json_parser_wrapper.py b/pandas/io/json/arrow_json_parser_wrapper.py new file mode 100644 index 0000000000000..2980ed2f2772e --- /dev/null +++ b/pandas/io/json/arrow_json_parser_wrapper.py @@ -0,0 +1,35 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from pandas._typing import ReadBuffer +from pandas.compat._optional import import_optional_dependency + +if 
TYPE_CHECKING: + from pandas import DataFrame + + +class ArrowJsonParserWrapper: + """ + Wrapper for the pyarrow engine for read_json() + """ + + def __init__(self, src: ReadBuffer[bytes]) -> None: + self.src = src + + def read(self) -> DataFrame: + """ + Reads the contents of a JSON file into a DataFrame and + processes it according to the kwargs passed in the + constructor. + + Returns + ------- + DataFrame + The DataFrame created from the JSON file. + """ + pyarrow_json = import_optional_dependency("pyarrow.json") + table = pyarrow_json.read_json(self.src) + + frame = table.to_pandas() + return frame diff --git a/pandas/tests/io/json/conftest.py b/pandas/tests/io/json/conftest.py index 4e848cd48b42d..b600064d08a46 100644 --- a/pandas/tests/io/json/conftest.py +++ b/pandas/tests/io/json/conftest.py @@ -7,3 +7,27 @@ def orient(request): Fixture for orients excluding the table format. """ return request.param + + +@pytest.fixture +def json_dir_path(datapath): + """ + The directory path to the data files needed for parser tests. + """ + return datapath("io", "json", "data") + + +@pytest.fixture(params=["ujson", "pyarrow"]) +def engine(request): + return request.param + + +@pytest.fixture +def json_engine_pyarrow_xfail(request): + """ + Fixture that xfails a test if the engine is pyarrow. + """ + engine = request.getfixturevalue("engine") + if engine == "pyarrow": + mark = pytest.mark.xfail(reason="pyarrow doesn't support this.") + request.node.add_marker(mark) diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py index a76627fb08147..9dab7c54d9a72 100644 --- a/pandas/tests/io/json/test_readlines.py +++ b/pandas/tests/io/json/test_readlines.py @@ -1,4 +1,5 @@ from io import StringIO +import os from pathlib import Path from typing import Iterator @@ -13,6 +14,8 @@ from pandas.io.json._json import JsonReader +xfail_json_engine_pyarrow = pytest.mark.usefixtures("json_engine_pyarrow_xfail") + @pytest.fixture def lines_json_df(): @@ -20,21 +23,37 @@ def lines_json_df(): return df.to_json(lines=True, orient="records") -def test_read_jsonl(): +# pyarrow takes file path as an input +# and only supports line delimited json +@xfail_json_engine_pyarrow +def test_read_jsonl(engine): # GH9180 - result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True) + result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True, engine=engine) expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"]) tm.assert_frame_equal(result, expected) -def test_read_datetime(): +def test_read_jsonl_engine_pyarrow(json_dir_path, engine): + result = read_json( + os.path.join(json_dir_path, "line_delimited.json"), + lines=True, + engine=engine, + ) + expected = DataFrame({"a": [1, 3, 5], "b": [2, 4, 6]}) + tm.assert_frame_equal(result, expected) + + +# pyarrow takes file path as an input +# and only supports line delimited json +@xfail_json_engine_pyarrow +def test_read_datetime(engine): # GH33787 df = DataFrame( [([1, 2], ["2020-03-05", "2020-04-08T09:58:49+00:00"], "hector")], columns=["accounts", "date", "name"], ) json_line = df.to_json(lines=True, orient="records") - result = read_json(json_line) + result = read_json(json_line, engine=engine) expected = DataFrame( [[1, "2020-03-05", "hector"], [2, "2020-04-08T09:58:49+00:00", "hector"]], columns=["accounts", "date", "name"], @@ -42,20 +61,23 @@ def test_read_datetime(): tm.assert_frame_equal(result, expected) -def test_read_jsonl_unicode_chars(): +# pyarrow takes file path as an input +# and only supports line delimited 
json +@xfail_json_engine_pyarrow +def test_read_jsonl_unicode_chars(engine): # GH15132: non-ascii unicode characters # \u201d == RIGHT DOUBLE QUOTATION MARK # simulate file handle json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n' json = StringIO(json) - result = read_json(json, lines=True) + result = read_json(json, lines=True, engine=engine) expected = DataFrame([["foo\u201d", "bar"], ["foo", "bar"]], columns=["a", "b"]) tm.assert_frame_equal(result, expected) # simulate string json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n' - result = read_json(json, lines=True) + result = read_json(json, lines=True, engine=engine) expected = DataFrame([["foo\u201d", "bar"], ["foo", "bar"]], columns=["a", "b"]) tm.assert_frame_equal(result, expected) @@ -89,56 +111,80 @@ def test_to_jsonl_count_new_lines(): assert actual_new_lines_count == expected_new_lines_count +# pyarrow takes file path as an input +# and only supports line delimited json +# and doesn't have a chunksize argument @pytest.mark.parametrize("chunksize", [1, 1.0]) -def test_readjson_chunks(lines_json_df, chunksize): +@xfail_json_engine_pyarrow +def test_readjson_chunks(lines_json_df, chunksize, engine): # Basic test that read_json(chunks=True) gives the same result as # read_json(chunks=False) # GH17048: memory usage when lines=True unchunked = read_json(StringIO(lines_json_df), lines=True) - with read_json(StringIO(lines_json_df), lines=True, chunksize=chunksize) as reader: + with read_json( + StringIO(lines_json_df), lines=True, chunksize=chunksize, engine=engine + ) as reader: chunked = pd.concat(reader) tm.assert_frame_equal(chunked, unchunked) -def test_readjson_chunksize_requires_lines(lines_json_df): +def test_readjson_chunksize_requires_lines(lines_json_df, engine): msg = "chunksize can only be passed if lines=True" with pytest.raises(ValueError, match=msg): - with read_json(StringIO(lines_json_df), lines=False, chunksize=2) as _: + with read_json( + StringIO(lines_json_df), lines=False, chunksize=2, engine=engine + ) as _: pass -def test_readjson_chunks_series(): +# pyarrow takes file path as an input +# and only supports line delimited json +# and doesn't have a chunksize argument +@xfail_json_engine_pyarrow +def test_readjson_chunks_series(engine): # Test reading line-format JSON to Series with chunksize param s = pd.Series({"A": 1, "B": 2}) strio = StringIO(s.to_json(lines=True, orient="records")) - unchunked = read_json(strio, lines=True, typ="Series") + unchunked = read_json(strio, lines=True, typ="Series", engine=engine) strio = StringIO(s.to_json(lines=True, orient="records")) - with read_json(strio, lines=True, typ="Series", chunksize=1) as reader: + with read_json( + strio, lines=True, typ="Series", chunksize=1, engine=engine + ) as reader: chunked = pd.concat(reader) tm.assert_series_equal(chunked, unchunked) -def test_readjson_each_chunk(lines_json_df): +# pyarrow takes file path as an input +# and only supports line delimited json +# and doesn't have a chunksize argument +@xfail_json_engine_pyarrow +def test_readjson_each_chunk(lines_json_df, engine): # Other tests check that the final result of read_json(chunksize=True) # is correct. This checks the intermediate chunks. 
- with read_json(StringIO(lines_json_df), lines=True, chunksize=2) as reader: + with read_json( + StringIO(lines_json_df), lines=True, chunksize=2, engine=engine + ) as reader: chunks = list(reader) assert chunks[0].shape == (2, 2) assert chunks[1].shape == (1, 2) -def test_readjson_chunks_from_file(): +# pyarrow takes file path as an input +# and only supports line delimited json +# and doesn't have a chunksize argument +@xfail_json_engine_pyarrow +def test_readjson_chunks_from_file(engine): with tm.ensure_clean("test.json") as path: df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) df.to_json(path, lines=True, orient="records") - with read_json(path, lines=True, chunksize=1) as reader: + with read_json(path, lines=True, chunksize=1, engine=engine) as reader: chunked = pd.concat(reader) - unchunked = read_json(path, lines=True) + unchunked = read_json(path, lines=True, engine=engine) tm.assert_frame_equal(unchunked, chunked) @@ -171,16 +217,22 @@ def test_readjson_chunks_closes(chunksize): @pytest.mark.parametrize("chunksize", [0, -1, 2.2, "foo"]) -def test_readjson_invalid_chunksize(lines_json_df, chunksize): +def test_readjson_invalid_chunksize(lines_json_df, chunksize, engine): msg = r"'chunksize' must be an integer >=1" with pytest.raises(ValueError, match=msg): - with read_json(StringIO(lines_json_df), lines=True, chunksize=chunksize) as _: + with read_json( + StringIO(lines_json_df), lines=True, chunksize=chunksize, engine=engine + ) as _: pass @pytest.mark.parametrize("chunksize", [None, 1, 2]) -def test_readjson_chunks_multiple_empty_lines(chunksize): +# pyarrow takes file path as an input +# and only supports line delimited json +# and doesn't have a chunksize argument +@xfail_json_engine_pyarrow +def test_readjson_chunks_multiple_empty_lines(chunksize, engine): j = """ {"A":1,"B":4} @@ -198,52 +250,66 @@ def test_readjson_chunks_multiple_empty_lines(chunksize): {"A":3,"B":6} """ orig = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) - test = read_json(j, lines=True, chunksize=chunksize) + test = read_json(j, lines=True, chunksize=chunksize, engine=engine) if chunksize is not None: with test: test = pd.concat(test) tm.assert_frame_equal(orig, test, obj=f"chunksize: {chunksize}") -def test_readjson_unicode(monkeypatch): +# pyarrow takes file path as an input +# and only supports line delimited json +# and doesn't have a chunksize argument +@xfail_json_engine_pyarrow +def test_readjson_unicode(monkeypatch, engine): with tm.ensure_clean("test.json") as path: monkeypatch.setattr("locale.getpreferredencoding", lambda do_setlocale: "cp949") with open(path, "w", encoding="utf-8") as f: f.write('{"£©µÀÆÖÞßéöÿ":["АБВГДабвгд가"]}') - result = read_json(path) + result = read_json(path, engine=engine) expected = DataFrame({"£©µÀÆÖÞßéöÿ": ["АБВГДабвгд가"]}) tm.assert_frame_equal(result, expected) +# pyarrow takes file path as an input +# and only supports line delimited json +# and doesn't have a chunksize argument +@xfail_json_engine_pyarrow @pytest.mark.parametrize("nrows", [1, 2]) -def test_readjson_nrows(nrows): +def test_readjson_nrows(nrows, engine): # GH 33916 # Test reading line-format JSON to Series with nrows param jsonl = """{"a": 1, "b": 2} {"a": 3, "b": 4} {"a": 5, "b": 6} {"a": 7, "b": 8}""" - result = read_json(jsonl, lines=True, nrows=nrows) + result = read_json(jsonl, lines=True, nrows=nrows, engine=engine) expected = DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows] tm.assert_frame_equal(result, expected) +# pyarrow takes file path as an input +# and only supports line 
delimited json +# and doesn't have a chunksize argument +@xfail_json_engine_pyarrow @pytest.mark.parametrize("nrows,chunksize", [(2, 2), (4, 2)]) -def test_readjson_nrows_chunks(nrows, chunksize): +def test_readjson_nrows_chunks(nrows, chunksize, engine): # GH 33916 # Test reading line-format JSON to Series with nrows and chunksize param jsonl = """{"a": 1, "b": 2} {"a": 3, "b": 4} {"a": 5, "b": 6} {"a": 7, "b": 8}""" - with read_json(jsonl, lines=True, nrows=nrows, chunksize=chunksize) as reader: + with read_json( + jsonl, lines=True, nrows=nrows, chunksize=chunksize, engine=engine + ) as reader: chunked = pd.concat(reader) expected = DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows] tm.assert_frame_equal(chunked, expected) -def test_readjson_nrows_requires_lines(): +def test_readjson_nrows_requires_lines(engine): # GH 33916 # Test ValuError raised if nrows is set without setting lines in read_json jsonl = """{"a": 1, "b": 2} @@ -252,10 +318,14 @@ def test_readjson_nrows_requires_lines(): {"a": 7, "b": 8}""" msg = "nrows can only be passed if lines=True" with pytest.raises(ValueError, match=msg): - read_json(jsonl, lines=False, nrows=2) + read_json(jsonl, lines=False, nrows=2, engine=engine) -def test_readjson_lines_chunks_fileurl(datapath): +# pyarrow takes file path as an input +# and only supports line delimited json +# and doesn't have a chunksize argument +@xfail_json_engine_pyarrow +def test_readjson_lines_chunks_fileurl(datapath, engine): # GH 27135 # Test reading line-format JSON from file url df_list_expected = [ @@ -265,7 +335,7 @@ def test_readjson_lines_chunks_fileurl(datapath): ] os_path = datapath("io", "json", "data", "line_delimited.json") file_url = Path(os_path).as_uri() - with read_json(file_url, lines=True, chunksize=1) as url_reader: + with read_json(file_url, lines=True, chunksize=1, engine=engine) as url_reader: for index, chuck in enumerate(url_reader): tm.assert_frame_equal(chuck, df_list_expected[index]) From c248b84d078bff2335f1ea9502dfea98e60c92f6 Mon Sep 17 00:00:00 2001 From: Arda Kosar Date: Thu, 19 Jan 2023 22:51:31 -0500 Subject: [PATCH 02/16] moved argument to the end of signature, fixed elifs --- pandas/io/json/_json.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index fe87cb5aea2c2..3008ee19d4c84 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -394,13 +394,13 @@ def read_json( date_unit: str | None = ..., encoding: str | None = ..., encoding_errors: str | None = ..., - engine: JSONEngine = ..., lines: bool = ..., chunksize: int, compression: CompressionOptions = ..., nrows: int | None = ..., storage_options: StorageOptions = ..., use_nullable_dtypes: bool = ..., + engine: JSONEngine = ..., ) -> JsonReader[Literal["frame"]]: ... @@ -424,8 +424,8 @@ def read_json( compression: CompressionOptions = ..., nrows: int | None = ..., storage_options: StorageOptions = ..., - engine: JSONEngine = ..., use_nullable_dtypes: bool = ..., + engine: JSONEngine = ..., ) -> JsonReader[Literal["series"]]: ... 
@@ -851,7 +851,7 @@ def __init__( if self.engine == "pyarrow": self._engine = self._make_engine(filepath_or_buffer) - if self.engine == "ujson": + elif self.engine == "ujson": data = self._get_data_from_filepath(filepath_or_buffer) self.data = self._preprocess_data(data) From 91b81cfe5fe6ae80c72ab3876fd29867ec23a6e1 Mon Sep 17 00:00:00 2001 From: Arda Kosar Date: Thu, 19 Jan 2023 22:53:21 -0500 Subject: [PATCH 03/16] Adding finally --- pandas/io/json/_json.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 3008ee19d4c84..d8201a60f8da0 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -879,7 +879,8 @@ def _make_engine( except Exception: if self.handles is not None: self.handles.close() - raise + finally: + raise Exception def _preprocess_data(self, data): """ From 902066364c1c9b195d2a0688b0cf9e414788adb1 Mon Sep 17 00:00:00 2001 From: Arda Kosar Date: Tue, 24 Jan 2023 23:28:54 -0500 Subject: [PATCH 04/16] Updated the _make_engine try-finally block --- pandas/io/json/_json.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index d8201a60f8da0..e70a6f1d18e98 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -876,11 +876,9 @@ def _make_engine( try: return ArrowJsonParserWrapper(filepath_or_buffer) - except Exception: + finally: if self.handles is not None: self.handles.close() - finally: - raise Exception def _preprocess_data(self, data): """ From 0388c4e7c55fa8f0624938cfcfd3fd2efeee2e17 Mon Sep 17 00:00:00 2001 From: Arda Kosar Date: Tue, 13 Dec 2022 02:13:57 -0500 Subject: [PATCH 05/16] Fixing merge conflicts --- pandas/io/json/_json.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index e70a6f1d18e98..ad3d8b7b07300 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -21,6 +21,9 @@ import numpy as np +from pandas._config import using_nullable_dtypes + +from pandas._libs import lib from pandas._libs.json import ( dumps, loads, @@ -502,7 +505,7 @@ def read_json( compression: CompressionOptions = "infer", nrows: int | None = None, storage_options: StorageOptions = None, - use_nullable_dtypes: bool = False, + use_nullable_dtypes: bool | lib.NoDefault = lib.no_default, engine: JSONEngine = "ujson", ) -> DataFrame | Series | JsonReader: """ @@ -742,6 +745,12 @@ def read_json( if orient == "table" and convert_axes: raise ValueError("cannot pass both convert_axes and orient='table'") + use_nullable_dtypes = ( + use_nullable_dtypes + if use_nullable_dtypes is not lib.no_default + else using_nullable_dtypes() + ) + if dtype is None and orient != "table": # error: Incompatible types in assignment (expression has type "bool", variable # has type "Union[ExtensionDtype, str, dtype[Any], Type[str], Type[float], From c660395fe5de1efbc5cfd57835ad4b1e1cf94166 Mon Sep 17 00:00:00 2001 From: Arda Kosar Date: Tue, 13 Dec 2022 02:13:57 -0500 Subject: [PATCH 06/16] Refactored pyarrow engine code --- pandas/io/json/_json.py | 31 +++---------------------------- 1 file changed, 3 insertions(+), 28 deletions(-) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 3653597efc742..aab2b7a8d8128 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -858,37 +858,12 @@ def __init__( f"The engine type {self.engine} is currently not supported." 
) + data = self._get_data_from_filepath(filepath_or_buffer) if self.engine == "pyarrow": - self._engine = self._make_engine(filepath_or_buffer) + self.data = ArrowJsonParserWrapper(filepath_or_buffer) elif self.engine == "ujson": - data = self._get_data_from_filepath(filepath_or_buffer) self.data = self._preprocess_data(data) - def _make_engine( - self, - filepath_or_buffer: FilePath | ReadBuffer[str] | ReadBuffer[bytes], - ) -> ArrowJsonParserWrapper: - - if not isinstance(filepath_or_buffer, list): - is_text = False - mode = "rb" - self.handles = get_handle( - self._get_data_from_filepath(filepath_or_buffer), - mode=mode, - encoding=self.encoding, - is_text=is_text, - compression=self.compression, - storage_options=self.storage_options, - errors=self.encoding_errors, - ) - filepath_or_buffer = self.handles.handle - - try: - return ArrowJsonParserWrapper(filepath_or_buffer) - finally: - if self.handles is not None: - self.handles.close() - def _preprocess_data(self, data): """ At this point, the data either has a `read` attribute (e.g. a file @@ -973,7 +948,7 @@ def read(self) -> DataFrame | Series: obj: DataFrame | Series with self: if self.engine == "pyarrow": - obj = self._engine.read() + obj = self.data.read() if self.engine == "ujson": if self.lines: if self.chunksize: From eb709e7221f3d353c50dfb55e3f89f91e69838f1 Mon Sep 17 00:00:00 2001 From: Arda Kosar Date: Tue, 13 Dec 2022 02:13:57 -0500 Subject: [PATCH 07/16] Refactored pyarrow implementation to inline --- pandas/io/json/_json.py | 18 ++++++++--- pandas/io/json/arrow_json_parser_wrapper.py | 35 --------------------- pandas/tests/io/json/test_readlines.py | 3 +- 3 files changed, 16 insertions(+), 40 deletions(-) delete mode 100644 pandas/io/json/arrow_json_parser_wrapper.py diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index aab2b7a8d8128..86f145c59e968 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -40,6 +40,7 @@ StorageOptions, WriteBuffer, ) +from pandas.compat._optional import import_optional_dependency from pandas.errors import AbstractMethodError from pandas.util._decorators import doc @@ -76,7 +77,6 @@ build_table_schema, parse_table_schema, ) -from pandas.io.json.arrow_json_parser_wrapper import ArrowJsonParserWrapper from pandas.io.parsers.readers import validate_integer if TYPE_CHECKING: @@ -661,6 +661,10 @@ def read_json( .. versionadded:: 2.0 + engine : {{'ujson', 'pyarrow'}}, default "ujson" + Parser engine to use. + + Returns ------- Series or DataFrame @@ -843,6 +847,10 @@ def __init__( self.chunksize = validate_integer("chunksize", self.chunksize, 1) if not self.lines: raise ValueError("chunksize can only be passed if lines=True") + if self.engine == "pyarrow": + raise ValueError( + "currently pyarrow engine doesn't support chunksize parameter" + ) if self.nrows is not None: self.nrows = validate_integer("nrows", self.nrows, 0) if not self.lines: @@ -858,10 +866,10 @@ def __init__( f"The engine type {self.engine} is currently not supported." 
) - data = self._get_data_from_filepath(filepath_or_buffer) if self.engine == "pyarrow": - self.data = ArrowJsonParserWrapper(filepath_or_buffer) + self.data = filepath_or_buffer elif self.engine == "ujson": + data = self._get_data_from_filepath(filepath_or_buffer) self.data = self._preprocess_data(data) def _preprocess_data(self, data): @@ -948,7 +956,9 @@ def read(self) -> DataFrame | Series: obj: DataFrame | Series with self: if self.engine == "pyarrow": - obj = self.data.read() + pyarrow_json = import_optional_dependency("pyarrow.json") + table = pyarrow_json.read_json(self.data) + obj = table.to_pandas() if self.engine == "ujson": if self.lines: if self.chunksize: diff --git a/pandas/io/json/arrow_json_parser_wrapper.py b/pandas/io/json/arrow_json_parser_wrapper.py deleted file mode 100644 index 2980ed2f2772e..0000000000000 --- a/pandas/io/json/arrow_json_parser_wrapper.py +++ /dev/null @@ -1,35 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -from pandas._typing import ReadBuffer -from pandas.compat._optional import import_optional_dependency - -if TYPE_CHECKING: - from pandas import DataFrame - - -class ArrowJsonParserWrapper: - """ - Wrapper for the pyarrow engine for read_json() - """ - - def __init__(self, src: ReadBuffer[bytes]) -> None: - self.src = src - - def read(self) -> DataFrame: - """ - Reads the contents of a JSON file into a DataFrame and - processes it according to the kwargs passed in the - constructor. - - Returns - ------- - DataFrame - The DataFrame created from the JSON file. - """ - pyarrow_json = import_optional_dependency("pyarrow.json") - table = pyarrow_json.read_json(self.src) - - frame = table.to_pandas() - return frame diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py index 9dab7c54d9a72..3a7dd3cde5f29 100644 --- a/pandas/tests/io/json/test_readlines.py +++ b/pandas/tests/io/json/test_readlines.py @@ -34,6 +34,7 @@ def test_read_jsonl(engine): def test_read_jsonl_engine_pyarrow(json_dir_path, engine): + print(os.path.join(json_dir_path, "line_delimited.json")) result = read_json( os.path.join(json_dir_path, "line_delimited.json"), lines=True, @@ -114,8 +115,8 @@ def test_to_jsonl_count_new_lines(): # pyarrow takes file path as an input # and only supports line delimited json # and doesn't have a chunksize argument -@pytest.mark.parametrize("chunksize", [1, 1.0]) @xfail_json_engine_pyarrow +@pytest.mark.parametrize("chunksize", [1, 1.0]) def test_readjson_chunks(lines_json_df, chunksize, engine): # Basic test that read_json(chunks=True) gives the same result as # read_json(chunks=False) From 7eccd8323831e2abf68941c329a2e523bbaa4a10 Mon Sep 17 00:00:00 2001 From: Arda Kosar Date: Tue, 31 Jan 2023 23:58:17 -0500 Subject: [PATCH 08/16] Small refactors --- pandas/io/json/_json.py | 7 ++----- pandas/tests/io/json/test_readlines.py | 1 - 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index f1b1b2dc28781..d8d66ad9ebcb4 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -615,9 +615,6 @@ def read_json( .. versionadded:: 1.3.0 - engine : {{'ujson', 'pyarrow'}}, default "ujson" - Parser engine to use. - lines : bool, default False Read the file as a json object per line. 
@@ -863,7 +860,7 @@ def __init__( "currently pyarrow engine only supports " "the line-delimited JSON format" ) - if self.engine not in ["pyarrow", "ujson"]: + if self.engine not in {"pyarrow", "ujson"}: raise ValueError( f"The engine type {self.engine} is currently not supported." ) @@ -961,7 +958,7 @@ def read(self) -> DataFrame | Series: pyarrow_json = import_optional_dependency("pyarrow.json") table = pyarrow_json.read_json(self.data) obj = table.to_pandas() - if self.engine == "ujson": + elif self.engine == "ujson": if self.lines: if self.chunksize: obj = concat(self) diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py index 3a7dd3cde5f29..0701ebf9dee19 100644 --- a/pandas/tests/io/json/test_readlines.py +++ b/pandas/tests/io/json/test_readlines.py @@ -34,7 +34,6 @@ def test_read_jsonl_engine_pyarrow(json_dir_path, engine): def test_read_jsonl_engine_pyarrow(json_dir_path, engine): - print(os.path.join(json_dir_path, "line_delimited.json")) result = read_json( os.path.join(json_dir_path, "line_delimited.json"), lines=True, From b4409bb83234e5fb407b6998dc882311042b11ce Mon Sep 17 00:00:00 2001 From: Arda Kosar Date: Wed, 1 Feb 2023 12:04:45 -0500 Subject: [PATCH 09/16] Fixing double lines --- pandas/io/json/_json.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index d8d66ad9ebcb4..bf03510b45a37 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -663,7 +663,6 @@ def read_json( engine : {{'ujson', 'pyarrow'}}, default "ujson" Parser engine to use. - Returns ------- Series or DataFrame From 38bc7dbe83bff23e7528da214d197befa9cca64d Mon Sep 17 00:00:00 2001 From: Arda Kosar Date: Thu, 2 Feb 2023 09:49:57 -0500 Subject: [PATCH 10/16] - Added logic to skip tests if pyarrow is not installed. - Added a failure reason to each pyarrow xfail test. 
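A minimal sketch of the pattern this commit describes (illustrative only: test_example is hypothetical, and the real fixture and reason strings live in the conftest.py and test_readlines.py diffs below):

import pytest

@pytest.fixture(params=["ujson", "pyarrow"])
def engine(request):
    # Collection-time skip: a missing optional dependency becomes a skip,
    # not an import error, for every pyarrow-parametrized test.
    if request.param == "pyarrow":
        pytest.importorskip("pyarrow.json")
    return request.param

def test_example(request, engine):
    # Unsupported pyarrow cases stay visible in the report as xfail with an
    # explicit reason instead of being silently skipped.
    if engine == "pyarrow":
        reason = "pyarrow only supports a file path input and line-delimited JSON"
        request.node.add_marker(pytest.mark.xfail(reason=reason))
    assert engine in {"ujson", "pyarrow"}

importorskip handles the "pyarrow not installed" case once in the fixture, while the per-test xfail marker documents exactly which behavior the pyarrow engine cannot handle yet.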
--- pandas/tests/io/json/conftest.py | 4 +- pandas/tests/io/json/test_readlines.py | 131 ++++++++++++++++--------- 2 files changed, 89 insertions(+), 46 deletions(-) diff --git a/pandas/tests/io/json/conftest.py b/pandas/tests/io/json/conftest.py index b600064d08a46..66d9d199f429f 100644 --- a/pandas/tests/io/json/conftest.py +++ b/pandas/tests/io/json/conftest.py @@ -19,7 +19,9 @@ def json_dir_path(datapath): @pytest.fixture(params=["ujson", "pyarrow"]) def engine(request): - return request.param + if request.param == "pyarrow": + pytest.importorskip("pyarrow.json") + return request.param @pytest.fixture diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py index 0701ebf9dee19..9073afbd601cb 100644 --- a/pandas/tests/io/json/test_readlines.py +++ b/pandas/tests/io/json/test_readlines.py @@ -23,11 +23,13 @@ def lines_json_df(): return df.to_json(lines=True, orient="records") -# pyarrow takes file path as an input -# and only supports line delimited json @xfail_json_engine_pyarrow -def test_read_jsonl(engine): +def test_read_jsonl(request, engine): # GH9180 + if engine == "pyarrow": + reason = "Pyarrow only supports a file path as an input and line delimited json" + request.node.add_marker(pytest.mark.xfail(reason=reason)) + result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True, engine=engine) expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"]) tm.assert_frame_equal(result, expected) @@ -43,11 +45,13 @@ def test_read_jsonl_engine_pyarrow(json_dir_path, engine): tm.assert_frame_equal(result, expected) -# pyarrow takes file path as an input -# and only supports line delimited json @xfail_json_engine_pyarrow -def test_read_datetime(engine): +def test_read_datetime(request, engine): # GH33787 + if engine == "pyarrow": + reason = "Pyarrow only supports a file path as an input and line delimited json" + request.node.add_marker(pytest.mark.xfail(reason=reason)) + df = DataFrame( [([1, 2], ["2020-03-05", "2020-04-08T09:58:49+00:00"], "hector")], columns=["accounts", "date", "name"], @@ -61,12 +65,13 @@ def test_read_datetime(engine): tm.assert_frame_equal(result, expected) -# pyarrow takes file path as an input -# and only supports line delimited json @xfail_json_engine_pyarrow -def test_read_jsonl_unicode_chars(engine): +def test_read_jsonl_unicode_chars(request, engine): # GH15132: non-ascii unicode characters # \u201d == RIGHT DOUBLE QUOTATION MARK + if engine == "pyarrow": + reason = "Pyarrow only supports a file path as an input and line delimited json" + request.node.add_marker(pytest.mark.xfail(reason=reason)) # simulate file handle json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n' @@ -111,16 +116,20 @@ def test_to_jsonl_count_new_lines(): assert actual_new_lines_count == expected_new_lines_count -# pyarrow takes file path as an input -# and only supports line delimited json -# and doesn't have a chunksize argument @xfail_json_engine_pyarrow @pytest.mark.parametrize("chunksize", [1, 1.0]) -def test_readjson_chunks(lines_json_df, chunksize, engine): +def test_readjson_chunks(request, lines_json_df, chunksize, engine): # Basic test that read_json(chunks=True) gives the same result as # read_json(chunks=False) # GH17048: memory usage when lines=True + if engine == "pyarrow": + reason = ( + "Pyarrow only supports a file path as an input and line delimited json" + "and doesn't support chunksize parameter." 
+ ) + request.node.add_marker(pytest.mark.xfail(reason=reason)) + unchunked = read_json(StringIO(lines_json_df), lines=True) with read_json( StringIO(lines_json_df), lines=True, chunksize=chunksize, engine=engine @@ -139,11 +148,15 @@ def test_readjson_chunksize_requires_lines(lines_json_df, engine): pass -# pyarrow takes file path as an input -# and only supports line delimited json -# and doesn't have a chunksize argument @xfail_json_engine_pyarrow -def test_readjson_chunks_series(engine): +def test_readjson_chunks_series(request, engine): + if engine == "pyarrow": + reason = ( + "Pyarrow only supports a file path as an input and line delimited json" + "and doesn't support chunksize parameter." + ) + request.node.add_marker(pytest.mark.xfail(reason=reason)) + # Test reading line-format JSON to Series with chunksize param s = pd.Series({"A": 1, "B": 2}) @@ -159,11 +172,15 @@ def test_readjson_chunks_series(engine): tm.assert_series_equal(chunked, unchunked) -# pyarrow takes file path as an input -# and only supports line delimited json -# and doesn't have a chunksize argument @xfail_json_engine_pyarrow -def test_readjson_each_chunk(lines_json_df, engine): +def test_readjson_each_chunk(request, lines_json_df, engine): + if engine == "pyarrow": + reason = ( + "Pyarrow only supports a file path as an input and line delimited json" + "and doesn't support chunksize parameter." + ) + request.node.add_marker(pytest.mark.xfail(reason=reason)) + # Other tests check that the final result of read_json(chunksize=True) # is correct. This checks the intermediate chunks. with read_json( @@ -174,11 +191,15 @@ def test_readjson_each_chunk(lines_json_df, engine): assert chunks[1].shape == (1, 2) -# pyarrow takes file path as an input -# and only supports line delimited json -# and doesn't have a chunksize argument @xfail_json_engine_pyarrow -def test_readjson_chunks_from_file(engine): +def test_readjson_chunks_from_file(request, engine): + if engine == "pyarrow": + reason = ( + "Pyarrow only supports a file path as an input and line delimited json" + "and doesn't support chunksize parameter." + ) + request.node.add_marker(pytest.mark.xfail(reason=reason)) + with tm.ensure_clean("test.json") as path: df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) df.to_json(path, lines=True, orient="records") @@ -228,11 +249,15 @@ def test_readjson_invalid_chunksize(lines_json_df, chunksize, engine): @pytest.mark.parametrize("chunksize", [None, 1, 2]) -# pyarrow takes file path as an input -# and only supports line delimited json -# and doesn't have a chunksize argument @xfail_json_engine_pyarrow -def test_readjson_chunks_multiple_empty_lines(chunksize, engine): +def test_readjson_chunks_multiple_empty_lines(request, chunksize, engine): + if engine == "pyarrow": + reason = ( + "Pyarrow only supports a file path as an input and line delimited json" + "and doesn't support chunksize parameter." 
+ ) + request.node.add_marker(pytest.mark.xfail(reason=reason)) + j = """ {"A":1,"B":4} @@ -257,11 +282,15 @@ def test_readjson_chunks_multiple_empty_lines(chunksize, engine): tm.assert_frame_equal(orig, test, obj=f"chunksize: {chunksize}") -# pyarrow takes file path as an input -# and only supports line delimited json -# and doesn't have a chunksize argument @xfail_json_engine_pyarrow -def test_readjson_unicode(monkeypatch, engine): +def test_readjson_unicode(request, monkeypatch, engine): + if engine == "pyarrow": + reason = ( + "Pyarrow only supports a file path as an input and line delimited json" + "and doesn't support chunksize parameter." + ) + request.node.add_marker(pytest.mark.xfail(reason=reason)) + with tm.ensure_clean("test.json") as path: monkeypatch.setattr("locale.getpreferredencoding", lambda do_setlocale: "cp949") with open(path, "w", encoding="utf-8") as f: @@ -272,14 +301,18 @@ def test_readjson_unicode(monkeypatch, engine): tm.assert_frame_equal(result, expected) -# pyarrow takes file path as an input -# and only supports line delimited json -# and doesn't have a chunksize argument @xfail_json_engine_pyarrow @pytest.mark.parametrize("nrows", [1, 2]) -def test_readjson_nrows(nrows, engine): +def test_readjson_nrows(request, nrows, engine): # GH 33916 # Test reading line-format JSON to Series with nrows param + if engine == "pyarrow": + reason = ( + "Pyarrow only supports a file path as an input and line delimited json" + "and doesn't support chunksize parameter." + ) + request.node.add_marker(pytest.mark.xfail(reason=reason)) + jsonl = """{"a": 1, "b": 2} {"a": 3, "b": 4} {"a": 5, "b": 6} @@ -289,14 +322,18 @@ def test_readjson_nrows(nrows, engine): tm.assert_frame_equal(result, expected) -# pyarrow takes file path as an input -# and only supports line delimited json -# and doesn't have a chunksize argument @xfail_json_engine_pyarrow @pytest.mark.parametrize("nrows,chunksize", [(2, 2), (4, 2)]) -def test_readjson_nrows_chunks(nrows, chunksize, engine): +def test_readjson_nrows_chunks(request, nrows, chunksize, engine): # GH 33916 # Test reading line-format JSON to Series with nrows and chunksize param + if engine == "pyarrow": + reason = ( + "Pyarrow only supports a file path as an input and line delimited json" + "and doesn't support chunksize parameter." + ) + request.node.add_marker(pytest.mark.xfail(reason=reason)) + jsonl = """{"a": 1, "b": 2} {"a": 3, "b": 4} {"a": 5, "b": 6} @@ -321,13 +358,17 @@ def test_readjson_nrows_requires_lines(engine): read_json(jsonl, lines=False, nrows=2, engine=engine) -# pyarrow takes file path as an input -# and only supports line delimited json -# and doesn't have a chunksize argument @xfail_json_engine_pyarrow -def test_readjson_lines_chunks_fileurl(datapath, engine): +def test_readjson_lines_chunks_fileurl(request, datapath, engine): # GH 27135 # Test reading line-format JSON from file url + if engine == "pyarrow": + reason = ( + "Pyarrow only supports a file path as an input and line delimited json" + "and doesn't support chunksize parameter." 
+ ) + request.node.add_marker(pytest.mark.xfail(reason=reason)) + df_list_expected = [ DataFrame([[1, 2]], columns=["a", "b"], index=[0]), DataFrame([[3, 4]], columns=["a", "b"], index=[1]), From bed15df6618d35feab00ed9f0066becff07f09ac Mon Sep 17 00:00:00 2001 From: Arda Kosar Date: Thu, 2 Feb 2023 10:37:05 -0500 Subject: [PATCH 11/16] Added else statement to conftest engine --- pandas/tests/io/json/conftest.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/io/json/conftest.py b/pandas/tests/io/json/conftest.py index 66d9d199f429f..5488894e317e2 100644 --- a/pandas/tests/io/json/conftest.py +++ b/pandas/tests/io/json/conftest.py @@ -22,6 +22,8 @@ def engine(request): if request.param == "pyarrow": pytest.importorskip("pyarrow.json") return request.param + else: + return request.param @pytest.fixture From cdfd747c2638c246dbaae0ccaeeaaf3202eed57b Mon Sep 17 00:00:00 2001 From: Arda Kosar Date: Thu, 2 Feb 2023 13:23:52 -0500 Subject: [PATCH 12/16] - removed xfail decorators - removed xfail fixture from conftest.py --- pandas/tests/io/json/conftest.py | 11 ----------- pandas/tests/io/json/test_readlines.py | 14 -------------- 2 files changed, 25 deletions(-) diff --git a/pandas/tests/io/json/conftest.py b/pandas/tests/io/json/conftest.py index 5488894e317e2..8e2b167e576df 100644 --- a/pandas/tests/io/json/conftest.py +++ b/pandas/tests/io/json/conftest.py @@ -24,14 +24,3 @@ def engine(request): return request.param else: return request.param - - -@pytest.fixture -def json_engine_pyarrow_xfail(request): - """ - Fixture that xfails a test if the engine is pyarrow. - """ - engine = request.getfixturevalue("engine") - if engine == "pyarrow": - mark = pytest.mark.xfail(reason="pyarrow doesn't support this.") - request.node.add_marker(mark) diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py index 9073afbd601cb..beecb546ae321 100644 --- a/pandas/tests/io/json/test_readlines.py +++ b/pandas/tests/io/json/test_readlines.py @@ -14,8 +14,6 @@ from pandas.io.json._json import JsonReader -xfail_json_engine_pyarrow = pytest.mark.usefixtures("json_engine_pyarrow_xfail") - @pytest.fixture def lines_json_df(): @@ -23,7 +21,6 @@ def lines_json_df(): return df.to_json(lines=True, orient="records") -@xfail_json_engine_pyarrow def test_read_jsonl(request, engine): # GH9180 if engine == "pyarrow": @@ -45,7 +42,6 @@ def test_read_jsonl_engine_pyarrow(json_dir_path, engine): tm.assert_frame_equal(result, expected) -@xfail_json_engine_pyarrow def test_read_datetime(request, engine): # GH33787 if engine == "pyarrow": @@ -65,7 +61,6 @@ def test_read_datetime(request, engine): tm.assert_frame_equal(result, expected) -@xfail_json_engine_pyarrow def test_read_jsonl_unicode_chars(request, engine): # GH15132: non-ascii unicode characters # \u201d == RIGHT DOUBLE QUOTATION MARK @@ -116,7 +111,6 @@ def test_to_jsonl_count_new_lines(): assert actual_new_lines_count == expected_new_lines_count -@xfail_json_engine_pyarrow @pytest.mark.parametrize("chunksize", [1, 1.0]) def test_readjson_chunks(request, lines_json_df, chunksize, engine): # Basic test that read_json(chunks=True) gives the same result as @@ -148,7 +142,6 @@ def test_readjson_chunksize_requires_lines(lines_json_df, engine): pass -@xfail_json_engine_pyarrow def test_readjson_chunks_series(request, engine): if engine == "pyarrow": reason = ( @@ -172,7 +165,6 @@ def test_readjson_chunks_series(request, engine): tm.assert_series_equal(chunked, unchunked) -@xfail_json_engine_pyarrow def 
test_readjson_each_chunk(request, lines_json_df, engine): if engine == "pyarrow": reason = ( @@ -191,7 +183,6 @@ def test_readjson_each_chunk(request, lines_json_df, engine): assert chunks[1].shape == (1, 2) -@xfail_json_engine_pyarrow def test_readjson_chunks_from_file(request, engine): if engine == "pyarrow": reason = ( @@ -249,7 +240,6 @@ def test_readjson_invalid_chunksize(lines_json_df, chunksize, engine): @pytest.mark.parametrize("chunksize", [None, 1, 2]) -@xfail_json_engine_pyarrow def test_readjson_chunks_multiple_empty_lines(request, chunksize, engine): if engine == "pyarrow": reason = ( @@ -282,7 +272,6 @@ def test_readjson_chunks_multiple_empty_lines(request, chunksize, engine): tm.assert_frame_equal(orig, test, obj=f"chunksize: {chunksize}") -@xfail_json_engine_pyarrow def test_readjson_unicode(request, monkeypatch, engine): if engine == "pyarrow": reason = ( @@ -301,7 +290,6 @@ def test_readjson_unicode(request, monkeypatch, engine): tm.assert_frame_equal(result, expected) -@xfail_json_engine_pyarrow @pytest.mark.parametrize("nrows", [1, 2]) def test_readjson_nrows(request, nrows, engine): # GH 33916 @@ -322,7 +310,6 @@ def test_readjson_nrows(request, nrows, engine): tm.assert_frame_equal(result, expected) -@xfail_json_engine_pyarrow @pytest.mark.parametrize("nrows,chunksize", [(2, 2), (4, 2)]) def test_readjson_nrows_chunks(request, nrows, chunksize, engine): # GH 33916 @@ -358,7 +345,6 @@ def test_readjson_nrows_requires_lines(engine): read_json(jsonl, lines=False, nrows=2, engine=engine) -@xfail_json_engine_pyarrow def test_readjson_lines_chunks_fileurl(request, datapath, engine): # GH 27135 # Test reading line-format JSON from file url From ab7af44bdbfdfdedc5c0a205ca01c145bdb1348b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 9 Feb 2023 12:47:30 -0800 Subject: [PATCH 13/16] add whatsnew, address comments --- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/io/json/_json.py | 34 +++++++++++---- pandas/tests/io/json/conftest.py | 12 +----- pandas/tests/io/json/test_pandas.py | 7 ++++ pandas/tests/io/json/test_readlines.py | 58 ++++++++------------------ 5 files changed, 52 insertions(+), 60 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 9fd9faf057a8a..296c24469df4e 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -298,6 +298,7 @@ Other enhancements - Added :meth:`DatetimeIndex.as_unit` and :meth:`TimedeltaIndex.as_unit` to convert to different resolutions; supported resolutions are "s", "ms", "us", and "ns" (:issue:`50616`) - Added :meth:`Series.dt.unit` and :meth:`Series.dt.as_unit` to convert to different resolutions; supported resolutions are "s", "ms", "us", and "ns" (:issue:`51223`) - Added new argument ``dtype`` to :func:`read_sql` to be consistent with :func:`read_sql_query` (:issue:`50797`) +- Added new argument ``engine`` to :func:`read_json` to support parsing JSON with pyarrow by specifying ``engine="pyarrow"`` (:issue:`48893`) - Added support for SQLAlchemy 2.0 (:issue:`40686`) - diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 826186612d5ab..093d90f90c54a 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -21,7 +21,10 @@ import numpy as np -from pandas._config import using_nullable_dtypes +from pandas._config import ( + get_option, + using_nullable_dtypes, +) from pandas._libs import lib from pandas._libs.json import ( @@ -841,6 +844,10 @@ def __init__( self.handles: IOHandles[str] | 
None = None self.use_nullable_dtypes = use_nullable_dtypes + if self.engine not in {"pyarrow", "ujson"}: + raise ValueError( + f"The engine type {self.engine} is currently not supported." + ) if self.chunksize is not None: self.chunksize = validate_integer("chunksize", self.chunksize, 1) if not self.lines: @@ -859,12 +866,6 @@ def __init__( "currently pyarrow engine only supports " "the line-delimited JSON format" ) - if self.engine not in {"pyarrow", "ujson"}: - raise ValueError( - f"The engine type {self.engine} is currently not supported." - ) - - if self.engine == "pyarrow": self.data = filepath_or_buffer elif self.engine == "ujson": data = self._get_data_from_filepath(filepath_or_buffer) @@ -955,8 +956,23 @@ def read(self) -> DataFrame | Series: with self: if self.engine == "pyarrow": pyarrow_json = import_optional_dependency("pyarrow.json") - table = pyarrow_json.read_json(self.data) - obj = table.to_pandas() + pa_table = pyarrow_json.read_json(self.data) + if ( + self.use_nullable_dtypes + and get_option("mode.dtype_backend") == "pyarrow" + ): + from pandas.arrays import ArrowExtensionArray + + obj = DataFrame( + { + col_name: ArrowExtensionArray(pa_col) + for col_name, pa_col in zip( + pa_table.column_names, pa_table.itercolumns() + ) + } + ) + return obj + obj = pa_table.to_pandas() elif self.engine == "ujson": if self.lines: if self.chunksize: diff --git a/pandas/tests/io/json/conftest.py b/pandas/tests/io/json/conftest.py index 8e2b167e576df..f3736252e850a 100644 --- a/pandas/tests/io/json/conftest.py +++ b/pandas/tests/io/json/conftest.py @@ -9,18 +9,8 @@ def orient(request): return request.param -@pytest.fixture -def json_dir_path(datapath): - """ - The directory path to the data files needed for parser tests. - """ - return datapath("io", "json", "data") - - @pytest.fixture(params=["ujson", "pyarrow"]) def engine(request): if request.param == "pyarrow": pytest.importorskip("pyarrow.json") - return request.param - else: - return request.param + return request.param diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index f59e1e8cbe43d..9728afd09e8b5 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1956,3 +1956,10 @@ def test_read_json_nullable_series(self, string_storage, dtype_backend, orient): expected = Series(ArrowExtensionArray(pa.array(expected, from_pandas=True))) tm.assert_series_equal(result, expected) + + +def test_invalid_engine(): + ser = Series(range(1)) + out = ser.to_json() + with pytest.raises(ValueError, match="The engine type foo"): + read_json(out, engine="foo") diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py index beecb546ae321..e4d68688ebf28 100644 --- a/pandas/tests/io/json/test_readlines.py +++ b/pandas/tests/io/json/test_readlines.py @@ -1,5 +1,4 @@ from io import StringIO -import os from pathlib import Path from typing import Iterator @@ -21,20 +20,16 @@ def lines_json_df(): return df.to_json(lines=True, orient="records") -def test_read_jsonl(request, engine): +def test_read_jsonl(): # GH9180 - if engine == "pyarrow": - reason = "Pyarrow only supports a file path as an input and line delimited json" - request.node.add_marker(pytest.mark.xfail(reason=reason)) - - result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True, engine=engine) + result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True) expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"]) tm.assert_frame_equal(result, expected) -def 
test_read_jsonl_engine_pyarrow(json_dir_path, engine): +def test_read_jsonl_engine_pyarrow(datapath, engine): result = read_json( - os.path.join(json_dir_path, "line_delimited.json"), + datapath("io", "json", "data", "line_delimited.json"), lines=True, engine=engine, ) @@ -46,7 +41,7 @@ def test_read_datetime(request, engine): # GH33787 if engine == "pyarrow": reason = "Pyarrow only supports a file path as an input and line delimited json" - request.node.add_marker(pytest.mark.xfail(reason=reason)) + request.node.add_marker(pytest.mark.xfail(reason=reason, raises=ValueError)) df = DataFrame( [([1, 2], ["2020-03-05", "2020-04-08T09:58:49+00:00"], "hector")], @@ -61,23 +56,20 @@ def test_read_datetime(request, engine): tm.assert_frame_equal(result, expected) -def test_read_jsonl_unicode_chars(request, engine): +def test_read_jsonl_unicode_chars(): # GH15132: non-ascii unicode characters # \u201d == RIGHT DOUBLE QUOTATION MARK - if engine == "pyarrow": - reason = "Pyarrow only supports a file path as an input and line delimited json" - request.node.add_marker(pytest.mark.xfail(reason=reason)) # simulate file handle json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n' json = StringIO(json) - result = read_json(json, lines=True, engine=engine) + result = read_json(json, lines=True) expected = DataFrame([["foo\u201d", "bar"], ["foo", "bar"]], columns=["a", "b"]) tm.assert_frame_equal(result, expected) # simulate string json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n' - result = read_json(json, lines=True, engine=engine) + result = read_json(json, lines=True) expected = DataFrame([["foo\u201d", "bar"], ["foo", "bar"]], columns=["a", "b"]) tm.assert_frame_equal(result, expected) @@ -122,7 +114,7 @@ def test_readjson_chunks(request, lines_json_df, chunksize, engine): "Pyarrow only supports a file path as an input and line delimited json" "and doesn't support chunksize parameter." ) - request.node.add_marker(pytest.mark.xfail(reason=reason)) + request.node.add_marker(pytest.mark.xfail(reason=reason, raises=ValueError)) unchunked = read_json(StringIO(lines_json_df), lines=True) with read_json( @@ -171,7 +163,7 @@ def test_readjson_each_chunk(request, lines_json_df, engine): "Pyarrow only supports a file path as an input and line delimited json" "and doesn't support chunksize parameter." ) - request.node.add_marker(pytest.mark.xfail(reason=reason)) + request.node.add_marker(pytest.mark.xfail(reason=reason, raises=ValueError)) # Other tests check that the final result of read_json(chunksize=True) # is correct. This checks the intermediate chunks. @@ -189,7 +181,7 @@ def test_readjson_chunks_from_file(request, engine): "Pyarrow only supports a file path as an input and line delimited json" "and doesn't support chunksize parameter." ) - request.node.add_marker(pytest.mark.xfail(reason=reason)) + request.node.add_marker(pytest.mark.xfail(reason=reason, raises=ValueError)) with tm.ensure_clean("test.json") as path: df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) @@ -240,14 +232,7 @@ def test_readjson_invalid_chunksize(lines_json_df, chunksize, engine): @pytest.mark.parametrize("chunksize", [None, 1, 2]) -def test_readjson_chunks_multiple_empty_lines(request, chunksize, engine): - if engine == "pyarrow": - reason = ( - "Pyarrow only supports a file path as an input and line delimited json" - "and doesn't support chunksize parameter." 
- ) - request.node.add_marker(pytest.mark.xfail(reason=reason)) - +def test_readjson_chunks_multiple_empty_lines(chunksize): j = """ {"A":1,"B":4} @@ -265,7 +250,7 @@ def test_readjson_chunks_multiple_empty_lines(request, chunksize, engine): {"A":3,"B":6} """ orig = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) - test = read_json(j, lines=True, chunksize=chunksize, engine=engine) + test = read_json(j, lines=True, chunksize=chunksize) if chunksize is not None: with test: test = pd.concat(test) @@ -278,7 +263,7 @@ def test_readjson_unicode(request, monkeypatch, engine): "Pyarrow only supports a file path as an input and line delimited json" "and doesn't support chunksize parameter." ) - request.node.add_marker(pytest.mark.xfail(reason=reason)) + request.node.add_marker(pytest.mark.xfail(reason=reason, raises=ValueError)) with tm.ensure_clean("test.json") as path: monkeypatch.setattr("locale.getpreferredencoding", lambda do_setlocale: "cp949") @@ -291,21 +276,14 @@ def test_readjson_unicode(request, monkeypatch, engine): @pytest.mark.parametrize("nrows", [1, 2]) -def test_readjson_nrows(request, nrows, engine): +def test_readjson_nrows(nrows, engine): # GH 33916 # Test reading line-format JSON to Series with nrows param - if engine == "pyarrow": - reason = ( - "Pyarrow only supports a file path as an input and line delimited json" - "and doesn't support chunksize parameter." - ) - request.node.add_marker(pytest.mark.xfail(reason=reason)) - jsonl = """{"a": 1, "b": 2} {"a": 3, "b": 4} {"a": 5, "b": 6} {"a": 7, "b": 8}""" - result = read_json(jsonl, lines=True, nrows=nrows, engine=engine) + result = read_json(jsonl, lines=True, nrows=nrows) expected = DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows] tm.assert_frame_equal(result, expected) @@ -319,7 +297,7 @@ def test_readjson_nrows_chunks(request, nrows, chunksize, engine): "Pyarrow only supports a file path as an input and line delimited json" "and doesn't support chunksize parameter." ) - request.node.add_marker(pytest.mark.xfail(reason=reason)) + request.node.add_marker(pytest.mark.xfail(reason=reason, raises=ValueError)) jsonl = """{"a": 1, "b": 2} {"a": 3, "b": 4} @@ -353,7 +331,7 @@ def test_readjson_lines_chunks_fileurl(request, datapath, engine): "Pyarrow only supports a file path as an input and line delimited json" "and doesn't support chunksize parameter." ) - request.node.add_marker(pytest.mark.xfail(reason=reason)) + request.node.add_marker(pytest.mark.xfail(reason=reason, raises=ValueError)) df_list_expected = [ DataFrame([[1, 2]], columns=["a", "b"], index=[0]), From 8c9655346b2b6e0eab95310c305c93e2212a27e4 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 9 Feb 2023 14:53:18 -0800 Subject: [PATCH 14/16] address review --- doc/source/user_guide/io.rst | 10 ++++++ pandas/io/json/_json.py | 49 ++++++++++++++------------ pandas/tests/io/json/test_pandas.py | 1 + pandas/tests/io/json/test_readlines.py | 8 +++++ 4 files changed, 46 insertions(+), 22 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 1c3cdd9f4cffd..d654b1aed6836 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -2250,6 +2250,16 @@ For line-delimited json files, pandas can also return an iterator which reads in for chunk in reader: print(chunk) +Line-limited json can also be read using the pyarrow reader by specifying ``engine="pyarrow"``. + +.. 
ipython:: python + + from io import BytesIO + df = pd.read_json(BytesIO(jsonl.encode()), lines=True, engine="pyarrow") + df + +.. versionadded:: 2.0.0 + .. _io.table_schema: Table schema diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 093d90f90c54a..1494b11125319 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -663,8 +663,11 @@ def read_json( .. versionadded:: 2.0 - engine : {{'ujson', 'pyarrow'}}, default "ujson" - Parser engine to use. + engine : {{"ujson", "pyarrow"}}, default "ujson" + Parser engine to use. The ``"pyarrow"`` engine is only available when + ``lines=True``. + + .. versionadded:: 2.0 Returns ------- @@ -957,22 +960,24 @@ def read(self) -> DataFrame | Series: if self.engine == "pyarrow": pyarrow_json = import_optional_dependency("pyarrow.json") pa_table = pyarrow_json.read_json(self.data) - if ( - self.use_nullable_dtypes - and get_option("mode.dtype_backend") == "pyarrow" - ): - from pandas.arrays import ArrowExtensionArray - - obj = DataFrame( - { - col_name: ArrowExtensionArray(pa_col) - for col_name, pa_col in zip( - pa_table.column_names, pa_table.itercolumns() - ) - } - ) - return obj - obj = pa_table.to_pandas() + if self.use_nullable_dtypes: + if get_option("mode.dtype_backend") == "pyarrow": + from pandas.arrays import ArrowExtensionArray + + return DataFrame( + { + col_name: ArrowExtensionArray(pa_col) + for col_name, pa_col in zip( + pa_table.column_names, pa_table.itercolumns() + ) + } + ) + elif get_option("mode.dtype_backend") == "pandas": + from pandas.io._util import _arrow_dtype_mapping + + mapping = _arrow_dtype_mapping() + return pa_table.to_pandas(types_mapper=mapping.get) + return pa_table.to_pandas() elif self.engine == "ujson": if self.lines: if self.chunksize: @@ -987,10 +992,10 @@ def read(self) -> DataFrame | Series: obj = self._get_object_parser(self._combine_lines(data_lines)) else: obj = self._get_object_parser(self.data) - if self.use_nullable_dtypes: - return obj.convert_dtypes(infer_objects=False) - else: - return obj + if self.use_nullable_dtypes: + return obj.convert_dtypes(infer_objects=False) + else: + return obj def _get_object_parser(self, json) -> DataFrame | Series: """ diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 9728afd09e8b5..3e361ad6852f9 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1959,6 +1959,7 @@ def test_read_json_nullable_series(self, string_storage, dtype_backend, orient): def test_invalid_engine(): + # GH 48893 ser = Series(range(1)) out = ser.to_json() with pytest.raises(ValueError, match="The engine type foo"): diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py index e4d68688ebf28..9b36423be73dd 100644 --- a/pandas/tests/io/json/test_readlines.py +++ b/pandas/tests/io/json/test_readlines.py @@ -40,6 +40,7 @@ def test_read_jsonl_engine_pyarrow(datapath, engine): def test_read_datetime(request, engine): # GH33787 if engine == "pyarrow": + # GH 48893 reason = "Pyarrow only supports a file path as an input and line delimited json" request.node.add_marker(pytest.mark.xfail(reason=reason, raises=ValueError)) @@ -110,6 +111,7 @@ def test_readjson_chunks(request, lines_json_df, chunksize, engine): # GH17048: memory usage when lines=True if engine == "pyarrow": + # GH 48893 reason = ( "Pyarrow only supports a file path as an input and line delimited json" "and doesn't support chunksize parameter." 
@@ -136,6 +138,7 @@ def test_readjson_chunksize_requires_lines(lines_json_df, engine):

 def test_readjson_chunks_series(request, engine):
     if engine == "pyarrow":
+        # GH 48893
         reason = (
             "Pyarrow only supports a file path as an input and line delimited json"
             "and doesn't support chunksize parameter."
@@ -159,6 +162,7 @@ def test_readjson_chunks_series(request, engine):

 def test_readjson_each_chunk(request, lines_json_df, engine):
     if engine == "pyarrow":
+        # GH 48893
         reason = (
             "Pyarrow only supports a file path as an input and line delimited json"
             "and doesn't support chunksize parameter."
@@ -177,6 +181,7 @@ def test_readjson_each_chunk(request, lines_json_df, engine):

 def test_readjson_chunks_from_file(request, engine):
     if engine == "pyarrow":
+        # GH 48893
         reason = (
             "Pyarrow only supports a file path as an input and line delimited json"
             "and doesn't support chunksize parameter."
@@ -259,6 +264,7 @@ def test_readjson_chunks_multiple_empty_lines(chunksize):

 def test_readjson_unicode(request, monkeypatch, engine):
     if engine == "pyarrow":
+        # GH 48893
         reason = (
             "Pyarrow only supports a file path as an input and line delimited json"
             "and doesn't support chunksize parameter."
@@ -293,6 +299,7 @@ def test_readjson_nrows_chunks(request, nrows, chunksize, engine):
     # GH 33916
     # Test reading line-format JSON to Series with nrows and chunksize param
     if engine == "pyarrow":
+        # GH 48893
         reason = (
             "Pyarrow only supports a file path as an input and line delimited json"
             "and doesn't support chunksize parameter."
@@ -327,6 +334,7 @@ def test_readjson_lines_chunks_fileurl(request, datapath, engine):
     # GH 27135
     # Test reading line-format JSON from file url
     if engine == "pyarrow":
+        # GH 48893
         reason = (
             "Pyarrow only supports a file path as an input and line delimited json"
             "and doesn't support chunksize parameter."

From 9cbf598a58aa64b9a36135313c2a14efe4be1922 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 9 Feb 2023 15:25:58 -0800
Subject: [PATCH 15/16] Add note about param

---
 doc/source/user_guide/io.rst | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index d654b1aed6836..d0cd5c300248b 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -2069,6 +2069,8 @@ is ``None``. To explicitly force ``Series`` parsing, pass ``typ=series``
 * ``lines`` : reads file as one json object per line.
 * ``encoding`` : The encoding to use to decode py3 bytes.
 * ``chunksize`` : when used in combination with ``lines=True``, return a JsonReader which reads in ``chunksize`` lines per iteration.
+* ``engine``: Either ``"ujson"``, the built-in JSON parser, or ``"pyarrow"``, which dispatches to pyarrow's ``pyarrow.json.read_json``.
+  The ``"pyarrow"`` engine is only available when ``lines=True``.

 The parser will raise one of ``ValueError/TypeError/AssertionError`` if the JSON
 is not parseable.
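
As a usage illustration for the note added in PATCH 15 above, a minimal sketch (assuming pyarrow is installed; ``data.json`` is an illustrative path, not part of the patch):

    import pandas as pd

    # Write a small line-delimited file, then read it back with both engines.
    df = pd.DataFrame({"a": [1, 3], "b": [2, 4]})
    df.to_json("data.json", orient="records", lines=True)

    result_ujson = pd.read_json("data.json", lines=True, engine="ujson")
    result_pyarrow = pd.read_json("data.json", lines=True, engine="pyarrow")

    # For plain line-delimited input the two engines should agree.
    pd.testing.assert_frame_equal(result_ujson, result_pyarrow)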
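
The ``read()`` refactor in PATCH 14 also routes pyarrow output through the nullable-dtype machinery. A sketch of that path, reusing ``data.json`` from the previous sketch and the dev-era ``mode.dtype_backend`` option that the hunk reads via ``get_option`` (option names and availability follow this series, not necessarily later releases):

    import pandas as pd

    # With use_nullable_dtypes=True and the "pyarrow" backend, each column is
    # built from an ArrowExtensionArray rather than converted to NumPy.
    with pd.option_context("mode.dtype_backend", "pyarrow"):
        df = pd.read_json(
            "data.json", lines=True, engine="pyarrow", use_nullable_dtypes=True
        )
    print(df.dtypes)  # expected to show pyarrow-backed dtypes, e.g. int64[pyarrow]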
From c59310bd7e1e60a7a279c27e35f00d5152475922 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 9 Feb 2023 18:13:02 -0800 Subject: [PATCH 16/16] Add test with lines=false --- pandas/tests/io/json/test_pandas.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 3e361ad6852f9..db839353934b0 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1964,3 +1964,11 @@ def test_invalid_engine(): out = ser.to_json() with pytest.raises(ValueError, match="The engine type foo"): read_json(out, engine="foo") + + +def test_pyarrow_engine_lines_false(): + # GH 48893 + ser = Series(range(1)) + out = ser.to_json() + with pytest.raises(ValueError, match="currently pyarrow engine only supports"): + read_json(out, engine="pyarrow", lines=False)
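
A user-side sketch of the error path this final test pins down (the message fragment is the one asserted in the ``match`` above):

    import pandas as pd

    payload = pd.Series(range(1)).to_json()  # regular, non-line-delimited JSON
    try:
        pd.read_json(payload, engine="pyarrow", lines=False)
    except ValueError as err:
        # starts with "currently pyarrow engine only supports", per the test
        print(err)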