diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index 1c3cdd9f4cffd..d0cd5c300248b 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -2069,6 +2069,8 @@ is ``None``. To explicitly force ``Series`` parsing, pass ``typ=series``
 * ``lines`` : reads file as one json object per line.
 * ``encoding`` : The encoding to use to decode py3 bytes.
 * ``chunksize`` : when used in combination with ``lines=True``, return a JsonReader which reads in ``chunksize`` lines per iteration.
+* ``engine``: Either ``"ujson"``, the built-in JSON parser, or ``"pyarrow"``, which dispatches to pyarrow's ``pyarrow.json.read_json``.
+  The ``"pyarrow"`` engine is only available when ``lines=True``.
 
 The parser will raise one of ``ValueError/TypeError/AssertionError`` if the JSON
 is not parseable.
@@ -2250,6 +2252,16 @@ For line-delimited json files, pandas can also return an iterator which reads in
   for chunk in reader:
       print(chunk)
 
+Line-delimited JSON can also be read using the pyarrow reader by specifying ``engine="pyarrow"``.
+
+.. versionadded:: 2.0.0
+
+.. ipython:: python
+
+   from io import BytesIO
+   df = pd.read_json(BytesIO(jsonl.encode()), lines=True, engine="pyarrow")
+   df
+
 .. _io.table_schema:
 
 Table schema
diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index b006d3820889f..51b4d0468a19a 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -298,6 +298,7 @@ Other enhancements
 - Added :meth:`DatetimeIndex.as_unit` and :meth:`TimedeltaIndex.as_unit` to convert to different resolutions; supported resolutions are "s", "ms", "us", and "ns" (:issue:`50616`)
 - Added :meth:`Series.dt.unit` and :meth:`Series.dt.as_unit` to convert to different resolutions; supported resolutions are "s", "ms", "us", and "ns" (:issue:`51223`)
 - Added new argument ``dtype`` to :func:`read_sql` to be consistent with :func:`read_sql_query` (:issue:`50797`)
+- Added new argument ``engine`` to :func:`read_json` to support parsing JSON with pyarrow by specifying ``engine="pyarrow"`` (:issue:`48893`)
 - Added support for SQLAlchemy 2.0 (:issue:`40686`)
 -
 
diff --git a/pandas/_typing.py b/pandas/_typing.py
index 8d3044a978291..87979aba9ada4 100644
--- a/pandas/_typing.py
+++ b/pandas/_typing.py
@@ -324,6 +324,9 @@ def closed(self) -> bool:
 # read_csv engines
 CSVEngine = Literal["c", "python", "pyarrow", "python-fwf"]
 
+# read_json engines
+JSONEngine = Literal["ujson", "pyarrow"]
+
 # read_xml parsers
 XMLParsers = Literal["lxml", "etree"]
 
diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py
index 338e831ed184f..1494b11125319 100644
--- a/pandas/io/json/_json.py
+++ b/pandas/io/json/_json.py
@@ -21,7 +21,10 @@
 
 import numpy as np
 
-from pandas._config import using_nullable_dtypes
+from pandas._config import (
+    get_option,
+    using_nullable_dtypes,
+)
 
 from pandas._libs import lib
 from pandas._libs.json import (
@@ -34,11 +37,13 @@
     DtypeArg,
     FilePath,
     IndexLabel,
+    JSONEngine,
     JSONSerializable,
     ReadBuffer,
     StorageOptions,
     WriteBuffer,
 )
+from pandas.compat._optional import import_optional_dependency
 from pandas.errors import AbstractMethodError
 from pandas.util._decorators import doc
 
@@ -401,6 +406,7 @@ def read_json(
     nrows: int | None = ...,
     storage_options: StorageOptions = ...,
     use_nullable_dtypes: bool = ...,
+    engine: JSONEngine = ...,
 ) -> JsonReader[Literal["frame"]]:
     ...
@@ -425,6 +431,7 @@ def read_json( nrows: int | None = ..., storage_options: StorageOptions = ..., use_nullable_dtypes: bool = ..., + engine: JSONEngine = ..., ) -> JsonReader[Literal["series"]]: ... @@ -449,6 +456,7 @@ def read_json( nrows: int | None = ..., storage_options: StorageOptions = ..., use_nullable_dtypes: bool = ..., + engine: JSONEngine = ..., ) -> Series: ... @@ -473,6 +481,7 @@ def read_json( nrows: int | None = ..., storage_options: StorageOptions = ..., use_nullable_dtypes: bool = ..., + engine: JSONEngine = ..., ) -> DataFrame: ... @@ -500,6 +509,7 @@ def read_json( nrows: int | None = None, storage_options: StorageOptions = None, use_nullable_dtypes: bool | lib.NoDefault = lib.no_default, + engine: JSONEngine = "ujson", ) -> DataFrame | Series | JsonReader: """ Convert a JSON string to pandas object. @@ -653,6 +663,12 @@ def read_json( .. versionadded:: 2.0 + engine : {{"ujson", "pyarrow"}}, default "ujson" + Parser engine to use. The ``"pyarrow"`` engine is only available when + ``lines=True``. + + .. versionadded:: 2.0 + Returns ------- Series or DataFrame @@ -771,6 +787,7 @@ def read_json( storage_options=storage_options, encoding_errors=encoding_errors, use_nullable_dtypes=use_nullable_dtypes, + engine=engine, ) if chunksize: @@ -807,6 +824,7 @@ def __init__( storage_options: StorageOptions = None, encoding_errors: str | None = "strict", use_nullable_dtypes: bool = False, + engine: JSONEngine = "ujson", ) -> None: self.orient = orient @@ -818,6 +836,7 @@ def __init__( self.precise_float = precise_float self.date_unit = date_unit self.encoding = encoding + self.engine = engine self.compression = compression self.storage_options = storage_options self.lines = lines @@ -828,17 +847,32 @@ def __init__( self.handles: IOHandles[str] | None = None self.use_nullable_dtypes = use_nullable_dtypes + if self.engine not in {"pyarrow", "ujson"}: + raise ValueError( + f"The engine type {self.engine} is currently not supported." 
+            )
 
         if self.chunksize is not None:
             self.chunksize = validate_integer("chunksize", self.chunksize, 1)
             if not self.lines:
                 raise ValueError("chunksize can only be passed if lines=True")
+            if self.engine == "pyarrow":
+                raise ValueError(
+                    "currently the pyarrow engine doesn't support chunksize"
+                )
 
         if self.nrows is not None:
             self.nrows = validate_integer("nrows", self.nrows, 0)
             if not self.lines:
                 raise ValueError("nrows can only be passed if lines=True")
-
-        data = self._get_data_from_filepath(filepath_or_buffer)
-        self.data = self._preprocess_data(data)
+        if self.engine == "pyarrow":
+            if not self.lines:
+                raise ValueError(
+                    "currently the pyarrow engine only supports "
+                    "the line-delimited JSON format"
+                )
+            self.data = filepath_or_buffer
+        elif self.engine == "ujson":
+            data = self._get_data_from_filepath(filepath_or_buffer)
+            self.data = self._preprocess_data(data)
 
     def _preprocess_data(self, data):
         """
@@ -923,23 +957,45 @@ def read(self) -> DataFrame | Series:
         """
         obj: DataFrame | Series
         with self:
-            if self.lines:
-                if self.chunksize:
-                    obj = concat(self)
-                elif self.nrows:
-                    lines = list(islice(self.data, self.nrows))
-                    lines_json = self._combine_lines(lines)
-                    obj = self._get_object_parser(lines_json)
-                else:
-                    data = ensure_str(self.data)
-                    data_lines = data.split("\n")
-                    obj = self._get_object_parser(self._combine_lines(data_lines))
-            else:
-                obj = self._get_object_parser(self.data)
-            if self.use_nullable_dtypes:
-                return obj.convert_dtypes(infer_objects=False)
-            else:
-                return obj
+            if self.engine == "pyarrow":
+                pyarrow_json = import_optional_dependency("pyarrow.json")
+                pa_table = pyarrow_json.read_json(self.data)
+                if self.use_nullable_dtypes:
+                    if get_option("mode.dtype_backend") == "pyarrow":
+                        from pandas.arrays import ArrowExtensionArray
+
+                        return DataFrame(
+                            {
+                                col_name: ArrowExtensionArray(pa_col)
+                                for col_name, pa_col in zip(
+                                    pa_table.column_names, pa_table.itercolumns()
+                                )
+                            }
+                        )
+                    elif get_option("mode.dtype_backend") == "pandas":
+                        from pandas.io._util import _arrow_dtype_mapping
+
+                        mapping = _arrow_dtype_mapping()
+                        return pa_table.to_pandas(types_mapper=mapping.get)
+                return pa_table.to_pandas()
+            elif self.engine == "ujson":
+                if self.lines:
+                    if self.chunksize:
+                        obj = concat(self)
+                    elif self.nrows:
+                        lines = list(islice(self.data, self.nrows))
+                        lines_json = self._combine_lines(lines)
+                        obj = self._get_object_parser(lines_json)
+                    else:
+                        data = ensure_str(self.data)
+                        data_lines = data.split("\n")
+                        obj = self._get_object_parser(self._combine_lines(data_lines))
+                else:
+                    obj = self._get_object_parser(self.data)
+                if self.use_nullable_dtypes:
+                    return obj.convert_dtypes(infer_objects=False)
+                else:
+                    return obj
 
     def _get_object_parser(self, json) -> DataFrame | Series:
         """
diff --git a/pandas/tests/io/json/conftest.py b/pandas/tests/io/json/conftest.py
index 4e848cd48b42d..f3736252e850a 100644
--- a/pandas/tests/io/json/conftest.py
+++ b/pandas/tests/io/json/conftest.py
@@ -7,3 +7,11 @@ def orient(request):
     Fixture for orients excluding the table format.
""" return request.param + + +@pytest.fixture(params=["ujson", "pyarrow"]) +def engine(request): + if request.param == "pyarrow": + pytest.importorskip("pyarrow.json") + return request.param diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index f59e1e8cbe43d..db839353934b0 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1956,3 +1956,19 @@ def test_read_json_nullable_series(self, string_storage, dtype_backend, orient): expected = Series(ArrowExtensionArray(pa.array(expected, from_pandas=True))) tm.assert_series_equal(result, expected) + + +def test_invalid_engine(): + # GH 48893 + ser = Series(range(1)) + out = ser.to_json() + with pytest.raises(ValueError, match="The engine type foo"): + read_json(out, engine="foo") + + +def test_pyarrow_engine_lines_false(): + # GH 48893 + ser = Series(range(1)) + out = ser.to_json() + with pytest.raises(ValueError, match="currently pyarrow engine only supports"): + read_json(out, engine="pyarrow", lines=False) diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py index a76627fb08147..9b36423be73dd 100644 --- a/pandas/tests/io/json/test_readlines.py +++ b/pandas/tests/io/json/test_readlines.py @@ -27,14 +27,29 @@ def test_read_jsonl(): tm.assert_frame_equal(result, expected) -def test_read_datetime(): +def test_read_jsonl_engine_pyarrow(datapath, engine): + result = read_json( + datapath("io", "json", "data", "line_delimited.json"), + lines=True, + engine=engine, + ) + expected = DataFrame({"a": [1, 3, 5], "b": [2, 4, 6]}) + tm.assert_frame_equal(result, expected) + + +def test_read_datetime(request, engine): # GH33787 + if engine == "pyarrow": + # GH 48893 + reason = "Pyarrow only supports a file path as an input and line delimited json" + request.node.add_marker(pytest.mark.xfail(reason=reason, raises=ValueError)) + df = DataFrame( [([1, 2], ["2020-03-05", "2020-04-08T09:58:49+00:00"], "hector")], columns=["accounts", "date", "name"], ) json_line = df.to_json(lines=True, orient="records") - result = read_json(json_line) + result = read_json(json_line, engine=engine) expected = DataFrame( [[1, "2020-03-05", "hector"], [2, "2020-04-08T09:58:49+00:00", "hector"]], columns=["accounts", "date", "name"], @@ -90,55 +105,95 @@ def test_to_jsonl_count_new_lines(): @pytest.mark.parametrize("chunksize", [1, 1.0]) -def test_readjson_chunks(lines_json_df, chunksize): +def test_readjson_chunks(request, lines_json_df, chunksize, engine): # Basic test that read_json(chunks=True) gives the same result as # read_json(chunks=False) # GH17048: memory usage when lines=True + if engine == "pyarrow": + # GH 48893 + reason = ( + "Pyarrow only supports a file path as an input and line delimited json" + "and doesn't support chunksize parameter." 
+        )
+        request.node.add_marker(pytest.mark.xfail(reason=reason, raises=ValueError))
+
     unchunked = read_json(StringIO(lines_json_df), lines=True)
-    with read_json(StringIO(lines_json_df), lines=True, chunksize=chunksize) as reader:
+    with read_json(
+        StringIO(lines_json_df), lines=True, chunksize=chunksize, engine=engine
+    ) as reader:
         chunked = pd.concat(reader)
 
     tm.assert_frame_equal(chunked, unchunked)
 
 
-def test_readjson_chunksize_requires_lines(lines_json_df):
+def test_readjson_chunksize_requires_lines(lines_json_df, engine):
     msg = "chunksize can only be passed if lines=True"
     with pytest.raises(ValueError, match=msg):
-        with read_json(StringIO(lines_json_df), lines=False, chunksize=2) as _:
+        with read_json(
+            StringIO(lines_json_df), lines=False, chunksize=2, engine=engine
+        ) as _:
             pass
 
 
-def test_readjson_chunks_series():
+def test_readjson_chunks_series(request, engine):
+    if engine == "pyarrow":
+        # GH 48893
+        reason = (
+            "Pyarrow only supports a file path as an input and line-delimited JSON "
+            "and doesn't support the chunksize parameter."
+        )
+        request.node.add_marker(pytest.mark.xfail(reason=reason))
+
     # Test reading line-format JSON to Series with chunksize param
     s = pd.Series({"A": 1, "B": 2})
 
     strio = StringIO(s.to_json(lines=True, orient="records"))
-    unchunked = read_json(strio, lines=True, typ="Series")
+    unchunked = read_json(strio, lines=True, typ="Series", engine=engine)
 
     strio = StringIO(s.to_json(lines=True, orient="records"))
-    with read_json(strio, lines=True, typ="Series", chunksize=1) as reader:
+    with read_json(
+        strio, lines=True, typ="Series", chunksize=1, engine=engine
+    ) as reader:
         chunked = pd.concat(reader)
 
     tm.assert_series_equal(chunked, unchunked)
 
 
-def test_readjson_each_chunk(lines_json_df):
+def test_readjson_each_chunk(request, lines_json_df, engine):
+    if engine == "pyarrow":
+        # GH 48893
+        reason = (
+            "Pyarrow only supports a file path as an input and line-delimited JSON "
+            "and doesn't support the chunksize parameter."
+        )
+        request.node.add_marker(pytest.mark.xfail(reason=reason, raises=ValueError))
+
     # Other tests check that the final result of read_json(chunksize=True)
     # is correct. This checks the intermediate chunks.
-    with read_json(StringIO(lines_json_df), lines=True, chunksize=2) as reader:
+    with read_json(
+        StringIO(lines_json_df), lines=True, chunksize=2, engine=engine
+    ) as reader:
         chunks = list(reader)
     assert chunks[0].shape == (2, 2)
     assert chunks[1].shape == (1, 2)
 
 
-def test_readjson_chunks_from_file():
+def test_readjson_chunks_from_file(request, engine):
+    if engine == "pyarrow":
+        # GH 48893
+        reason = (
+            "Pyarrow only supports a file path as an input and line-delimited JSON "
+            "and doesn't support the chunksize parameter."
+        )
+        request.node.add_marker(pytest.mark.xfail(reason=reason, raises=ValueError))
+
     with tm.ensure_clean("test.json") as path:
         df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
         df.to_json(path, lines=True, orient="records")
-        with read_json(path, lines=True, chunksize=1) as reader:
+        with read_json(path, lines=True, chunksize=1, engine=engine) as reader:
             chunked = pd.concat(reader)
-        unchunked = read_json(path, lines=True)
+        unchunked = read_json(path, lines=True, engine=engine)
         tm.assert_frame_equal(unchunked, chunked)
 
 
@@ -171,11 +226,13 @@ def test_readjson_chunks_closes(chunksize):
 
 
 @pytest.mark.parametrize("chunksize", [0, -1, 2.2, "foo"])
-def test_readjson_invalid_chunksize(lines_json_df, chunksize):
+def test_readjson_invalid_chunksize(lines_json_df, chunksize, engine):
     msg = r"'chunksize' must be an integer >=1"
 
     with pytest.raises(ValueError, match=msg):
-        with read_json(StringIO(lines_json_df), lines=True, chunksize=chunksize) as _:
+        with read_json(
+            StringIO(lines_json_df), lines=True, chunksize=chunksize, engine=engine
+        ) as _:
             pass
 
 
@@ -205,19 +262,27 @@ def test_readjson_chunks_multiple_empty_lines(chunksize):
     tm.assert_frame_equal(orig, test, obj=f"chunksize: {chunksize}")
 
 
-def test_readjson_unicode(monkeypatch):
+def test_readjson_unicode(request, monkeypatch, engine):
+    if engine == "pyarrow":
+        # GH 48893
+        reason = (
+            "Pyarrow only supports a file path as an input and line-delimited JSON "
+            "and doesn't support the chunksize parameter."
+        )
+        request.node.add_marker(pytest.mark.xfail(reason=reason, raises=ValueError))
+
     with tm.ensure_clean("test.json") as path:
         monkeypatch.setattr("locale.getpreferredencoding", lambda do_setlocale: "cp949")
         with open(path, "w", encoding="utf-8") as f:
             f.write('{"£©µÀÆÖÞßéöÿ":["АБВГДабвгд가"]}')
 
-        result = read_json(path)
+        result = read_json(path, engine=engine)
         expected = DataFrame({"£©µÀÆÖÞßéöÿ": ["АБВГДабвгд가"]})
         tm.assert_frame_equal(result, expected)
 
 
 @pytest.mark.parametrize("nrows", [1, 2])
-def test_readjson_nrows(nrows):
+def test_readjson_nrows(nrows, engine):
     # GH 33916
     # Test reading line-format JSON to Series with nrows param
     jsonl = """{"a": 1, "b": 2}
@@ -230,20 +295,30 @@
 
 
 @pytest.mark.parametrize("nrows,chunksize", [(2, 2), (4, 2)])
-def test_readjson_nrows_chunks(nrows, chunksize):
+def test_readjson_nrows_chunks(request, nrows, chunksize, engine):
     # GH 33916
     # Test reading line-format JSON to Series with nrows and chunksize param
+    if engine == "pyarrow":
+        # GH 48893
+        reason = (
+            "Pyarrow only supports a file path as an input and line-delimited JSON "
+            "and doesn't support the chunksize parameter."
+        )
+        request.node.add_marker(pytest.mark.xfail(reason=reason, raises=ValueError))
+
     jsonl = """{"a": 1, "b": 2}
 {"a": 3, "b": 4}
 {"a": 5, "b": 6}
 {"a": 7, "b": 8}"""
-    with read_json(jsonl, lines=True, nrows=nrows, chunksize=chunksize) as reader:
+    with read_json(
+        jsonl, lines=True, nrows=nrows, chunksize=chunksize, engine=engine
+    ) as reader:
         chunked = pd.concat(reader)
     expected = DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows]
     tm.assert_frame_equal(chunked, expected)
 
 
-def test_readjson_nrows_requires_lines():
+def test_readjson_nrows_requires_lines(engine):
     # GH 33916
     # Test ValuError raised if nrows is set without setting lines in read_json
     jsonl = """{"a": 1, "b": 2}
@@ -252,12 +327,20 @@
 {"a": 7, "b": 8}"""
     msg = "nrows can only be passed if lines=True"
     with pytest.raises(ValueError, match=msg):
-        read_json(jsonl, lines=False, nrows=2)
+        read_json(jsonl, lines=False, nrows=2, engine=engine)
 
 
-def test_readjson_lines_chunks_fileurl(datapath):
+def test_readjson_lines_chunks_fileurl(request, datapath, engine):
     # GH 27135
     # Test reading line-format JSON from file url
+    if engine == "pyarrow":
+        # GH 48893
+        reason = (
+            "Pyarrow only supports a file path as an input and line-delimited JSON "
+            "and doesn't support the chunksize parameter."
+        )
+        request.node.add_marker(pytest.mark.xfail(reason=reason, raises=ValueError))
+
     df_list_expected = [
         DataFrame([[1, 2]], columns=["a", "b"], index=[0]),
         DataFrame([[3, 4]], columns=["a", "b"], index=[1]),
@@ -265,7 +348,7 @@
         DataFrame([[5, 6]], columns=["a", "b"], index=[2]),
     ]
     os_path = datapath("io", "json", "data", "line_delimited.json")
     file_url = Path(os_path).as_uri()
-    with read_json(file_url, lines=True, chunksize=1) as url_reader:
+    with read_json(file_url, lines=True, chunksize=1, engine=engine) as url_reader:
         for index, chuck in enumerate(url_reader):
             tm.assert_frame_equal(chuck, df_list_expected[index])
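
For reference, a minimal end-to-end sketch of the new keyword introduced by this patch, assuming a pandas build that includes this change plus an installed pyarrow; the file name is illustrative, not part of the change:

```python
import pandas as pd

# Write a small line-delimited JSON file, then read it back with the new
# engine keyword. engine="pyarrow" requires lines=True and dispatches to
# pyarrow.json.read_json; the default engine="ujson" keeps prior behavior.
df = pd.DataFrame({"a": [1, 3, 5], "b": [2, 4, 6]})
df.to_json("data.jsonl", orient="records", lines=True)  # illustrative path

result = pd.read_json("data.jsonl", lines=True, engine="pyarrow")
print(result)
```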