Skip to content

Commit 87a2b9a

Browse files
committed
added ArrowJsonParser and tests
1 parent edbac36 commit 87a2b9a

File tree

5 files changed

+210
-15
lines changed

5 files changed

+210
-15
lines changed

pandas/_typing.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -319,6 +319,9 @@ def closed(self) -> bool:
319319
# read_csv engines
320320
CSVEngine = Literal["c", "python", "pyarrow", "python-fwf"]
321321

322+
# read_json engines
323+
JSONEngine = Literal["ujson", "pyarrow"]
324+
322325
# read_xml parsers
323326
XMLParsers = Literal["lxml", "etree"]
324327

pandas/io/json/_json.py

Lines changed: 67 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
DtypeArg,
2929
FilePath,
3030
IndexLabel,
31+
JSONEngine,
3132
JSONSerializable,
3233
ReadBuffer,
3334
StorageOptions,
@@ -70,6 +71,7 @@
7071
build_table_schema,
7172
parse_table_schema,
7273
)
74+
from pandas.io.json.arrow_json_parser_wrapper import ArrowJsonParserWrapper
7375
from pandas.io.parsers.readers import validate_integer
7476

7577
if TYPE_CHECKING:
@@ -394,6 +396,7 @@ def read_json(
394396
date_unit: str | None = ...,
395397
encoding: str | None = ...,
396398
encoding_errors: str | None = ...,
399+
engine: JSONEngine = ...,
397400
lines: bool = ...,
398401
chunksize: int,
399402
compression: CompressionOptions = ...,
@@ -417,6 +420,7 @@ def read_json(
417420
date_unit: str | None = ...,
418421
encoding: str | None = ...,
419422
encoding_errors: str | None = ...,
423+
engine: JSONEngine = ...,
420424
lines: bool = ...,
421425
chunksize: int,
422426
compression: CompressionOptions = ...,
@@ -440,6 +444,7 @@ def read_json(
440444
date_unit: str | None = ...,
441445
encoding: str | None = ...,
442446
encoding_errors: str | None = ...,
447+
engine: JSONEngine = ...,
443448
lines: bool = ...,
444449
chunksize: None = ...,
445450
compression: CompressionOptions = ...,
@@ -462,6 +467,7 @@ def read_json(
462467
date_unit: str | None = ...,
463468
encoding: str | None = ...,
464469
encoding_errors: str | None = ...,
470+
engine: JSONEngine = ...,
465471
lines: bool = ...,
466472
chunksize: None = ...,
467473
compression: CompressionOptions = ...,
@@ -488,6 +494,7 @@ def read_json(
488494
date_unit: str | None = None,
489495
encoding: str | None = None,
490496
encoding_errors: str | None = "strict",
497+
engine: JSONEngine = "ujson",
491498
lines: bool = False,
492499
chunksize: int | None = None,
493500
compression: CompressionOptions = "infer",
@@ -609,6 +616,9 @@ def read_json(
609616
610617
.. versionadded:: 1.3.0
611618
619+
engine : {{'ujson', 'pyarrow'}}, default "ujson"
620+
Parser engine to use.
621+
612622
lines : bool, default False
613623
Read the file as a json object per line.
614624
@@ -744,6 +754,7 @@ def read_json(
744754
precise_float=precise_float,
745755
date_unit=date_unit,
746756
encoding=encoding,
757+
engine=engine,
747758
lines=lines,
748759
chunksize=chunksize,
749760
compression=compression,
@@ -786,6 +797,7 @@ def __init__(
786797
nrows: int | None,
787798
storage_options: StorageOptions = None,
788799
encoding_errors: str | None = "strict",
800+
engine: JSONEngine = "ujson",
789801
) -> None:
790802

791803
self.orient = orient
@@ -797,6 +809,7 @@ def __init__(
797809
self.precise_float = precise_float
798810
self.date_unit = date_unit
799811
self.encoding = encoding
812+
self.engine = engine
800813
self.compression = compression
801814
self.storage_options = storage_options
802815
self.lines = lines
@@ -814,9 +827,45 @@ def __init__(
814827
self.nrows = validate_integer("nrows", self.nrows, 0)
815828
if not self.lines:
816829
raise ValueError("nrows can only be passed if lines=True")
830+
if self.engine == "pyarrow":
831+
if not self.lines:
832+
raise ValueError(
833+
"currently pyarrow engine only supports "
834+
"the line-delimited JSON format"
835+
)
817836

818-
data = self._get_data_from_filepath(filepath_or_buffer)
819-
self.data = self._preprocess_data(data)
837+
if self.engine == "pyarrow":
838+
self._engine = self._make_engine(filepath_or_buffer, self.engine)
839+
if self.engine == "ujson":
840+
data = self._get_data_from_filepath(filepath_or_buffer)
841+
self.data = self._preprocess_data(data)
842+
843+
def _make_engine(
    self,
    filepath_or_buffer: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
    engine: JSONEngine = "pyarrow",
) -> ArrowJsonParserWrapper:
    """
    Open ``filepath_or_buffer`` and wrap it in an ``ArrowJsonParserWrapper``.

    Parameters
    ----------
    filepath_or_buffer : FilePath | ReadBuffer[str] | ReadBuffer[bytes]
        Path or open buffer holding the line-delimited JSON input.
    engine : JSONEngine, default "pyarrow"
        NOTE(review): this parameter is never read inside the method body —
        only the pyarrow wrapper is ever constructed. Confirm whether it
        should be validated or removed.

    Returns
    -------
    ArrowJsonParserWrapper
        Parser wrapper around the (possibly newly opened) binary handle.
    """

    if not isinstance(filepath_or_buffer, list):
        # pyarrow consumes raw bytes, so the input is always opened in
        # binary mode rather than decoded text.
        is_text = False
        mode = "rb"
        self.handles = get_handle(
            filepath_or_buffer,
            mode=mode,
            encoding=self.encoding,
            is_text=is_text,
            compression=self.compression,
            storage_options=self.storage_options,
            errors=self.encoding_errors,
        )
        filepath_or_buffer = self.handles.handle

    try:
        return ArrowJsonParserWrapper(filepath_or_buffer)
    except Exception:
        # Close the handle opened above so it does not leak on failure.
        # NOTE(review): when filepath_or_buffer is a list, self.handles is
        # not assigned in this method — confirm it is initialized elsewhere
        # before this except path can run.
        if self.handles is not None:
            self.handles.close()
        raise
820869

821870
def _preprocess_data(self, data):
822871
"""
@@ -900,20 +949,23 @@ def read(self) -> DataFrame | Series:
900949
Read the whole JSON input into a pandas object.
901950
"""
902951
obj: DataFrame | Series
903-
if self.lines:
904-
if self.chunksize:
905-
obj = concat(self)
906-
elif self.nrows:
907-
lines = list(islice(self.data, self.nrows))
908-
lines_json = self._combine_lines(lines)
909-
obj = self._get_object_parser(lines_json)
952+
if self.engine == "pyarrow":
953+
obj = self._engine.read()
954+
if self.engine == "ujson":
955+
if self.lines:
956+
if self.chunksize:
957+
obj = concat(self)
958+
elif self.nrows:
959+
lines = list(islice(self.data, self.nrows))
960+
lines_json = self._combine_lines(lines)
961+
obj = self._get_object_parser(lines_json)
962+
else:
963+
data = ensure_str(self.data)
964+
data_lines = data.split("\n")
965+
obj = self._get_object_parser(self._combine_lines(data_lines))
910966
else:
911-
data = ensure_str(self.data)
912-
data_lines = data.split("\n")
913-
obj = self._get_object_parser(self._combine_lines(data_lines))
914-
else:
915-
obj = self._get_object_parser(self.data)
916-
self.close()
967+
obj = self._get_object_parser(self.data)
968+
self.close()
917969
return obj
918970

919971
def _get_object_parser(self, json) -> DataFrame | Series:
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
from __future__ import annotations
2+
3+
from typing import TYPE_CHECKING
4+
5+
from pandas._typing import ReadBuffer
6+
from pandas.compat._optional import import_optional_dependency
7+
8+
from pandas.core.dtypes.inference import is_integer
9+
10+
if TYPE_CHECKING:
11+
from pandas import DataFrame
12+
13+
14+
class ArrowJsonParserWrapper:
    """
    Wrapper for the pyarrow engine for read_json().

    Parameters
    ----------
    src : ReadBuffer[bytes]
        Binary handle (or path-like accepted by ``pyarrow.json.read_json``)
        containing line-delimited JSON.
    """

    def __init__(self, src: ReadBuffer[bytes]) -> None:
        super().__init__()
        # The source is only stored here; nothing is parsed until read().
        self.src = src

    def _parse_kwd(self) -> None:
        """
        Validates keywords before passing to pyarrow
        """
        # NOTE(review): placeholder — no keyword validation is implemented yet.
        ...

    def _get_pyarrow_options(self) -> None:
        # NOTE(review): placeholder — translation of pandas kwargs to pyarrow
        # read/parse options is not implemented yet.
        ...

    def read(self) -> DataFrame:
        """
        Reads the contents of a JSON file into a DataFrame and
        processes it according to the kwargs passed in the
        constructor.

        Returns
        -------
        DataFrame
            The DataFrame created from the JSON file.
        """
        # Imported lazily so pandas keeps working when pyarrow is absent;
        # import_optional_dependency raises a helpful error otherwise.
        pyarrow_json = import_optional_dependency("pyarrow.json")
        table = pyarrow_json.read_json(self.src)

        frame = table.to_pandas()
        return frame

    def _finalize_output(self, frame: DataFrame) -> DataFrame:
        """
        Processes data read in based on kwargs.

        Parameters
        ----------
        frame : DataFrame
            The DataFrame to process.

        Returns
        -------
        DataFrame
            The processed DataFrame.
        """
        # NOTE(review): this method reads self.header, self.names, self.prefix,
        # self.index_col and self.kwds, and calls self._do_date_conversions —
        # none of which are defined on this class (__init__ only sets
        # self.src). It is also never called from read(). It looks copied from
        # the CSV ArrowParserWrapper; confirm intent before wiring it in.
        num_cols = len(frame.columns)
        multi_index_named = True
        if self.header is None:
            if self.names is None:
                if self.prefix is not None:
                    self.names = [f"{self.prefix}{i}" for i in range(num_cols)]
                elif self.header is None:
                    # NOTE(review): this condition is always True inside the
                    # enclosing `if self.header is None` — dead check?
                    self.names = range(num_cols)
            if len(self.names) != num_cols:
                # usecols is passed through to pyarrow, we only handle index col here
                # The only way self.names is not the same length as number of cols is
                # if we have int index_col. We should just pad the names(they will get
                # removed anyways) to expected length then.
                self.names = list(range(num_cols - len(self.names))) + self.names
                multi_index_named = False
            frame.columns = self.names
        # we only need the frame not the names
        frame.columns, frame = self._do_date_conversions(frame.columns, frame)
        if self.index_col is not None:
            for i, item in enumerate(self.index_col):
                if is_integer(item):
                    self.index_col[i] = frame.columns[item]
                else:
                    # String case
                    if item not in frame.columns:
                        raise ValueError(f"Index {item} invalid")
            frame.set_index(self.index_col, drop=True, inplace=True)
            # Clear names if headerless and no name given
            if self.header is None and not multi_index_named:
                frame.index.names = [None] * len(frame.index.names)

        if self.kwds.get("dtype") is not None:
            try:
                frame = frame.astype(self.kwds.get("dtype"))
            except TypeError as e:
                # GH#44901 reraise to keep api consistent
                raise ValueError(e)
        return frame

pandas/tests/io/json/conftest.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,11 @@ def orient(request):
77
Fixture for orients excluding the table format.
88
"""
99
return request.param
10+
11+
12+
@pytest.fixture
def json_dir_path(datapath):
    """
    Path of the directory holding the JSON data files used by parser tests.
    """
    parts = ("io", "json", "data")
    return datapath(*parts)

pandas/tests/io/json/test_readlines.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from io import StringIO
2+
import os
23
from pathlib import Path
34
from typing import Iterator
45

@@ -27,6 +28,37 @@ def test_read_jsonl():
2728
tm.assert_frame_equal(result, expected)
2829

2930

31+
def test_read_jsonl_engine_pyarrow(json_dir_path):
    # line_delimited.json contains:
    # '{"a": 1, "b": 2}\n{"a": 3, "b": 4}\n{"a": 5, "b": 6}'
    path = os.path.join(json_dir_path, "line_delimited.json")
    actual = read_json(path, lines=True, engine="pyarrow")
    expected = DataFrame({"a": [1, 3, 5], "b": [2, 4, 6]})
    tm.assert_frame_equal(actual, expected)
41+
42+
43+
@pytest.mark.xfail(
    raises=ValueError,
    reason="pyarrow engine currently supports only line-delimited JSON "
    "(lines=True)",
)
def test_read_jsonl_engine_pyarrow_lines_false(json_dir_path):
    # engine="pyarrow" with the default lines=False is rejected by JsonReader
    # with a ValueError; this xfail documents the gap until non-line-delimited
    # input is supported.
    result = read_json(
        os.path.join(json_dir_path, "line_delimited.json"),
        engine="pyarrow",
    )
    expected = DataFrame({"a": [1, 3, 5], "b": [2, 4, 6]})
    tm.assert_frame_equal(result, expected)
51+
52+
53+
@pytest.mark.xfail(
    raises=ValueError,
    reason="pyarrow engine currently supports only line-delimited JSON "
    "(lines=True)",
)
def test_read_jsonl_engine_pyarrow_json_string():
    # A raw JSON string with the default lines=False is rejected by JsonReader
    # with a ValueError when engine="pyarrow"; xfail documents the unsupported
    # combination until it is implemented.
    result = read_json(
        '{"a": 1, "b": 2}, {"a": 3, "b": 4}, {"a": 5, "b": 6}', engine="pyarrow"
    )
    expected = DataFrame({"a": [1, 3, 5], "b": [2, 4, 6]})
    tm.assert_frame_equal(result, expected)
60+
61+
3062
def test_read_datetime():
3163
# GH33787
3264
df = DataFrame(

0 commit comments

Comments
 (0)