@@ -28,6 +28,7 @@
     DtypeArg,
     FilePath,
     IndexLabel,
+    JSONEngine,
     JSONSerializable,
     ReadBuffer,
     StorageOptions,
@@ -70,6 +71,7 @@
     build_table_schema,
     parse_table_schema,
 )
+from pandas.io.json.arrow_json_parser_wrapper import ArrowJsonParserWrapper
 from pandas.io.parsers.readers import validate_integer
 
 if TYPE_CHECKING:
@@ -394,6 +396,7 @@ def read_json(
     date_unit: str | None = ...,
     encoding: str | None = ...,
     encoding_errors: str | None = ...,
+    engine: JSONEngine = ...,
     lines: bool = ...,
     chunksize: int,
     compression: CompressionOptions = ...,
@@ -417,6 +420,7 @@ def read_json(
     date_unit: str | None = ...,
     encoding: str | None = ...,
     encoding_errors: str | None = ...,
+    engine: JSONEngine = ...,
     lines: bool = ...,
     chunksize: int,
     compression: CompressionOptions = ...,
@@ -440,6 +444,7 @@ def read_json(
     date_unit: str | None = ...,
     encoding: str | None = ...,
     encoding_errors: str | None = ...,
+    engine: JSONEngine = ...,
     lines: bool = ...,
     chunksize: None = ...,
     compression: CompressionOptions = ...,
@@ -462,6 +467,7 @@ def read_json(
     date_unit: str | None = ...,
     encoding: str | None = ...,
     encoding_errors: str | None = ...,
+    engine: JSONEngine = ...,
     lines: bool = ...,
     chunksize: None = ...,
     compression: CompressionOptions = ...,
@@ -488,6 +494,7 @@ def read_json(
     date_unit: str | None = None,
     encoding: str | None = None,
     encoding_errors: str | None = "strict",
+    engine: JSONEngine = "ujson",
     lines: bool = False,
     chunksize: int | None = None,
     compression: CompressionOptions = "infer",
@@ -609,6 +616,9 @@ def read_json(
 
         .. versionadded:: 1.3.0
 
+    engine : {{'ujson', 'pyarrow'}}, default "ujson"
+        Parser engine to use.
+
     lines : bool, default False
         Read the file as a json object per line.
 
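A minimal usage sketch, assuming the signature and docstring changes above are applied; the file name "data.jsonl" is hypothetical, and the pyarrow engine additionally requires lines=True (see the check added in JsonReader.__init__ further down):

import pandas as pd

# Hypothetical newline-delimited JSON file; engine="ujson" remains the default.
df_arrow = pd.read_json("data.jsonl", lines=True, engine="pyarrow")
df_ujson = pd.read_json("data.jsonl", lines=True)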
@@ -744,6 +754,7 @@ def read_json(
         precise_float=precise_float,
         date_unit=date_unit,
         encoding=encoding,
+        engine=engine,
         lines=lines,
         chunksize=chunksize,
         compression=compression,
@@ -786,6 +797,7 @@ def __init__(
         nrows: int | None,
         storage_options: StorageOptions = None,
         encoding_errors: str | None = "strict",
+        engine: JSONEngine = "ujson",
     ) -> None:
 
         self.orient = orient
@@ -797,6 +809,7 @@ def __init__(
         self.precise_float = precise_float
         self.date_unit = date_unit
         self.encoding = encoding
+        self.engine = engine
         self.compression = compression
         self.storage_options = storage_options
         self.lines = lines
@@ -814,9 +827,45 @@ def __init__(
             self.nrows = validate_integer("nrows", self.nrows, 0)
             if not self.lines:
                 raise ValueError("nrows can only be passed if lines=True")
+        if self.engine == "pyarrow":
+            if not self.lines:
+                raise ValueError(
+                    "currently pyarrow engine only supports "
+                    "the line-delimited JSON format"
+                )
 
-        data = self._get_data_from_filepath(filepath_or_buffer)
-        self.data = self._preprocess_data(data)
+        if self.engine == "pyarrow":
+            self._engine = self._make_engine(filepath_or_buffer, self.engine)
+        if self.engine == "ujson":
+            data = self._get_data_from_filepath(filepath_or_buffer)
+            self.data = self._preprocess_data(data)
+
+    def _make_engine(
+        self,
+        filepath_or_buffer: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
+        engine: JSONEngine = "pyarrow",
+    ) -> ArrowJsonParserWrapper:
+
+        if not isinstance(filepath_or_buffer, list):
+            is_text = False
+            mode = "rb"
+            self.handles = get_handle(
+                filepath_or_buffer,
+                mode=mode,
+                encoding=self.encoding,
+                is_text=is_text,
+                compression=self.compression,
+                storage_options=self.storage_options,
+                errors=self.encoding_errors,
+            )
+            filepath_or_buffer = self.handles.handle
+
+        try:
+            return ArrowJsonParserWrapper(filepath_or_buffer)
+        except Exception:
+            if self.handles is not None:
+                self.handles.close()
+            raise
 
     def _preprocess_data(self, data):
         """
@@ -900,20 +949,23 @@ def read(self) -> DataFrame | Series:
         Read the whole JSON input into a pandas object.
         """
         obj: DataFrame | Series
-        if self.lines:
-            if self.chunksize:
-                obj = concat(self)
-            elif self.nrows:
-                lines = list(islice(self.data, self.nrows))
-                lines_json = self._combine_lines(lines)
-                obj = self._get_object_parser(lines_json)
+        if self.engine == "pyarrow":
+            obj = self._engine.read()
+        if self.engine == "ujson":
+            if self.lines:
+                if self.chunksize:
+                    obj = concat(self)
+                elif self.nrows:
+                    lines = list(islice(self.data, self.nrows))
+                    lines_json = self._combine_lines(lines)
+                    obj = self._get_object_parser(lines_json)
+                else:
+                    data = ensure_str(self.data)
+                    data_lines = data.split("\n")
+                    obj = self._get_object_parser(self._combine_lines(data_lines))
             else:
-                data = ensure_str(self.data)
-                data_lines = data.split("\n")
-                obj = self._get_object_parser(self._combine_lines(data_lines))
-        else:
-            obj = self._get_object_parser(self.data)
-        self.close()
+                obj = self._get_object_parser(self.data)
+            self.close()
         return obj
 
     def _get_object_parser(self, json) -> DataFrame | Series:
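For context, the pyarrow branch of read() above delegates parsing to ArrowJsonParserWrapper, whose implementation is not shown in this diff. A rough sketch of what that read amounts to, assuming the wrapper builds on pyarrow.json.read_json (an assumption, not the wrapper's actual code):

import pyarrow.json as pa_json

def read_ldjson_with_pyarrow(path_or_handle):
    # Parse newline-delimited JSON into an Arrow table, then convert to pandas.
    table = pa_json.read_json(path_or_handle)
    return table.to_pandas()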