diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index b22590759ea3f..02e290ea28051 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -41,6 +41,7 @@ The ``use_nullable_dtypes`` keyword argument has been expanded to the following * :func:`read_excel` * :func:`read_html` * :func:`read_xml` +* :func:`read_json` * :func:`read_sql` * :func:`read_sql_query` * :func:`read_sql_table` @@ -55,6 +56,7 @@ to select the nullable dtypes implementation. * :func:`read_excel` * :func:`read_html` * :func:`read_xml` +* :func:`read_json` * :func:`read_parquet` * :func:`read_orc` diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index c501cad721ef5..aa1342d0f135f 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -43,6 +43,7 @@ ensure_str, is_period_dtype, ) +from pandas.core.dtypes.generic import ABCIndex from pandas import ( DataFrame, @@ -396,6 +397,7 @@ def read_json( compression: CompressionOptions = ..., nrows: int | None = ..., storage_options: StorageOptions = ..., + use_nullable_dtypes: bool = ..., ) -> JsonReader[Literal["frame"]]: ... @@ -419,6 +421,7 @@ def read_json( compression: CompressionOptions = ..., nrows: int | None = ..., storage_options: StorageOptions = ..., + use_nullable_dtypes: bool = ..., ) -> JsonReader[Literal["series"]]: ... @@ -442,6 +445,7 @@ def read_json( compression: CompressionOptions = ..., nrows: int | None = ..., storage_options: StorageOptions = ..., + use_nullable_dtypes: bool = ..., ) -> Series: ... @@ -465,6 +469,7 @@ def read_json( compression: CompressionOptions = ..., nrows: int | None = ..., storage_options: StorageOptions = ..., + use_nullable_dtypes: bool = ..., ) -> DataFrame: ... @@ -491,6 +496,7 @@ def read_json( compression: CompressionOptions = "infer", nrows: int | None = None, storage_options: StorageOptions = None, + use_nullable_dtypes: bool = False, ) -> DataFrame | Series | JsonReader: """ Convert a JSON string to pandas object. 
@@ -629,6 +635,19 @@ def read_json( .. versionadded:: 1.2.0 + use_nullable_dtypes : bool, default False + Whether or not to use nullable dtypes as default when reading data. If + set to True, nullable dtypes are used for all dtypes that have a nullable + implementation, even if no nulls are present. + + The nullable dtype implementation can be configured by calling + ``pd.set_option("mode.dtype_backend", "pandas")`` to use + numpy-backed nullable dtypes or + ``pd.set_option("mode.dtype_backend", "pyarrow")`` to use + pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``). + + .. versionadded:: 2.0 + Returns ------- Series or DataFrame @@ -740,6 +759,7 @@ def read_json( nrows=nrows, storage_options=storage_options, encoding_errors=encoding_errors, + use_nullable_dtypes=use_nullable_dtypes, ) if chunksize: @@ -775,6 +795,7 @@ def __init__( nrows: int | None, storage_options: StorageOptions = None, encoding_errors: str | None = "strict", + use_nullable_dtypes: bool = False, ) -> None: self.orient = orient @@ -794,6 +815,7 @@ def __init__( self.nrows = nrows self.encoding_errors = encoding_errors self.handles: IOHandles[str] | None = None + self.use_nullable_dtypes = use_nullable_dtypes if self.chunksize is not None: self.chunksize = validate_integer("chunksize", self.chunksize, 1) @@ -903,7 +925,10 @@ def read(self) -> DataFrame | Series: obj = self._get_object_parser(self._combine_lines(data_lines)) else: obj = self._get_object_parser(self.data) - return obj + if self.use_nullable_dtypes: + return obj.convert_dtypes(infer_objects=False) + else: + return obj def _get_object_parser(self, json) -> DataFrame | Series: """ @@ -919,6 +944,7 @@ def _get_object_parser(self, json) -> DataFrame | Series: "keep_default_dates": self.keep_default_dates, "precise_float": self.precise_float, "date_unit": self.date_unit, + "use_nullable_dtypes": self.use_nullable_dtypes, } obj = None if typ == "frame": @@ -977,7 +1003,10 @@ def __next__(self) -> DataFrame | Series: self.close() raise ex 
- return obj + if self.use_nullable_dtypes: + return obj.convert_dtypes(infer_objects=False) + else: + return obj def __enter__(self) -> JsonReader[FrameSeriesStrT]: return self @@ -1013,6 +1042,7 @@ def __init__( keep_default_dates: bool = False, precise_float: bool = False, date_unit=None, + use_nullable_dtypes: bool = False, ) -> None: self.json = json @@ -1037,6 +1067,7 @@ def __init__( self.date_unit = date_unit self.keep_default_dates = keep_default_dates self.obj: DataFrame | Series | None = None + self.use_nullable_dtypes = use_nullable_dtypes def check_keys_split(self, decoded) -> None: """ @@ -1119,7 +1150,10 @@ def _try_convert_data( if result: return new_data, True - if data.dtype == "object": + if self.use_nullable_dtypes and not isinstance(data, ABCIndex): + # Fall through for conversion later on + return data, True + elif data.dtype == "object": # try float try: diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index d1de676a4eb2e..f675c34e2779a 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -15,6 +15,7 @@ import pandas as pd from pandas import ( + NA, DataFrame, DatetimeIndex, Series, @@ -22,6 +23,10 @@ read_json, ) import pandas._testing as tm +from pandas.core.arrays import ( + ArrowStringArray, + StringArray, +) def assert_json_roundtrip_equal(result, expected, orient): @@ -1866,3 +1871,88 @@ def test_json_uint64(self): df = DataFrame(data={"col1": [13342205958987758245, 12388075603347835679]}) result = df.to_json(orient="split") assert result == expected + + @pytest.mark.parametrize("dtype_backend", ["pandas", "pyarrow"]) + @pytest.mark.parametrize( + "orient", ["split", "records", "values", "index", "columns"] + ) + def test_read_json_nullable(self, string_storage, dtype_backend, orient): + # GH#50750 + pa = pytest.importorskip("pyarrow") + df = DataFrame( + { + "a": Series([1, np.nan, 3], dtype="Int64"), + "b": Series([1, 2, 3], dtype="Int64"), + "c": 
Series([1.5, np.nan, 2.5], dtype="Float64"), + "d": Series([1.5, 2.0, 2.5], dtype="Float64"), + "e": [True, False, None], + "f": [True, False, True], + "g": ["a", "b", "c"], + "h": ["a", "b", None], + } + ) + + if string_storage == "python": + string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_)) + string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_)) + + else: + string_array = ArrowStringArray(pa.array(["a", "b", "c"])) + string_array_na = ArrowStringArray(pa.array(["a", "b", None])) + + out = df.to_json(orient=orient) + with pd.option_context("mode.string_storage", string_storage): + with pd.option_context("mode.dtype_backend", dtype_backend): + result = read_json(out, use_nullable_dtypes=True, orient=orient) + + expected = DataFrame( + { + "a": Series([1, np.nan, 3], dtype="Int64"), + "b": Series([1, 2, 3], dtype="Int64"), + "c": Series([1.5, np.nan, 2.5], dtype="Float64"), + "d": Series([1.5, 2.0, 2.5], dtype="Float64"), + "e": Series([True, False, NA], dtype="boolean"), + "f": Series([True, False, True], dtype="boolean"), + "g": string_array, + "h": string_array_na, + } + ) + + if dtype_backend == "pyarrow": + + from pandas.arrays import ArrowExtensionArray + + expected = DataFrame( + { + col: ArrowExtensionArray(pa.array(expected[col], from_pandas=True)) + for col in expected.columns + } + ) + + if orient == "values": + expected.columns = list(range(0, 8)) + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("dtype_backend", ["pandas", "pyarrow"]) + @pytest.mark.parametrize("orient", ["split", "records", "index"]) + def test_read_json_nullable_series(self, string_storage, dtype_backend, orient): + # GH#50750 + pa = pytest.importorskip("pyarrow") + ser = Series([1, np.nan, 3], dtype="Int64") + + out = ser.to_json(orient=orient) + with pd.option_context("mode.string_storage", string_storage): + with pd.option_context("mode.dtype_backend", dtype_backend): + result = read_json( + out, 
use_nullable_dtypes=True, orient=orient, typ="series" + ) + + expected = Series([1, np.nan, 3], dtype="Int64") + + if dtype_backend == "pyarrow": + from pandas.arrays import ArrowExtensionArray + + expected = Series(ArrowExtensionArray(pa.array(expected, from_pandas=True))) + + tm.assert_series_equal(result, expected)