diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index 63e6b007f77a8..1d56b4fbd48b5 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -1918,6 +1918,7 @@ with optional parameters:
 * ``date_unit`` : The time unit to encode to, governs timestamp and ISO8601 precision. One of 's', 'ms', 'us' or 'ns' for seconds, milliseconds, microseconds and nanoseconds respectively. Default 'ms'.
 * ``default_handler`` : The handler to call if an object cannot otherwise be converted to a suitable format for JSON. Takes a single argument, which is the object to convert, and returns a serializable object.
 * ``lines`` : If ``records`` orient, then will write each record per line as json.
+* ``mode`` : string, writer mode when writing to path. 'w' for write, 'a' for append. Default 'w'
 
 Note ``NaN``'s, ``NaT``'s and ``None`` will be converted to ``null`` and ``datetime`` objects
 will be converted based on the ``date_format`` and ``date_unit`` parameters.
diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index e57ba92267855..366e1f77d7caf 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -37,6 +37,7 @@ Other enhancements
 - Added metadata propagation for binary operators on :class:`DataFrame` (:issue:`28283`)
 - :class:`.CategoricalConversionWarning`, :class:`.InvalidComparison`, :class:`.InvalidVersion`, :class:`.LossySetitemError`, and :class:`.NoBufferPresent` are now exposed in ``pandas.errors`` (:issue:`27656`)
 - :func:`DataFrame.astype` exception message thrown improved to include column name when type conversion is not possible. (:issue:`47571`)
+- :meth:`DataFrame.to_json` now supports a ``mode`` keyword with supported inputs 'w' and 'a'. Defaulting to 'w', 'a' can be used when lines=True and orient='records' to append record oriented json lines to an existing json file. (:issue:`35849`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_200.notable_bug_fixes:
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 4f3c908817ac7..d3239db116b40 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -2412,6 +2412,7 @@ def to_json(
         index: bool_t = True,
         indent: int | None = None,
         storage_options: StorageOptions = None,
+        mode: Literal["a", "w"] = "w",
     ) -> str | None:
         """
         Convert the object to a JSON string.
@@ -2490,6 +2491,11 @@ def to_json(
 
             .. versionadded:: 1.2.0
 
+        mode : str, default 'w' (writing)
+            Specify the IO mode for output when supplying a path_or_buf.
+            Accepted args are 'w' (writing) and 'a' (append) only.
+            mode='a' is only supported when lines is True and orient is 'records'.
+
         Returns
         -------
         None or str
@@ -2673,6 +2679,7 @@ def to_json(
             index=index,
             indent=indent,
             storage_options=storage_options,
+            mode=mode,
         )
 
     @final
diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py
index 9b8364c449e36..5231fb2d39ca3 100644
--- a/pandas/io/json/_json.py
+++ b/pandas/io/json/_json.py
@@ -99,6 +99,7 @@ def to_json(
     index: bool = ...,
     indent: int = ...,
     storage_options: StorageOptions = ...,
+    mode: Literal["a", "w"] = ...,
 ) -> None:
     ...
 
@@ -118,6 +119,7 @@ def to_json(
     index: bool = ...,
     indent: int = ...,
     storage_options: StorageOptions = ...,
+    mode: Literal["a", "w"] = ...,
 ) -> str:
     ...
 
@@ -136,6 +138,7 @@ def to_json(
     index: bool = True,
     indent: int = 0,
     storage_options: StorageOptions = None,
+    mode: Literal["a", "w"] = "w",
 ) -> str | None:
 
     if not index and orient not in ["split", "table"]:
@@ -146,6 +149,20 @@ def to_json(
     if lines and orient != "records":
         raise ValueError("'lines' keyword only valid when 'orient' is records")
 
+    if mode not in ["a", "w"]:
+        msg = (
+            f"mode={mode} is not a valid option. "
+            "Only 'w' and 'a' are currently supported."
+        )
+        raise ValueError(msg)
+
+    if mode == "a" and (not lines or orient != "records"):
+        msg = (
+            "mode='a' (append) is only supported when "
+            "lines is True and orient is 'records'"
+        )
+        raise ValueError(msg)
+
     if orient == "table" and isinstance(obj, Series):
         obj = obj.to_frame(name=obj.name or "values")
 
@@ -177,7 +194,7 @@ def to_json(
     if path_or_buf is not None:
         # apply compression and byte/text conversion
         with get_handle(
-            path_or_buf, "w", compression=compression, storage_options=storage_options
+            path_or_buf, mode, compression=compression, storage_options=storage_options
         ) as handles:
             handles.handle.write(s)
     else:
diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py
index b371990178d28..4b815203e2651 100644
--- a/pandas/tests/io/json/test_readlines.py
+++ b/pandas/tests/io/json/test_readlines.py
@@ -297,3 +297,142 @@ def __iter__(self) -> Iterator:
     reader = MyReader(jsonl)
     assert len(list(read_json(reader, lines=True, chunksize=100))) > 1
     assert reader.read_count > 10
+
+
+@pytest.mark.parametrize("orient_", ["split", "index", "table"])
+def test_to_json_append_orient(orient_):
+    # GH 35849
+    # Test ValueError when orient is not 'records'
+    df = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
+    msg = (
+        r"mode='a' \(append\) is only supported when "
+        "lines is True and orient is 'records'"
+    )
+    with pytest.raises(ValueError, match=msg):
+        df.to_json(mode="a", orient=orient_)
+
+
+def test_to_json_append_lines():
+    # GH 35849
+    # Test ValueError when lines is not True
+    df = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
+    msg = (
+        r"mode='a' \(append\) is only supported when "
+        "lines is True and orient is 'records'"
+    )
+    with pytest.raises(ValueError, match=msg):
+        df.to_json(mode="a", lines=False, orient="records")
+
+
+@pytest.mark.parametrize("mode_", ["r", "x"])
+def test_to_json_append_mode(mode_):
+    # GH 35849
+    # Test ValueError when mode is not a supported option
+    df = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
+    msg = (
+        f"mode={mode_} is not a valid option. "
+        "Only 'w' and 'a' are currently supported."
+    )
+    with pytest.raises(ValueError, match=msg):
+        df.to_json(mode=mode_, lines=False, orient="records")
+
+
+def test_to_json_append_output_consistent_columns():
+    # GH 35849
+    # Testing that resulting output reads in as expected.
+    # Testing same columns, new rows
+    df1 = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
+    df2 = DataFrame({"col1": [3, 4], "col2": ["c", "d"]})
+
+    expected = DataFrame({"col1": [1, 2, 3, 4], "col2": ["a", "b", "c", "d"]})
+    with tm.ensure_clean("test.json") as path:
+        # Save dataframes to the same file
+        df1.to_json(path, lines=True, orient="records")
+        df2.to_json(path, mode="a", lines=True, orient="records")
+
+        # Read path file
+        result = read_json(path, lines=True)
+        tm.assert_frame_equal(result, expected)
+
+
+def test_to_json_append_output_inconsistent_columns():
+    # GH 35849
+    # Testing that resulting output reads in as expected.
+    # Testing one new column, one old column, new rows
+    df1 = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
+    df3 = DataFrame({"col2": ["e", "f"], "col3": ["!", "#"]})
+
+    expected = DataFrame(
+        {
+            "col1": [1, 2, None, None],
+            "col2": ["a", "b", "e", "f"],
+            "col3": [None, None, "!", "#"],
+        }
+    )
+    with tm.ensure_clean("test.json") as path:
+        # Save dataframes to the same file
+        df1.to_json(path, mode="a", lines=True, orient="records")
+        df3.to_json(path, mode="a", lines=True, orient="records")
+
+        # Read path file
+        result = read_json(path, lines=True)
+        tm.assert_frame_equal(result, expected)
+
+
+def test_to_json_append_output_different_columns():
+    # GH 35849
+    # Testing that resulting output reads in as expected.
+    # Testing same, differing and new columns
+    df1 = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
+    df2 = DataFrame({"col1": [3, 4], "col2": ["c", "d"]})
+    df3 = DataFrame({"col2": ["e", "f"], "col3": ["!", "#"]})
+    df4 = DataFrame({"col4": [True, False]})
+
+    expected = DataFrame(
+        {
+            "col1": [1, 2, 3, 4, None, None, None, None],
+            "col2": ["a", "b", "c", "d", "e", "f", None, None],
+            "col3": [None, None, None, None, "!", "#", None, None],
+            "col4": [None, None, None, None, None, None, True, False],
+        }
+    )
+    with tm.ensure_clean("test.json") as path:
+        # Save dataframes to the same file
+        df1.to_json(path, mode="a", lines=True, orient="records")
+        df2.to_json(path, mode="a", lines=True, orient="records")
+        df3.to_json(path, mode="a", lines=True, orient="records")
+        df4.to_json(path, mode="a", lines=True, orient="records")
+
+        # Read path file
+        result = read_json(path, lines=True)
+        tm.assert_frame_equal(result, expected)
+
+
+def test_to_json_append_output_different_columns_reordered():
+    # GH 35849
+    # Testing that resulting output reads in as expected.
+    # Testing specific result column order.
+    df1 = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
+    df2 = DataFrame({"col1": [3, 4], "col2": ["c", "d"]})
+    df3 = DataFrame({"col2": ["e", "f"], "col3": ["!", "#"]})
+    df4 = DataFrame({"col4": [True, False]})
+
+    # df4, df3, df2, df1 (in that order)
+    expected = DataFrame(
+        {
+            "col4": [True, False, None, None, None, None, None, None],
+            "col2": [None, None, "e", "f", "c", "d", "a", "b"],
+            "col3": [None, None, "!", "#", None, None, None, None],
+            "col1": [None, None, None, None, 3, 4, 1, 2],
+        }
+    )
+    with tm.ensure_clean("test.json") as path:
+        # Save dataframes to the same file
+        df4.to_json(path, mode="a", lines=True, orient="records")
+        df3.to_json(path, mode="a", lines=True, orient="records")
+        df2.to_json(path, mode="a", lines=True, orient="records")
+        df1.to_json(path, mode="a", lines=True, orient="records")
+
+        # Read path file
+        result = read_json(path, lines=True)
+        tm.assert_frame_equal(result, expected)
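
For reviewers, a minimal usage sketch of the new keyword, distilled from the docstring and tests in this patch (the file name "data.json" is illustrative only and not part of the change). Append mode is only accepted together with lines=True and orient="records"; any other combination, or any mode other than 'w'/'a', raises ValueError.

    import pandas as pd

    df1 = pd.DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
    df2 = pd.DataFrame({"col1": [3, 4], "col2": ["c", "d"]})

    # First write creates the file; the second call appends extra JSON-lines records.
    df1.to_json("data.json", lines=True, orient="records")
    df2.to_json("data.json", mode="a", lines=True, orient="records")

    # Reading the file back yields the concatenated records.
    result = pd.read_json("data.json", lines=True)
    print(result)

    # Invalid combinations raise ValueError, as exercised by the tests above:
    # df1.to_json("data.json", mode="a", orient="split")  # append requires lines/records
    # df1.to_json("data.json", mode="r")                  # only 'w' and 'a' are accepted

Columns that are missing from some of the appended records come back as missing values (NaN/None) when the file is read with read_json(lines=True), as covered by test_to_json_append_output_inconsistent_columns.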