Merge pull request #1 from SFuller4/to-json-append-mode

SFuller4 · web-flow · commit b5d049cc734a · 2022-09-13T22:27:28.000-05:00
To json append mode
diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
@@ -1910,6 +1910,7 @@ with optional parameters:
 * ``date_unit`` : The time unit to encode to, governs timestamp and ISO8601 precision. One of 's', 'ms', 'us' or 'ns' for seconds, milliseconds, microseconds and nanoseconds respectively. Default 'ms'.
 * ``default_handler`` : The handler to call if an object cannot otherwise be converted to a suitable format for JSON. Takes a single argument, which is the object to convert, and returns a serializable object.
 * ``lines`` : If ``records`` orient, then will write each record per line as json.
+* ``mode`` : string, the IO mode used when writing to a path. ``'w'`` for write, ``'a'`` for append (only supported when ``lines=True`` and ``orient='records'``). Default ``'w'``.
 
 Note ``NaN``'s, ``NaT``'s and ``None`` will be converted to ``null`` and ``datetime`` objects will be converted based on the ``date_format`` and ``date_unit`` parameters.
 
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
@@ -333,6 +333,7 @@ Other enhancements
 - :meth:`DataFrame.set_index` now supports a ``copy`` keyword. If ``False``, the underlying data is not copied when a new :class:`DataFrame` is returned (:issue:`48043`)
 - The method :meth:`.ExtensionArray.factorize` accepts ``use_na_sentinel=False`` for determining how null values are to be treated (:issue:`46601`)
 - The ``Dockerfile`` now installs a dedicated ``pandas-dev`` virtual environment for pandas development instead of using the ``base`` environment (:issue:`48427`)
+- :meth:`DataFrame.to_json` now supports a ``mode`` keyword accepting ``'w'`` (write, the default) and ``'a'`` (append). ``mode='a'`` is only supported when ``lines=True`` and ``orient='records'``, and appends record-oriented JSON lines to an existing file (:issue:`35849`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_150.notable_bug_fixes:
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -2400,6 +2400,7 @@ def to_json(
         index: bool_t = True,
         indent: int | None = None,
         storage_options: StorageOptions = None,
+        mode: str = "w",
     ) -> str | None:
         """
         Convert the object to a JSON string.
@@ -2478,6 +2479,11 @@ def to_json(
 
             .. versionadded:: 1.2.0
 
+        mode : str, default 'w' (writing)
+            Specify the IO mode for output when supplying a path_or_buf.
+            Accepted args are 'w' (writing) and 'a' (append) only.
+            mode='a' is only supported when lines is True and orient is 'records'.
+
         Returns
         -------
         None or str
@@ -2661,6 +2667,7 @@ def to_json(
             index=index,
             indent=indent,
             storage_options=storage_options,
+            mode=mode,
         )
 
     @final
diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py
@@ -136,6 +136,7 @@ def to_json(
     index: bool = True,
     indent: int = 0,
     storage_options: StorageOptions = None,
+    mode: str = "w",
 ) -> str | None:
 
     if not index and orient not in ["split", "table"]:
@@ -146,6 +147,20 @@ def to_json(
     if lines and orient != "records":
         raise ValueError("'lines' keyword only valid when 'orient' is records")
 
+    if mode not in ["a", "w"]:
+        msg = (
+            f"mode={repr(mode)} is not a valid option. "
+            "Only 'w' and 'a' are currently supported."
+        )
+        raise ValueError(msg)
+
+    if mode == "a" and (not lines or orient != "records"):
+        msg = (
+            "mode='a' (append) is only supported when "
+            "lines is True and orient is 'records'"
+        )
+        raise ValueError(msg)
+
     if orient == "table" and isinstance(obj, Series):
         obj = obj.to_frame(name=obj.name or "values")
 
@@ -177,7 +192,7 @@ def to_json(
     if path_or_buf is not None:
         # apply compression and byte/text conversion
         with get_handle(
-            path_or_buf, "w", compression=compression, storage_options=storage_options
+            path_or_buf, mode, compression=compression, storage_options=storage_options
         ) as handles:
             handles.handle.write(s)
     else:
diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py
@@ -297,3 +297,101 @@ def __iter__(self) -> Iterator:
     reader = MyReader(jsonl)
     assert len(list(read_json(reader, lines=True, chunksize=100))) > 1
     assert reader.read_count > 10
+
+
+@pytest.mark.parametrize("orient_", ["split", "index", "table"])
+def test_to_json_append_orient(orient_):
+    # GH 35849
+    # Test ValueError when orient is not 'records'
+    df = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
+    msg = (
+        r"mode='a' \(append\) is only supported when "
+        "lines is True and orient is 'records'"
+    )
+    with pytest.raises(ValueError, match=msg):
+        df.to_json(mode="a", orient=orient_)
+
+
+def test_to_json_append_lines():
+    # GH 35849
+    # Test ValueError when lines is not True
+    df = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
+    msg = (
+        r"mode='a' \(append\) is only supported when "
+        "lines is True and orient is 'records'"
+    )
+    with pytest.raises(ValueError, match=msg):
+        df.to_json(mode="a", lines=False, orient="records")
+
+
+@pytest.mark.parametrize("mode_", ["r", "x"])
+def test_to_json_append_mode(mode_):
+    # GH 35849
+    # Test ValueError when mode is not supported option
+    df = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
+    msg = (
+        f"mode={repr(mode_)} is not a valid option. "
+        "Only 'w' and 'a' are currently supported."
+    )
+    with pytest.raises(ValueError, match=msg):
+        df.to_json(mode=mode_, lines=False, orient="records")
+
+
+def test_to_json_append_output():
+    # GH 35849: frames appended with mode="a" must read back as one combined frame.
+    # NOTE(review): NaN/float64 expectations assume read_json's default inference.
+    df1 = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
+    df2 = DataFrame({"col1": [3, 4], "col2": ["c", "d"]})
+    df3 = DataFrame({"col2": ["e", "f"], "col3": ["!", "#"]})
+    df4 = DataFrame({"col4": [True, False]})
+    nan = float("nan")
+    # Test 1, df1 and df2
+    expected = DataFrame({"col1": [1, 2, 3, 4], "col2": ["a", "b", "c", "d"]})
+    with tm.ensure_clean("test.json") as path:
+        # Save dataframes to the same file
+        df1.to_json(path, lines=True, orient="records")
+        df2.to_json(path, mode="a", lines=True, orient="records")
+
+        # Read path file
+        result = read_json(path, lines=True)
+        tm.assert_frame_equal(result, expected)
+
+    # Test 2: df1, df2, df3, df4 (in that order)
+    expected = DataFrame(
+        {
+            "col1": [1, 2, 3, 4, None, None, None, None],
+            "col2": ["a", "b", "c", "d", "e", "f", nan, nan],
+            "col3": [nan, nan, nan, nan, "!", "#", nan, nan],
+            "col4": [None, None, None, None, None, None, True, False],
+        }
+    ).astype({"col4": "float"})
+    with tm.ensure_clean("test.json") as path:
+        # Save dataframes to the same file
+        df1.to_json(path, mode="a", lines=True, orient="records")
+        df2.to_json(path, mode="a", lines=True, orient="records")
+        df3.to_json(path, mode="a", lines=True, orient="records")
+        df4.to_json(path, mode="a", lines=True, orient="records")
+
+        # Read path file
+        result = read_json(path, lines=True)
+        tm.assert_frame_equal(result, expected)
+
+    # Test 3: df4, df3, df2, df1 (in that order)
+    expected = DataFrame(
+        {
+            "col4": [True, False, None, None, None, None, None, None],
+            "col2": [nan, nan, "e", "f", "c", "d", "a", "b"],
+            "col3": [nan, nan, "!", "#", nan, nan, nan, nan],
+            "col1": [None, None, None, None, 3, 4, 1, 2],
+        }
+    ).astype({"col4": "float"})
+    with tm.ensure_clean("test.json") as path:
+        # Save dataframes to the same file
+        df4.to_json(path, mode="a", lines=True, orient="records")
+        df3.to_json(path, mode="a", lines=True, orient="records")
+        df2.to_json(path, mode="a", lines=True, orient="records")
+        df1.to_json(path, mode="a", lines=True, orient="records")
+
+        # Read path file
+        result = read_json(path, lines=True)
+        tm.assert_frame_equal(result, expected)