Skip to content

Commit c3f51a2

Browse files
authored
chore: throws NotImplementedError when read_pandas with nested JSON type (#1516)
* chore: throws NotImplementedError when read_pandas with nested JSON type * increase test coverage * fix python 3.6
1 parent 2818ab9 commit c3f51a2

File tree

2 files changed

+98
-1
lines changed

2 files changed

+98
-1
lines changed

bigframes/core/utils.py

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,12 @@
1818
from typing import Hashable, Iterable, List
1919
import warnings
2020

21+
import bigframes_vendored.constants as constants
2122
import bigframes_vendored.pandas.io.common as vendored_pandas_io_common
2223
import numpy as np
2324
import pandas as pd
2425
import pandas.api.types as pdtypes
26+
import pyarrow as pa
2527
import typing_extensions
2628

2729
import bigframes.dtypes as dtypes
@@ -243,6 +245,22 @@ def replace_timedeltas_with_micros(dataframe: pd.DataFrame) -> List[str]:
243245
return updated_columns
244246

245247

248+
def _search_for_nested_json_type(arrow_type: pa.DataType) -> bool:
    """
    Recursively checks whether a PyArrow DataType contains the JSON
    extension type, either directly or nested inside a list or struct.
    """
    if arrow_type == dtypes.JSON_ARROW_TYPE:
        return True
    if pa.types.is_list(arrow_type):
        # Lists carry a single element type; recurse into it.
        return _search_for_nested_json_type(arrow_type.value_type)
    if pa.types.is_struct(arrow_type):
        # A struct contains JSON if any of its fields does.
        return any(
            _search_for_nested_json_type(arrow_type.field(field_index).type)
            for field_index in range(arrow_type.num_fields)
        )
    return False
262+
263+
246264
def replace_json_with_string(dataframe: pd.DataFrame) -> List[str]:
    """
    Due to a BigQuery IO limitation with loading JSON from Parquet files
    (b/374784249), JSON-typed data is uploaded as strings instead.

    Casts JSON-typed columns (and a JSON-typed index) to string dtype in
    place. Dtypes that nest JSON inside a list or struct cannot use this
    workaround and are rejected.

    Args:
        dataframe: The DataFrame to rewrite; mutated in place.

    Returns:
        The labels of the columns (and the index name, if applicable) that
        were converted to string.

    Raises:
        NotImplementedError: If a column or the index nests a JSON type
            within a list or struct.
    """
    updated_columns = []

    for col in dataframe.columns:
        column_type = dataframe[col].dtype
        if column_type == dtypes.JSON_DTYPE:
            dataframe[col] = dataframe[col].astype(dtypes.STRING_DTYPE)
            updated_columns.append(col)
        elif isinstance(column_type, pd.ArrowDtype) and _search_for_nested_json_type(
            column_type.pyarrow_dtype
        ):
            # Nested JSON (e.g. list<struct<json>>) cannot round-trip through
            # the string workaround, so fail fast with an actionable message.
            # Fixed: removed a stray apostrophe after the closing backtick.
            raise NotImplementedError(
                f"Nested JSON types, found in column `{col}`: `{column_type}`, "
                f"are currently unsupported for upload. {constants.FEEDBACK_LINK}"
            )

    if dataframe.index.dtype == dtypes.JSON_DTYPE:
        dataframe.index = dataframe.index.astype(dtypes.STRING_DTYPE)
        updated_columns.append(dataframe.index.name)
    elif isinstance(
        dataframe.index.dtype, pd.ArrowDtype
    ) and _search_for_nested_json_type(dataframe.index.dtype.pyarrow_dtype):
        raise NotImplementedError(
            f"Nested JSON types, found in the index: `{dataframe.index.dtype}`, "
            f"are currently unsupported for upload. {constants.FEEDBACK_LINK}"
        )

    return updated_columns

tests/system/small/test_session.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@
2626
import google.cloud.bigquery as bigquery
2727
import numpy as np
2828
import pandas as pd
29+
import pandas.arrays as arrays
30+
import pyarrow as pa
2931
import pytest
3032

3133
import bigframes
@@ -829,6 +831,68 @@ def test_read_pandas_json_index(session, write_engine):
829831
pd.testing.assert_index_equal(actual_result, expected_index)
830832

831833

834+
@pytest.mark.parametrize(
    ("write_engine"),
    [
        pytest.param("default"),
        pytest.param("bigquery_load"),
    ],
)
def test_read_pandas_w_nested_json(session, write_engine):
    """A Series whose dtype nests JSON inside list<struct> must be rejected."""
    rows = [
        [{"json_field": "1"}],
        [{"json_field": None}],
        [{"json_field": '["1","3","5"]'}],
        [{"json_field": '{"a":1,"b":["x","y"],"c":{"x":[],"z":false}}'}],
    ]
    # PyArrow currently lacks support for creating structs or lists containing extension types.
    # See issue: https://github.com/apache/arrow/issues/45262
    plain_type = pa.list_(pa.struct([("name", pa.string())]))
    nested_json_type = pa.list_(
        pa.struct([("name", bigframes.dtypes.JSON_ARROW_TYPE)])
    )
    series = pd.Series(
        arrays.ArrowExtensionArray(pa.array(rows, type=plain_type)),  # type: ignore
        dtype=pd.ArrowDtype(nested_json_type),
    )
    with pytest.raises(NotImplementedError, match="Nested JSON types, found in column"):
        # Until b/401630655 is resolved, json not compatible with allow_large_results=False
        session.read_pandas(series, write_engine=write_engine).to_pandas(
            allow_large_results=True
        )
862+
863+
864+
@pytest.mark.parametrize(
    ("write_engine"),
    [
        pytest.param("default"),
        pytest.param("bigquery_load"),
    ],
)
def test_read_pandas_w_nested_json_index(session, write_engine):
    """An Index whose dtype nests JSON inside list<struct> must be rejected."""
    rows = [
        [{"json_field": "1"}],
        [{"json_field": None}],
        [{"json_field": '["1","3","5"]'}],
        [{"json_field": '{"a":1,"b":["x","y"],"c":{"x":[],"z":false}}'}],
    ]
    # PyArrow currently lacks support for creating structs or lists containing extension types.
    # See issue: https://github.com/apache/arrow/issues/45262
    plain_type = pa.list_(pa.struct([("name", pa.string())]))
    nested_json_type = pa.list_(
        pa.struct([("name", bigframes.dtypes.JSON_ARROW_TYPE)])
    )
    index: pd.Index = pd.Index(
        arrays.ArrowExtensionArray(pa.array(rows, type=plain_type)),  # type: ignore
        dtype=pd.ArrowDtype(nested_json_type),
    )
    with pytest.raises(
        NotImplementedError, match="Nested JSON types, found in the index"
    ):
        # Until b/401630655 is resolved, json not compatible with allow_large_results=False
        session.read_pandas(index, write_engine=write_engine).to_pandas(
            allow_large_results=True
        )
894+
895+
832896
@utils.skip_legacy_pandas
833897
@pytest.mark.parametrize(
834898
("write_engine",),

0 commit comments

Comments
 (0)