Commit c40452c

feat: reading JSON data as the pyarrow JSON type when available

1 parent: 578081e

File tree: 8 files changed, +50 -21 lines


bigframes/bigquery/_operations/json.py
Lines changed: 2 additions & 2 deletions

@@ -53,7 +53,7 @@ def json_set(
     >>> s = bpd.read_gbq("SELECT JSON '{\\\"a\\\": 1}' AS data")["data"]
     >>> bbq.json_set(s, json_path_value_pairs=[("$.a", 100), ("$.b", "hi")])
     0    {"a":100,"b":"hi"}
-    Name: data, dtype: extension<dbjson<JSONArrowType>>[pyarrow]
+    Name: data, dtype: extension<arrow.json>[pyarrow]

     Args:
         input (bigframes.series.Series):
@@ -253,7 +253,7 @@ def parse_json(
     dtype: string
     >>> bbq.parse_json(s)
     0    {"class":{"students":[{"id":5},{"id":12}]}}
-    dtype: extension<dbjson<JSONArrowType>>[pyarrow]
+    dtype: extension<arrow.json>[pyarrow]

     Args:
         input (bigframes.series.Series):
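The new docstring output comes straight from which Arrow extension type backs the column. A minimal sketch of the dtype selection, assuming a PyArrow build that ships the canonical JSON extension type (pa.JsonType / pa.json_) and, for the fallback branch, an installed db_dtypes:

    import pandas as pd
    import pyarrow as pa

    if hasattr(pa, "JsonType"):
        # Canonical Arrow extension type, registered as "arrow.json";
        # the pandas dtype renders as extension<arrow.json>[pyarrow].
        json_dtype = pd.ArrowDtype(pa.json_(pa.string()))
    else:
        # Older PyArrow: fall back to db_dtypes, which renders as
        # extension<dbjson<JSONArrowType>>[pyarrow].
        import db_dtypes  # type: ignore

        json_dtype = pd.ArrowDtype(db_dtypes.JSONArrowType())

    print(json_dtype)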

bigframes/core/array_value.py
Lines changed: 2 additions & 2 deletions

@@ -108,8 +108,8 @@ def from_table(
             raise ValueError("must set at most one of 'offests', 'primary_key'")
         if any(i.field_type == "JSON" for i in table.schema if i.name in schema.names):
             msg = bfe.format_message(
-                "JSON column interpretation as a custom PyArrow extention in `db_dtypes` "
-                "is a preview feature and subject to change."
+                "JSON column interpretation as a PyArrow JSON extention type is a preview "
+                "feature and subject to change."
             )
             warnings.warn(msg, bfe.PreviewWarning)
         # define data source only for needed columns, this makes row-hashing cheaper
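For context, the preview warning follows the usual stdlib pattern. A minimal sketch with a stand-in for the bigframes.exceptions helpers (bfe.format_message and bfe.PreviewWarning are not reproduced here):

    import warnings

    class PreviewWarning(Warning):
        """Stand-in for bigframes.exceptions.PreviewWarning."""

    warnings.warn(
        "JSON column interpretation as a PyArrow JSON extension type is a "
        "preview feature and subject to change.",
        PreviewWarning,
    )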

bigframes/core/compile/ibis_types.py
Lines changed: 1 addition & 2 deletions

@@ -24,7 +24,6 @@
     dtype as python_type_to_ibis_type,
 )
 import bigframes_vendored.ibis.expr.types as ibis_types
-import db_dtypes  # type: ignore
 import geopandas as gpd  # type: ignore
 import google.cloud.bigquery as bigquery
 import pandas as pd
@@ -75,7 +74,7 @@
         IBIS_GEO_TYPE,
         gpd.array.GeometryDtype(),
     ),
-    (ibis_dtypes.json, pd.ArrowDtype(db_dtypes.JSONArrowType())),
+    (ibis_dtypes.json, bigframes.dtypes.JSON_DTYPE),
 )

 BIGFRAMES_TO_IBIS: Dict[bigframes.dtypes.Dtype, ibis_dtypes.DataType] = {

bigframes/core/utils.py
Lines changed: 15 additions & 6 deletions

@@ -224,6 +224,15 @@ def timedelta_to_micros(
     raise TypeError(f"Unrecognized input type: {type(timedelta)}")


+def _is_timedelat64_dtype(dtype: dtypes.Dtype) -> bool:
+    try:
+        return pdtypes.is_timedelta64_dtype(dtype)
+    except NotImplementedError:
+        # Workaround the known issue in pandas:
+        # https://github.com/pandas-dev/pandas/issues/60958
+        return False
+
+
 def replace_timedeltas_with_micros(dataframe: pd.DataFrame) -> List[str]:
     """
     Replaces in-place timedeltas to integer values in microseconds. Nanosecond part is ignored.
@@ -234,11 +243,11 @@ def replace_timedeltas_with_micros(dataframe: pd.DataFrame) -> List[str]:
     updated_columns = []

     for col in dataframe.columns:
-        if pdtypes.is_timedelta64_dtype(dataframe[col].dtype):
+        if _is_timedelat64_dtype(dataframe[col].dtype):
             dataframe[col] = dataframe[col].apply(timedelta_to_micros)
             updated_columns.append(col)

-    if pdtypes.is_timedelta64_dtype(dataframe.index.dtype):
+    if _is_timedelat64_dtype(dataframe.index.dtype):
         dataframe.index = dataframe.index.map(timedelta_to_micros)
         updated_columns.append(dataframe.index.name)

@@ -249,15 +258,15 @@ def _search_for_nested_json_type(arrow_type: pa.DataType) -> bool:
     """
     Searches recursively for JSON array type within a PyArrow DataType.
     """
-    if arrow_type == dtypes.JSON_ARROW_TYPE:
-        return True
     if pa.types.is_list(arrow_type):
         return _search_for_nested_json_type(arrow_type.value_type)
     if pa.types.is_struct(arrow_type):
         for i in range(arrow_type.num_fields):
             if _search_for_nested_json_type(arrow_type.field(i).type):
                 return True
         return False
+    if dtypes.is_json_arrow_type(arrow_type):
+        return True
     return False


@@ -272,7 +281,7 @@ def replace_json_with_string(dataframe: pd.DataFrame) -> List[str]:

     for col in dataframe.columns:
         column_type = dataframe[col].dtype
-        if column_type == dtypes.JSON_DTYPE:
+        if dtypes.is_json_type(column_type):
             dataframe[col] = dataframe[col].astype(dtypes.STRING_DTYPE)
             updated_columns.append(col)
         elif isinstance(column_type, pd.ArrowDtype) and _search_for_nested_json_type(
@@ -283,7 +292,7 @@ def replace_json_with_string(dataframe: pd.DataFrame) -> List[str]:
             f"are currently unsupported for upload. {constants.FEEDBACK_LINK}"
         )

-        if dataframe.index.dtype == dtypes.JSON_DTYPE:
+        if dtypes.is_json_type(dataframe.index.dtype):
             dataframe.index = dataframe.index.astype(dtypes.STRING_DTYPE)
             updated_columns.append(dataframe.index.name)
         elif isinstance(
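The recursion above only descends through list and struct containers before testing the leaf type. A self-contained sketch of the same walk, assuming a PyArrow build where pa.json_ and pa.JsonType are available; contains_json is a hypothetical stand-in for the private helper:

    import pyarrow as pa

    def contains_json(arrow_type: pa.DataType) -> bool:
        # Descend through containers first, then test the leaf type.
        if pa.types.is_list(arrow_type):
            return contains_json(arrow_type.value_type)
        if pa.types.is_struct(arrow_type):
            return any(
                contains_json(arrow_type.field(i).type)
                for i in range(arrow_type.num_fields)
            )
        return isinstance(arrow_type, pa.JsonType)

    nested = pa.struct([("payload", pa.list_(pa.json_(pa.string())))])
    print(contains_json(nested))  # True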

bigframes/dtypes.py
Lines changed: 15 additions & 4 deletions

@@ -62,8 +62,9 @@
 # No arrow equivalent
 GEO_DTYPE = gpd.array.GeometryDtype()
 # JSON
-# TODO: switch to pyarrow.json_(pyarrow.string()) when available.
-JSON_ARROW_TYPE = db_dtypes.JSONArrowType()
+JSON_ARROW_TYPE = (
+    pa.json_(pa.string()) if hasattr(pa, "JsonType") else db_dtypes.JSONArrowType()
+)
 JSON_DTYPE = pd.ArrowDtype(JSON_ARROW_TYPE)
 OBJ_REF_DTYPE = pd.ArrowDtype(
     pa.struct(
@@ -169,7 +170,7 @@ class SimpleDtypeInfo:
     ),
     SimpleDtypeInfo(
         dtype=JSON_DTYPE,
-        arrow_dtype=db_dtypes.JSONArrowType(),
+        arrow_dtype=JSON_ARROW_TYPE,
         type_kind=("JSON",),
         orderable=False,
         clusterable=False,
@@ -330,8 +331,18 @@ def is_struct_like(type_: ExpressionType) -> bool:
     )


+def is_json_arrow_type(type_: pa.DataType) -> bool:
+    return (hasattr(pa, "JsonType") and isinstance(type_, pa.JsonType)) or (
+        not hasattr(pa, "JsonType") and isinstance(type_, db_dtypes.JSONArrowType)
+    )
+
+
+def is_json_type(type_: ExpressionType) -> bool:
+    return isinstance(type_, pd.ArrowDtype) and is_json_arrow_type(type_.pyarrow_dtype)
+
+
 def is_json_like(type_: ExpressionType) -> bool:
-    return type_ == JSON_DTYPE or type_ == STRING_DTYPE  # Including JSON string
+    return is_json_type(type_) or type_ == STRING_DTYPE  # Including JSON string


 def is_json_encoding_type(type_: ExpressionType) -> bool:
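A quick usage sketch for the new helpers, assuming bigframes imports cleanly in the environment; these assertions should hold on either branch of the hasattr(pa, "JsonType") check:

    import bigframes.dtypes as dtypes

    # Matches whichever JSON extension type this environment resolved to.
    assert dtypes.is_json_arrow_type(dtypes.JSON_ARROW_TYPE)
    assert dtypes.is_json_type(dtypes.JSON_DTYPE)

    # Plain strings stay "json-like" (JSON-encoded text) but not JSON-typed.
    assert dtypes.is_json_like(dtypes.STRING_DTYPE)
    assert not dtypes.is_json_type(dtypes.STRING_DTYPE)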

bigframes/session/__init__.py
Lines changed: 6 additions & 1 deletion

@@ -814,7 +814,12 @@ def _read_pandas_inline(
     ) -> dataframe.DataFrame:
         import bigframes.dataframe as dataframe

-        memory_usage = pandas_dataframe.memory_usage(deep=True).sum()
+        try:
+            memory_usage = pandas_dataframe.memory_usage(deep=True).sum()
+        except NotImplementedError:  # TODO: add unit test
+            # Workaround the known issue in pandas:
+            # https://github.com/pandas-dev/pandas/issues/60958
+            raise ValueError("Could not determine the DataFrame's memory usage.")
         if memory_usage > MAX_INLINE_DF_BYTES:
             raise ValueError(
                 f"DataFrame size ({memory_usage} bytes) exceeds the maximum allowed "

tests/system/small/test_series.py
Lines changed: 1 addition & 2 deletions

@@ -17,7 +17,6 @@
 import re
 import tempfile

-import db_dtypes  # type: ignore
 import geopandas as gpd  # type: ignore
 import numpy
 from packaging.version import Version
@@ -384,9 +383,9 @@ def test_get_column(scalars_dfs, col_name, expected_dtype):

 def test_get_column_w_json(json_df, json_pandas_df):
     series = json_df["json_col"]
+    assert dtypes.is_json_type(series.dtype)
     # Until b/401630655 is resolved, json not compatible with allow_large_results=False
     series_pandas = series.to_pandas(allow_large_results=True)
-    assert series.dtype == pd.ArrowDtype(db_dtypes.JSONArrowType())
     assert series_pandas.shape[0] == json_pandas_df.shape[0]
tests/system/small/test_session.py
Lines changed: 8 additions & 2 deletions

@@ -933,7 +933,11 @@ def test_read_pandas_json_dataframes(session, write_engine):

     if write_engine == "bigquery_streaming":
         expected_df.index = pd.Index([pd.NA] * 4, dtype="Int64")
-    pd.testing.assert_frame_equal(actual_result, expected_df, check_index_type=False)
+    # `check_exact=False` can workaround the known issue in pandas:
+    # https://github.com/pandas-dev/pandas/issues/60958
+    pd.testing.assert_frame_equal(
+        actual_result, expected_df, check_index_type=False, check_exact=False
+    )


 @pytest.mark.parametrize(
@@ -953,8 +957,10 @@ def test_read_pandas_json_series(session, write_engine):
     actual_result = session.read_pandas(
         expected_series, write_engine=write_engine
     ).to_pandas(allow_large_results=True)
+    # `check_exact=False` can workaround the known issue in pandas:
+    # https://github.com/pandas-dev/pandas/issues/60958
     pd.testing.assert_series_equal(
-        actual_result, expected_series, check_index_type=False
+        actual_result, expected_series, check_index_type=False, check_exact=False
     )