Skip to content

Commit c3f51a2

Browse files
authored
chore: throws NotImplementedError when read_pandas with nested JSON type (#1516)
* chore: throws NotImplementedError when read_pandas with nested JSON type * increase test coverage * fix python 3.6
1 parent 2818ab9 commit c3f51a2

File tree

2 files changed

+98
-1
lines changed

2 files changed

+98
-1
lines changed

bigframes/core/utils.py

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,12 @@
1818
from typing import Hashable, Iterable, List
1919
import warnings
2020

21+
import bigframes_vendored.constants as constants
2122
import bigframes_vendored.pandas.io.common as vendored_pandas_io_common
2223
import numpy as np
2324
import pandas as pd
2425
import pandas.api.types as pdtypes
26+
import pyarrow as pa
2527
import typing_extensions
2628

2729
import bigframes.dtypes as dtypes
@@ -243,6 +245,22 @@ def replace_timedeltas_with_micros(dataframe: pd.DataFrame) -> List[str]:
243245
return updated_columns
244246

245247

248+
def _search_for_nested_json_type(arrow_type: pa.DataType) -> bool:
    """
    Recursively checks whether a PyArrow DataType contains the JSON
    extension type, either directly or nested inside a list or struct.
    """
    if arrow_type == dtypes.JSON_ARROW_TYPE:
        return True
    if pa.types.is_list(arrow_type):
        # Lists carry a single element type; recurse into it.
        return _search_for_nested_json_type(arrow_type.value_type)
    if pa.types.is_struct(arrow_type):
        # A struct contains JSON if any of its fields does.
        return any(
            _search_for_nested_json_type(arrow_type.field(field_index).type)
            for field_index in range(arrow_type.num_fields)
        )
    return False
262+
263+
246264
def replace_json_with_string(dataframe: pd.DataFrame) -> List[str]:
    """
    Due to a BigQuery IO limitation with loading JSON from Parquet files
    (b/374784249), JSON-typed data is uploaded as strings instead.

    Casts JSON-typed columns (and a JSON-typed index) to string dtype in
    place. Dtypes that nest JSON inside a list or struct cannot use this
    workaround and are rejected.

    Args:
        dataframe: The DataFrame to rewrite; mutated in place.

    Returns:
        The labels of the columns (and the index name, if applicable) that
        were converted to string.

    Raises:
        NotImplementedError: If a column or the index nests a JSON type
            within a list or struct.
    """
    updated_columns = []

    for col in dataframe.columns:
        column_type = dataframe[col].dtype
        if column_type == dtypes.JSON_DTYPE:
            dataframe[col] = dataframe[col].astype(dtypes.STRING_DTYPE)
            updated_columns.append(col)
        elif isinstance(column_type, pd.ArrowDtype) and _search_for_nested_json_type(
            column_type.pyarrow_dtype
        ):
            # Nested JSON (e.g. list<struct<json>>) cannot round-trip through
            # the string workaround, so fail fast with an actionable message.
            # Fixed: removed a stray apostrophe after the closing backtick.
            raise NotImplementedError(
                f"Nested JSON types, found in column `{col}`: `{column_type}`, "
                f"are currently unsupported for upload. {constants.FEEDBACK_LINK}"
            )

    if dataframe.index.dtype == dtypes.JSON_DTYPE:
        dataframe.index = dataframe.index.astype(dtypes.STRING_DTYPE)
        updated_columns.append(dataframe.index.name)
    elif isinstance(
        dataframe.index.dtype, pd.ArrowDtype
    ) and _search_for_nested_json_type(dataframe.index.dtype.pyarrow_dtype):
        raise NotImplementedError(
            f"Nested JSON types, found in the index: `{dataframe.index.dtype}`, "
            f"are currently unsupported for upload. {constants.FEEDBACK_LINK}"
        )

    return updated_columns

tests/system/small/test_session.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@
2626
import google.cloud.bigquery as bigquery
2727
import numpy as np
2828
import pandas as pd
29+
import pandas.arrays as arrays
30+
import pyarrow as pa
2931
import pytest
3032

3133
import bigframes
@@ -829,6 +831,68 @@ def test_read_pandas_json_index(session, write_engine):
829831
pd.testing.assert_index_equal(actual_result, expected_index)
830832

831833

834+
@pytest.mark.parametrize(
    ("write_engine"),
    [
        pytest.param("default"),
        pytest.param("bigquery_load"),
    ],
)
def test_read_pandas_w_nested_json(session, write_engine):
    """A Series whose dtype nests JSON inside list<struct> must be rejected."""
    rows = [
        [{"json_field": "1"}],
        [{"json_field": None}],
        [{"json_field": '["1","3","5"]'}],
        [{"json_field": '{"a":1,"b":["x","y"],"c":{"x":[],"z":false}}'}],
    ]
    # PyArrow currently lacks support for creating structs or lists containing extension types.
    # See issue: https://github.com/apache/arrow/issues/45262
    plain_type = pa.list_(pa.struct([("name", pa.string())]))
    nested_json_type = pa.list_(
        pa.struct([("name", bigframes.dtypes.JSON_ARROW_TYPE)])
    )
    series = pd.Series(
        arrays.ArrowExtensionArray(pa.array(rows, type=plain_type)),  # type: ignore
        dtype=pd.ArrowDtype(nested_json_type),
    )
    with pytest.raises(NotImplementedError, match="Nested JSON types, found in column"):
        # Until b/401630655 is resolved, json not compatible with allow_large_results=False
        session.read_pandas(series, write_engine=write_engine).to_pandas(
            allow_large_results=True
        )
862+
863+
864+
@pytest.mark.parametrize(
    ("write_engine"),
    [
        pytest.param("default"),
        pytest.param("bigquery_load"),
    ],
)
def test_read_pandas_w_nested_json_index(session, write_engine):
    """An Index whose dtype nests JSON inside list<struct> must be rejected."""
    rows = [
        [{"json_field": "1"}],
        [{"json_field": None}],
        [{"json_field": '["1","3","5"]'}],
        [{"json_field": '{"a":1,"b":["x","y"],"c":{"x":[],"z":false}}'}],
    ]
    # PyArrow currently lacks support for creating structs or lists containing extension types.
    # See issue: https://github.com/apache/arrow/issues/45262
    plain_type = pa.list_(pa.struct([("name", pa.string())]))
    nested_json_type = pa.list_(
        pa.struct([("name", bigframes.dtypes.JSON_ARROW_TYPE)])
    )
    index: pd.Index = pd.Index(
        arrays.ArrowExtensionArray(pa.array(rows, type=plain_type)),  # type: ignore
        dtype=pd.ArrowDtype(nested_json_type),
    )
    with pytest.raises(
        NotImplementedError, match="Nested JSON types, found in the index"
    ):
        # Until b/401630655 is resolved, json not compatible with allow_large_results=False
        session.read_pandas(index, write_engine=write_engine).to_pandas(
            allow_large_results=True
        )
894+
895+
832896
@utils.skip_legacy_pandas
833897
@pytest.mark.parametrize(
834898
("write_engine",),

0 commit comments

Comments
 (0)