Commit c40452c

feat: reading JSON data as the pyarrow JSON type when available

1 parent: 578081e

File tree: 8 files changed, +50 -21 lines


bigframes/bigquery/_operations/json.py
Lines changed: 2 additions & 2 deletions

@@ -53,7 +53,7 @@ def json_set(
     >>> s = bpd.read_gbq("SELECT JSON '{\\\"a\\\": 1}' AS data")["data"]
     >>> bbq.json_set(s, json_path_value_pairs=[("$.a", 100), ("$.b", "hi")])
     0    {"a":100,"b":"hi"}
-    Name: data, dtype: extension<dbjson<JSONArrowType>>[pyarrow]
+    Name: data, dtype: extension<arrow.json>[pyarrow]

     Args:
         input (bigframes.series.Series):
@@ -253,7 +253,7 @@ def parse_json(
     dtype: string
     >>> bbq.parse_json(s)
     0    {"class":{"students":[{"id":5},{"id":12}]}}
-    dtype: extension<dbjson<JSONArrowType>>[pyarrow]
+    dtype: extension<arrow.json>[pyarrow]

     Args:
         input (bigframes.series.Series):
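The new docstring output comes straight from which Arrow extension type backs the column. A minimal sketch of the dtype selection, assuming a PyArrow build that ships the canonical JSON extension type (pa.JsonType / pa.json_) and, for the fallback branch, an installed db_dtypes:

    import pandas as pd
    import pyarrow as pa

    if hasattr(pa, "JsonType"):
        # Canonical Arrow extension type, registered as "arrow.json";
        # the pandas dtype renders as extension<arrow.json>[pyarrow].
        json_dtype = pd.ArrowDtype(pa.json_(pa.string()))
    else:
        # Older PyArrow: fall back to db_dtypes, which renders as
        # extension<dbjson<JSONArrowType>>[pyarrow].
        import db_dtypes  # type: ignore

        json_dtype = pd.ArrowDtype(db_dtypes.JSONArrowType())

    print(json_dtype)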

bigframes/core/array_value.py
Lines changed: 2 additions & 2 deletions

@@ -108,8 +108,8 @@ def from_table(
             raise ValueError("must set at most one of 'offests', 'primary_key'")
         if any(i.field_type == "JSON" for i in table.schema if i.name in schema.names):
             msg = bfe.format_message(
-                "JSON column interpretation as a custom PyArrow extention in `db_dtypes` "
-                "is a preview feature and subject to change."
+                "JSON column interpretation as a PyArrow JSON extention type is a preview "
+                "feature and subject to change."
             )
             warnings.warn(msg, bfe.PreviewWarning)
         # define data source only for needed columns, this makes row-hashing cheaper
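For context, the preview warning follows the usual stdlib pattern. A minimal sketch with a stand-in for the bigframes.exceptions helpers (bfe.format_message and bfe.PreviewWarning are not reproduced here):

    import warnings

    class PreviewWarning(Warning):
        """Stand-in for bigframes.exceptions.PreviewWarning."""

    warnings.warn(
        "JSON column interpretation as a PyArrow JSON extension type is a "
        "preview feature and subject to change.",
        PreviewWarning,
    )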

bigframes/core/compile/ibis_types.py
Lines changed: 1 addition & 2 deletions

@@ -24,7 +24,6 @@
     dtype as python_type_to_ibis_type,
 )
 import bigframes_vendored.ibis.expr.types as ibis_types
-import db_dtypes  # type: ignore
 import geopandas as gpd  # type: ignore
 import google.cloud.bigquery as bigquery
 import pandas as pd
@@ -75,7 +74,7 @@
         IBIS_GEO_TYPE,
         gpd.array.GeometryDtype(),
     ),
-    (ibis_dtypes.json, pd.ArrowDtype(db_dtypes.JSONArrowType())),
+    (ibis_dtypes.json, bigframes.dtypes.JSON_DTYPE),
 )

 BIGFRAMES_TO_IBIS: Dict[bigframes.dtypes.Dtype, ibis_dtypes.DataType] = {

bigframes/core/utils.py
Lines changed: 15 additions & 6 deletions

@@ -224,6 +224,15 @@ def timedelta_to_micros(
     raise TypeError(f"Unrecognized input type: {type(timedelta)}")


+def _is_timedelat64_dtype(dtype: dtypes.Dtype) -> bool:
+    try:
+        return pdtypes.is_timedelta64_dtype(dtype)
+    except NotImplementedError:
+        # Workaround the known issue in pandas:
+        # https://github.com/pandas-dev/pandas/issues/60958
+        return False
+
+
 def replace_timedeltas_with_micros(dataframe: pd.DataFrame) -> List[str]:
     """
     Replaces in-place timedeltas to integer values in microseconds. Nanosecond part is ignored.
@@ -234,11 +243,11 @@ def replace_timedeltas_with_micros(dataframe: pd.DataFrame) -> List[str]:
     updated_columns = []

     for col in dataframe.columns:
-        if pdtypes.is_timedelta64_dtype(dataframe[col].dtype):
+        if _is_timedelat64_dtype(dataframe[col].dtype):
             dataframe[col] = dataframe[col].apply(timedelta_to_micros)
             updated_columns.append(col)

-    if pdtypes.is_timedelta64_dtype(dataframe.index.dtype):
+    if _is_timedelat64_dtype(dataframe.index.dtype):
         dataframe.index = dataframe.index.map(timedelta_to_micros)
         updated_columns.append(dataframe.index.name)

@@ -249,15 +258,15 @@ def _search_for_nested_json_type(arrow_type: pa.DataType) -> bool:
     """
     Searches recursively for JSON array type within a PyArrow DataType.
     """
-    if arrow_type == dtypes.JSON_ARROW_TYPE:
-        return True
     if pa.types.is_list(arrow_type):
         return _search_for_nested_json_type(arrow_type.value_type)
     if pa.types.is_struct(arrow_type):
         for i in range(arrow_type.num_fields):
             if _search_for_nested_json_type(arrow_type.field(i).type):
                 return True
         return False
+    if dtypes.is_json_arrow_type(arrow_type):
+        return True
     return False


@@ -272,7 +281,7 @@ def replace_json_with_string(dataframe: pd.DataFrame) -> List[str]:

     for col in dataframe.columns:
         column_type = dataframe[col].dtype
-        if column_type == dtypes.JSON_DTYPE:
+        if dtypes.is_json_type(column_type):
             dataframe[col] = dataframe[col].astype(dtypes.STRING_DTYPE)
             updated_columns.append(col)
         elif isinstance(column_type, pd.ArrowDtype) and _search_for_nested_json_type(
@@ -283,7 +292,7 @@ def replace_json_with_string(dataframe: pd.DataFrame) -> List[str]:
             f"are currently unsupported for upload. {constants.FEEDBACK_LINK}"
         )

-        if dataframe.index.dtype == dtypes.JSON_DTYPE:
+        if dtypes.is_json_type(dataframe.index.dtype):
             dataframe.index = dataframe.index.astype(dtypes.STRING_DTYPE)
             updated_columns.append(dataframe.index.name)
         elif isinstance(
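The recursion above only descends through list and struct containers before testing the leaf type. A self-contained sketch of the same walk, assuming a PyArrow build where pa.json_ and pa.JsonType are available; contains_json is a hypothetical stand-in for the private helper:

    import pyarrow as pa

    def contains_json(arrow_type: pa.DataType) -> bool:
        # Descend through containers first, then test the leaf type.
        if pa.types.is_list(arrow_type):
            return contains_json(arrow_type.value_type)
        if pa.types.is_struct(arrow_type):
            return any(
                contains_json(arrow_type.field(i).type)
                for i in range(arrow_type.num_fields)
            )
        return isinstance(arrow_type, pa.JsonType)

    nested = pa.struct([("payload", pa.list_(pa.json_(pa.string())))])
    print(contains_json(nested))  # True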

bigframes/dtypes.py
Lines changed: 15 additions & 4 deletions

@@ -62,8 +62,9 @@
 # No arrow equivalent
 GEO_DTYPE = gpd.array.GeometryDtype()
 # JSON
-# TODO: switch to pyarrow.json_(pyarrow.string()) when available.
-JSON_ARROW_TYPE = db_dtypes.JSONArrowType()
+JSON_ARROW_TYPE = (
+    pa.json_(pa.string()) if hasattr(pa, "JsonType") else db_dtypes.JSONArrowType()
+)
 JSON_DTYPE = pd.ArrowDtype(JSON_ARROW_TYPE)
 OBJ_REF_DTYPE = pd.ArrowDtype(
     pa.struct(
@@ -169,7 +170,7 @@ class SimpleDtypeInfo:
     ),
     SimpleDtypeInfo(
         dtype=JSON_DTYPE,
-        arrow_dtype=db_dtypes.JSONArrowType(),
+        arrow_dtype=JSON_ARROW_TYPE,
         type_kind=("JSON",),
         orderable=False,
         clusterable=False,
@@ -330,8 +331,18 @@ def is_struct_like(type_: ExpressionType) -> bool:
     )


+def is_json_arrow_type(type_: pa.DataType) -> bool:
+    return (hasattr(pa, "JsonType") and isinstance(type_, pa.JsonType)) or (
+        not hasattr(pa, "JsonType") and isinstance(type_, db_dtypes.JSONArrowType)
+    )
+
+
+def is_json_type(type_: ExpressionType) -> bool:
+    return isinstance(type_, pd.ArrowDtype) and is_json_arrow_type(type_.pyarrow_dtype)
+
+
 def is_json_like(type_: ExpressionType) -> bool:
-    return type_ == JSON_DTYPE or type_ == STRING_DTYPE  # Including JSON string
+    return is_json_type(type_) or type_ == STRING_DTYPE  # Including JSON string


 def is_json_encoding_type(type_: ExpressionType) -> bool:
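A quick usage sketch for the new helpers, assuming bigframes imports cleanly in the environment; these assertions should hold on either branch of the hasattr(pa, "JsonType") check:

    import bigframes.dtypes as dtypes

    # Matches whichever JSON extension type this environment resolved to.
    assert dtypes.is_json_arrow_type(dtypes.JSON_ARROW_TYPE)
    assert dtypes.is_json_type(dtypes.JSON_DTYPE)

    # Plain strings stay "json-like" (JSON-encoded text) but not JSON-typed.
    assert dtypes.is_json_like(dtypes.STRING_DTYPE)
    assert not dtypes.is_json_type(dtypes.STRING_DTYPE)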

bigframes/session/__init__.py
Lines changed: 6 additions & 1 deletion

@@ -814,7 +814,12 @@ def _read_pandas_inline(
     ) -> dataframe.DataFrame:
         import bigframes.dataframe as dataframe

-        memory_usage = pandas_dataframe.memory_usage(deep=True).sum()
+        try:
+            memory_usage = pandas_dataframe.memory_usage(deep=True).sum()
+        except NotImplementedError:  # TODO: add unit test
+            # Workaround the known issue in pandas:
+            # https://github.com/pandas-dev/pandas/issues/60958
+            raise ValueError("Could not determine the DataFrame's memory usage.")
         if memory_usage > MAX_INLINE_DF_BYTES:
             raise ValueError(
                 f"DataFrame size ({memory_usage} bytes) exceeds the maximum allowed "

tests/system/small/test_series.py
Lines changed: 1 addition & 2 deletions

@@ -17,7 +17,6 @@
 import re
 import tempfile

-import db_dtypes  # type: ignore
 import geopandas as gpd  # type: ignore
 import numpy
 from packaging.version import Version
@@ -384,9 +383,9 @@ def test_get_column(scalars_dfs, col_name, expected_dtype):

 def test_get_column_w_json(json_df, json_pandas_df):
     series = json_df["json_col"]
+    assert dtypes.is_json_type(series.dtype)
     # Until b/401630655 is resolved, json not compatible with allow_large_results=False
     series_pandas = series.to_pandas(allow_large_results=True)
-    assert series.dtype == pd.ArrowDtype(db_dtypes.JSONArrowType())
     assert series_pandas.shape[0] == json_pandas_df.shape[0]
tests/system/small/test_session.py
Lines changed: 8 additions & 2 deletions

@@ -933,7 +933,11 @@ def test_read_pandas_json_dataframes(session, write_engine):

     if write_engine == "bigquery_streaming":
         expected_df.index = pd.Index([pd.NA] * 4, dtype="Int64")
-    pd.testing.assert_frame_equal(actual_result, expected_df, check_index_type=False)
+    # `check_exact=False` can workaround the known issue in pandas:
+    # https://github.com/pandas-dev/pandas/issues/60958
+    pd.testing.assert_frame_equal(
+        actual_result, expected_df, check_index_type=False, check_exact=False
+    )


 @pytest.mark.parametrize(
@@ -953,8 +957,10 @@ def test_read_pandas_json_series(session, write_engine):
     actual_result = session.read_pandas(
         expected_series, write_engine=write_engine
     ).to_pandas(allow_large_results=True)
+    # `check_exact=False` can workaround the known issue in pandas:
+    # https://github.com/pandas-dev/pandas/issues/60958
     pd.testing.assert_series_equal(
-        actual_result, expected_series, check_index_type=False
+        actual_result, expected_series, check_index_type=False, check_exact=False
     )