Skip to content

Commit dc9eb27

Browse files
feat: add bbq.json_query_array and warn bbq.json_extract_array deprecated (#1811)
* feat: add bbq.json_query_array and warn bbq.json_extract_array deprecated * complete features --------- Co-authored-by: google-labs-jules[bot] <161369871+google-labs-jules[bot]@users.noreply.github.com>
1 parent 0562a37 commit dc9eb27

File tree

6 files changed

+155
-1
lines changed

6 files changed

+155
-1
lines changed

bigframes/bigquery/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
json_extract_array,
4141
json_extract_string_array,
4242
json_query,
43+
json_query_array,
4344
json_set,
4445
json_value,
4546
parse_json,
@@ -67,6 +68,7 @@
6768
"json_extract_array",
6869
"json_extract_string_array",
6970
"json_query",
71+
"json_query_array",
7072
"json_set",
7173
"json_value",
7274
"parse_json",

bigframes/bigquery/_operations/json.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,10 @@ def json_extract_array(
133133
`STRING` or `JSON` values. This function uses single quotes and brackets to
134134
escape invalid JSONPath characters in JSON keys.
135135
136+
.. deprecated:: 2.5.0
137+
The ``json_extract_array`` is deprecated and will be removed in a future version.
138+
Use ``json_query_array`` instead.
139+
136140
**Examples:**
137141
138142
>>> import bigframes.pandas as bpd
@@ -172,6 +176,11 @@ def json_extract_array(
172176
Returns:
173177
bigframes.series.Series: A new Series with the parsed arrays from the input.
174178
"""
179+
msg = (
180+
"The `json_extract_array` is deprecated and will be removed in a future version. "
181+
"Use `json_query_array` instead."
182+
)
183+
warnings.warn(bfe.format_message(msg), category=UserWarning)
175184
return input._apply_unary_op(ops.JSONExtractArray(json_path=json_path))
176185

177186

@@ -273,6 +282,56 @@ def json_query(
273282
return input._apply_unary_op(ops.JSONQuery(json_path=json_path))
274283

275284

285+
def json_query_array(
286+
input: series.Series,
287+
json_path: str = "$",
288+
) -> series.Series:
289+
"""Extracts a JSON array and converts it to a SQL array of JSON-formatted
290+
`STRING` or `JSON` values. This function uses double quotes to escape invalid
291+
JSONPath characters in JSON keys. For example: `"a.b"`.
292+
293+
**Examples:**
294+
295+
>>> import bigframes.pandas as bpd
296+
>>> import bigframes.bigquery as bbq
297+
>>> bpd.options.display.progress_bar = None
298+
299+
>>> s = bpd.Series(['[1, 2, 3]', '[4, 5]'])
300+
>>> bbq.json_query_array(s)
301+
0 ['1' '2' '3']
302+
1 ['4' '5']
303+
dtype: list<item: string>[pyarrow]
304+
305+
>>> s = bpd.Series([
306+
... '{"fruits": [{"name": "apple"}, {"name": "cherry"}]}',
307+
... '{"fruits": [{"name": "guava"}, {"name": "grapes"}]}'
308+
... ])
309+
>>> bbq.json_query_array(s, "$.fruits")
310+
0 ['{"name":"apple"}' '{"name":"cherry"}']
311+
1 ['{"name":"guava"}' '{"name":"grapes"}']
312+
dtype: list<item: string>[pyarrow]
313+
314+
>>> s = bpd.Series([
315+
... '{"fruits": {"color": "red", "names": ["apple","cherry"]}}',
316+
... '{"fruits": {"color": "green", "names": ["guava", "grapes"]}}'
317+
... ])
318+
>>> bbq.json_query_array(s, "$.fruits.names")
319+
0 ['"apple"' '"cherry"']
320+
1 ['"guava"' '"grapes"']
321+
dtype: list<item: string>[pyarrow]
322+
323+
Args:
324+
input (bigframes.series.Series):
325+
The Series containing JSON data (as native JSON objects or JSON-formatted strings).
326+
json_path (str):
327+
The JSON path identifying the data that you want to obtain from the input.
328+
329+
Returns:
330+
bigframes.series.Series: A new Series with the parsed arrays from the input.
331+
"""
332+
return input._apply_unary_op(ops.JSONQueryArray(json_path=json_path))
333+
334+
276335
def json_value(
277336
input: series.Series,
278337
json_path: str,

bigframes/core/compile/scalar_op_compiler.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1379,6 +1379,19 @@ def json_query(json_or_json_string, json_path: ibis_dtypes.str): # type: ignore
13791379
return json_query_op(json_or_json_string=x, json_path=op.json_path)
13801380

13811381

1382+
@scalar_op_compiler.register_unary_op(ops.JSONQueryArray, pass_op=True)
1383+
def json_query_array_op_impl(x: ibis_types.Value, op: ops.JSONQueryArray):
1384+
# Define a user-defined function whose return type is an array of the input's type, resolved dynamically.
1385+
def json_query_array(json_or_json_string, json_path: ibis_dtypes.str): # type: ignore
1386+
"""Extracts a JSON array and converts it to a SQL array of JSON-formatted STRING or JSON values."""
1387+
...
1388+
1389+
return_type = x.type()
1390+
json_query_array.__annotations__["return"] = ibis_dtypes.Array[return_type] # type: ignore
1391+
json_query_op = ibis_udf.scalar.builtin(json_query_array)
1392+
return json_query_op(json_or_json_string=x, json_path=op.json_path)
1393+
1394+
13821395
@scalar_op_compiler.register_unary_op(ops.ParseJSON, pass_op=True)
13831396
def parse_json_op_impl(x: ibis_types.Value, op: ops.ParseJSON):
13841397
return parse_json(json_str=x)

bigframes/operations/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@
109109
JSONExtractArray,
110110
JSONExtractStringArray,
111111
JSONQuery,
112+
JSONQueryArray,
112113
JSONSet,
113114
JSONValue,
114115
ParseJSON,
@@ -359,6 +360,7 @@
359360
"JSONExtractArray",
360361
"JSONExtractStringArray",
361362
"JSONQuery",
363+
"JSONQueryArray",
362364
"JSONSet",
363365
"JSONValue",
364366
"ParseJSON",

bigframes/operations/json_ops.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,23 @@ def output_type(self, *input_types):
3737
return input_type
3838

3939

40+
@dataclasses.dataclass(frozen=True)
41+
class JSONQueryArray(base_ops.UnaryOp):
42+
name: typing.ClassVar[str] = "json_query_array"
43+
json_path: str
44+
45+
def output_type(self, *input_types):
46+
input_type = input_types[0]
47+
if not dtypes.is_json_like(input_type):
48+
raise TypeError(
49+
"Input type must be a valid JSON object or JSON-formatted string type."
50+
+ f" Received type: {input_type}"
51+
)
52+
return pd.ArrowDtype(
53+
pa.list_(dtypes.bigframes_dtype_to_arrow_dtype(input_type))
54+
)
55+
56+
4057
@dataclasses.dataclass(frozen=True)
4158
class JSONExtractArray(base_ops.UnaryOp):
4259
name: typing.ClassVar[str] = "json_extract_array"

tests/system/small/bigquery/test_json.py

Lines changed: 62 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,8 @@ def test_json_extract_array_from_json():
128128
['{"a": ["ab", "2", "3 xy"]}', '{"a": []}', '{"a": ["4", "5"]}', "{}"],
129129
dtype=dtypes.JSON_DTYPE,
130130
)
131-
actual = bbq.json_extract_array(s, "$.a")
131+
with pytest.warns(UserWarning, match="The `json_extract_array` is deprecated"):
132+
actual = bbq.json_extract_array(s, "$.a")
132133

133134
# This code provides a workaround for issue https://github.com/apache/arrow/issues/45262,
134135
# which currently prevents constructing a series using the pa.list_(db_types.JSONArrowType())
@@ -241,6 +242,66 @@ def test_json_query_w_invalid_series_type():
241242
bbq.json_query(s, "$.a")
242243

243244

245+
def test_json_query_array_from_json():
246+
s = bpd.Series(
247+
['{"a": ["ab", "2", "3 xy"]}', '{"a": []}', '{"a": ["4", "5"]}', "{}"],
248+
dtype=dtypes.JSON_DTYPE,
249+
)
250+
actual = bbq.json_query_array(s, "$.a")
251+
252+
# This code provides a workaround for issue https://github.com/apache/arrow/issues/45262,
253+
# which currently prevents constructing a series using the pa.list_(db_types.JSONArrowType())
254+
sql = """
255+
SELECT 0 AS id, [JSON '"ab"', JSON '"2"', JSON '"3 xy"'] AS data,
256+
UNION ALL
257+
SELECT 1, [],
258+
UNION ALL
259+
SELECT 2, [JSON '"4"', JSON '"5"'],
260+
UNION ALL
261+
SELECT 3, null,
262+
"""
263+
df = bpd.read_gbq(sql).set_index("id").sort_index()
264+
expected = df["data"]
265+
expected.index.name = None
266+
expected.name = None
267+
268+
pd.testing.assert_series_equal(actual.to_pandas(), expected.to_pandas())
269+
270+
271+
def test_json_query_array_from_json_strings():
272+
s = bpd.Series(
273+
['{"a": ["ab", "2", "3 xy"]}', '{"a": []}', '{"a": ["4","5"]}', "{}"],
274+
dtype=pd.StringDtype(storage="pyarrow"),
275+
)
276+
actual = bbq.json_query_array(s, "$.a")
277+
expected = bpd.Series(
278+
[['"ab"', '"2"', '"3 xy"'], [], ['"4"', '"5"'], None],
279+
dtype=pd.ArrowDtype(pa.list_(pa.string())),
280+
)
281+
282+
pd.testing.assert_series_equal(actual.to_pandas(), expected.to_pandas())
283+
284+
285+
def test_json_query_array_from_json_array_strings():
286+
s = bpd.Series(
287+
["[1, 2, 3]", "[]", "[4,5]"],
288+
dtype=pd.StringDtype(storage="pyarrow"),
289+
)
290+
actual = bbq.json_query_array(s)
291+
expected = bpd.Series(
292+
[["1", "2", "3"], [], ["4", "5"]],
293+
dtype=pd.ArrowDtype(pa.list_(pa.string())),
294+
)
295+
296+
pd.testing.assert_series_equal(actual.to_pandas(), expected.to_pandas())
297+
298+
299+
def test_json_query_array_w_invalid_series_type():
300+
s = bpd.Series([1, 2])
301+
with pytest.raises(TypeError):
302+
bbq.json_query_array(s)
303+
304+
244305
def test_json_value_from_json():
245306
s = bpd.Series(
246307
['{"a": {"b": [1, 2]}}', '{"a": {"c": 1}}', '{"a": {"b": 0}}'],

0 commit comments

Comments
 (0)