Skip to content

test: fix JSON tests failing locally when allow_large_results is disabled #1523

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Mar 21, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 50 additions & 41 deletions tests/system/small/bigquery/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,6 @@
import bigframes.pandas as bpd


@pytest.fixture(scope="module", autouse=True)
def use_large_query_path():
    """Enable ``bigquery.allow_large_results`` while this module's tests run.

    The fixture is module-scoped and autouse, so the option is switched on
    once before the tests and the context is exited after them.
    See b/401630655.
    """
    large_results_enabled = bpd.option_context("bigquery.allow_large_results", True)
    with large_results_enabled:
        yield


@pytest.mark.parametrize(
("json_path", "expected_json"),
[
Expand All @@ -39,12 +32,14 @@ def use_large_query_path():
def test_json_set_at_json_path(json_path, expected_json):
    """Check that ``json_set`` writes the value 10 at ``json_path``.

    Args:
        json_path: JSON path at which the value 10 is set.
        expected_json: JSON strings expected after the update.
    """
    original_json = ['{"a": {"b": {"c": "tester", "d": []}}}']
    s = bpd.Series(original_json, dtype=dtypes.JSON_DTYPE)

    actual = bbq.json_set(s, json_path_value_pairs=[(json_path, 10)])
    expected = bpd.Series(expected_json, dtype=dtypes.JSON_DTYPE)

    # Exactly two positional arguments: any further positional values would be
    # taken as assert_series_equal's check_* flags rather than as data.
    # TODO(b/401630655): JSON is not compatible with allow_large_results=False
    pd.testing.assert_series_equal(
        actual.to_pandas(allow_large_results=True),
        expected.to_pandas(allow_large_results=True),
    )


Expand All @@ -63,11 +58,12 @@ def test_json_set_at_json_value_type(json_value, expected_json):
original_json = ['{"a": {"b": "dev"}}', '{"a": {"b": [1, 2]}}']
s = bpd.Series(original_json, dtype=dtypes.JSON_DTYPE)
actual = bbq.json_set(s, json_path_value_pairs=[("$.a.b", json_value)])

expected = bpd.Series(expected_json, dtype=dtypes.JSON_DTYPE)

# TODO(b/401630655): JSON is not compatible with allow_large_results=False
pd.testing.assert_series_equal(
actual.to_pandas(),
expected.to_pandas(),
actual.to_pandas(allow_large_results=True),
expected.to_pandas(allow_large_results=True),
)


Expand All @@ -80,18 +76,14 @@ def test_json_set_w_more_pairs():

expected_json = ['{"a": 3, "b": 2}', '{"a": 4, "b": 2}', '{"a": 5, "b": 2, "c": 1}']
expected = bpd.Series(expected_json, dtype=dtypes.JSON_DTYPE)

# TODO(b/401630655): JSON is not compatible with allow_large_results=False
pd.testing.assert_series_equal(
actual.to_pandas(),
expected.to_pandas(),
actual.to_pandas(allow_large_results=True),
expected.to_pandas(allow_large_results=True),
)


def test_json_set_w_invalid_json_path_value_pairs():
    """Expect ``json_set`` to raise ValueError for a three-element pair entry."""
    # One entry carrying three items instead of a (path, value) pair.
    malformed_pairs = [("$.a", 1, 100)]
    json_series = bpd.Series(['{"a": 10}'], dtype=dtypes.JSON_DTYPE)
    with pytest.raises(ValueError):
        bbq.json_set(json_series, json_path_value_pairs=malformed_pairs)  # type: ignore


def test_json_set_w_invalid_value_type():
s = bpd.Series(['{"a": 10}'], dtype=dtypes.JSON_DTYPE)
with pytest.raises(TypeError):
Expand Down Expand Up @@ -119,11 +111,13 @@ def test_json_extract_from_json():
['{"a": {"b": [1, 2]}}', '{"a": {"c": 1}}', '{"a": {"b": 0}}'],
dtype=dtypes.JSON_DTYPE,
)
actual = bbq.json_extract(s, "$.a.b").to_pandas()
expected = bpd.Series(["[1, 2]", None, "0"], dtype=dtypes.JSON_DTYPE).to_pandas()
actual = bbq.json_extract(s, "$.a.b")
expected = bpd.Series(["[1, 2]", None, "0"], dtype=dtypes.JSON_DTYPE)

# TODO(b/401630655): JSON is not compatible with allow_large_results=False
pd.testing.assert_series_equal(
actual,
expected,
actual.to_pandas(allow_large_results=True),
expected.to_pandas(allow_large_results=True),
)


Expand All @@ -134,9 +128,11 @@ def test_json_extract_from_string():
)
actual = bbq.json_extract(s, "$.a.b")
expected = bpd.Series(["[1,2]", None, "0"], dtype=pd.StringDtype(storage="pyarrow"))

# TODO(b/401630655): JSON is not compatible with allow_large_results=False
pd.testing.assert_series_equal(
actual.to_pandas(),
expected.to_pandas(),
actual.to_pandas(allow_large_results=True),
expected.to_pandas(allow_large_results=True),
)


Expand Down Expand Up @@ -169,9 +165,10 @@ def test_json_extract_array_from_json():
expected.index.name = None
expected.name = None

# TODO(b/401630655): JSON is not compatible with allow_large_results=False
pd.testing.assert_series_equal(
actual.to_pandas(),
expected.to_pandas(),
actual.to_pandas(allow_large_results=True),
expected.to_pandas(allow_large_results=True),
)


Expand All @@ -185,9 +182,11 @@ def test_json_extract_array_from_json_strings():
[['"ab"', '"2"', '"3 xy"'], [], ['"4"', '"5"'], None],
dtype=pd.ArrowDtype(pa.list_(pa.string())),
)

# TODO(b/401630655): JSON is not compatible with allow_large_results=False
pd.testing.assert_series_equal(
actual.to_pandas(),
expected.to_pandas(),
actual.to_pandas(allow_large_results=True),
expected.to_pandas(allow_large_results=True),
)


Expand All @@ -201,9 +200,11 @@ def test_json_extract_array_from_json_array_strings():
[["1", "2", "3"], [], ["4", "5"]],
dtype=pd.ArrowDtype(pa.list_(pa.string())),
)

# TODO(b/401630655): JSON is not compatible with allow_large_results=False
pd.testing.assert_series_equal(
actual.to_pandas(),
expected.to_pandas(),
actual.to_pandas(allow_large_results=True),
expected.to_pandas(allow_large_results=True),
)


Expand All @@ -217,37 +218,45 @@ def test_json_extract_string_array_from_json_strings():
s = bpd.Series(['{"a": ["ab", "2", "3 xy"]}', '{"a": []}', '{"a": ["4","5"]}'])
actual = bbq.json_extract_string_array(s, "$.a")
expected = bpd.Series([["ab", "2", "3 xy"], [], ["4", "5"]])

# TODO(b/401630655): JSON is not compatible with allow_large_results=False
pd.testing.assert_series_equal(
actual.to_pandas(),
expected.to_pandas(),
actual.to_pandas(allow_large_results=True),
expected.to_pandas(allow_large_results=True),
)


def test_json_extract_string_array_from_array_strings():
    """Extract string arrays from JSON-array strings without an explicit path."""
    s = bpd.Series(["[1, 2, 3]", "[]", "[4,5]"])
    actual = bbq.json_extract_string_array(s)
    expected = bpd.Series([["1", "2", "3"], [], ["4", "5"]])

    # Exactly two positional arguments: any further positional values would be
    # taken as assert_series_equal's check_* flags rather than as data.
    # TODO(b/401630655): JSON is not compatible with allow_large_results=False
    pd.testing.assert_series_equal(
        actual.to_pandas(allow_large_results=True),
        expected.to_pandas(allow_large_results=True),
    )


def test_json_extract_string_array_as_float_array_from_array_strings():
    """Extract arrays from JSON-array strings with ``value_dtype`` set to float."""
    s = bpd.Series(["[1, 2.5, 3]", "[]", "[4,5]"])
    actual = bbq.json_extract_string_array(s, value_dtype=dtypes.FLOAT_DTYPE)
    expected = bpd.Series([[1, 2.5, 3], [], [4, 5]])

    # Exactly two positional arguments: any further positional values would be
    # taken as assert_series_equal's check_* flags rather than as data.
    # TODO(b/401630655): JSON is not compatible with allow_large_results=False
    pd.testing.assert_series_equal(
        actual.to_pandas(allow_large_results=True),
        expected.to_pandas(allow_large_results=True),
    )


def test_json_extract_string_array_w_invalid_series_type():
    """Expect ``json_extract_string_array`` to raise TypeError for an integer Series."""
    s = bpd.Series([1, 2])
    # Keep a single call in the block: anything after the raising call would
    # never execute.
    with pytest.raises(TypeError):
        bbq.json_extract_string_array(s)


def test_parse_json_w_invalid_series_type():
    """Expect ``parse_json`` to raise TypeError for an integer Series."""
    s = bpd.Series([1, 2])
    # Keep a single call in the block: anything after the raising call would
    # never execute.
    with pytest.raises(TypeError):
        bbq.parse_json(s)
138 changes: 0 additions & 138 deletions tests/system/small/test_dataframe_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@

from typing import Tuple

import db_dtypes # type:ignore
import google.api_core.exceptions
import pandas as pd
import pandas.testing
Expand Down Expand Up @@ -281,143 +280,6 @@ def test_to_arrow_override_global_option(scalars_df_index):
assert scalars_df_index._query_job.destination.table_id == table_id


def test_load_json_w_json_string_items(session):
    """Read JSON objects of several value kinds and compare their string form.

    Each row of the query holds one JSON object (boolean, int, float, string,
    array, null, two-key object, nested object); the column must load with the
    JSON Arrow dtype and each row must equal the expected compact JSON string.
    """
    sql = """
SELECT 0 AS id, JSON_OBJECT('boolean', True) AS json_col,
UNION ALL
SELECT 1, JSON_OBJECT('int', 100),
UNION ALL
SELECT 2, JSON_OBJECT('float', 0.98),
UNION ALL
SELECT 3, JSON_OBJECT('string', 'hello world'),
UNION ALL
SELECT 4, JSON_OBJECT('array', [8, 9, 10]),
UNION ALL
SELECT 5, JSON_OBJECT('null', null),
UNION ALL
SELECT 6, JSON_OBJECT('b', 2, 'a', 1),
UNION ALL
SELECT
7,
JSON_OBJECT(
'dict',
JSON_OBJECT(
'int', 1,
'array', [JSON_OBJECT('foo', 1), JSON_OBJECT('bar', 'hello')]
)
),
"""
    df = session.read_gbq(sql, index_col="id")

    assert df.dtypes["json_col"] == pd.ArrowDtype(db_dtypes.JSONArrowType())

    expected_by_id = {
        0: '{"boolean":true}',
        1: '{"int":100}',
        2: '{"float":0.98}',
        3: '{"string":"hello world"}',
        4: '{"array":[8,9,10]}',
        5: '{"null":null}',
        # Rows 6 and 7 supply their keys as (b, a) and (int, array); the
        # expected strings list them as (a, b) and (array, int), while the
        # array elements keep the order given in the query.
        6: '{"a":1,"b":2}',
        7: '{"dict":{"array":[{"foo":1},{"bar":"hello"}],"int":1}}',
    }
    for row_id, expected_json in expected_by_id.items():
        assert df["json_col"][row_id] == expected_json


def test_load_json_to_pandas_has_correct_result(session):
    """Check that a JSON column downloaded with ``to_pandas`` matches pandas data.

    The downloaded frame is compared with a locally built DataFrame on both
    the column dtypes and the ``json_col`` values.
    """
    df = session.read_gbq("SELECT JSON_OBJECT('foo', 10, 'bar', TRUE) AS json_col")
    assert df.dtypes["json_col"] == pd.ArrowDtype(db_dtypes.JSONArrowType())
    downloaded = df.to_pandas()

    # Locally built reference frame; the query supplies the keys as
    # (foo, bar) and the reference string lists them as (bar, foo).
    expected_values = {"json_col": ['{"bar":true,"foo":10}']}
    expected_df = pd.DataFrame(
        expected_values,
        dtype=pd.ArrowDtype(db_dtypes.JSONArrowType()),
    )
    # Cast the default index to "Int64" before comparing.
    expected_df.index = expected_df.index.astype("Int64")

    pd.testing.assert_series_equal(downloaded.dtypes, expected_df.dtypes)
    pd.testing.assert_series_equal(downloaded["json_col"], expected_df["json_col"])


def test_load_json_in_struct(session):
    """Avoid regressions for internal issue 381148539.

    Loads a STRUCT column whose ``data`` field is JSON and checks the struct
    dtype, the JSON dtype of the field, and each row's JSON string.
    """
    sql = """
SELECT 0 AS id, STRUCT(JSON_OBJECT('boolean', True) AS data, 1 AS number) AS struct_col
UNION ALL
SELECT 1, STRUCT(JSON_OBJECT('int', 100), 2),
UNION ALL
SELECT 2, STRUCT(JSON_OBJECT('float', 0.98), 3),
UNION ALL
SELECT 3, STRUCT(JSON_OBJECT('string', 'hello world'), 4),
UNION ALL
SELECT 4, STRUCT(JSON_OBJECT('array', [8, 9, 10]), 5),
UNION ALL
SELECT 5, STRUCT(JSON_OBJECT('null', null), 6),
UNION ALL
SELECT
6,
STRUCT(JSON_OBJECT(
'dict',
JSON_OBJECT(
'int', 1,
'array', [JSON_OBJECT('foo', 1), JSON_OBJECT('bar', 'hello')]
)
), 7),
"""
    df = session.read_gbq(sql, index_col="id")

    # The column must be an Arrow-backed struct.
    assert isinstance(df.dtypes["struct_col"], pd.ArrowDtype)
    assert isinstance(df.dtypes["struct_col"].pyarrow_dtype, pa.StructType)

    # The "data" field of the struct carries the JSON values.
    data = df["struct_col"].struct.field("data")
    assert data.dtype == pd.ArrowDtype(db_dtypes.JSONArrowType())

    # Expected JSON string per row, in id order 0..6.
    expected_json_by_row = [
        '{"boolean":true}',
        '{"int":100}',
        '{"float":0.98}',
        '{"string":"hello world"}',
        '{"array":[8,9,10]}',
        '{"null":null}',
        '{"dict":{"array":[{"foo":1},{"bar":"hello"}],"int":1}}',
    ]
    for row_id, expected_json in enumerate(expected_json_by_row):
        assert data[row_id] == expected_json


def test_load_json_in_array(session):
    """Load an ARRAY of JSON values and check each element's JSON string.

    The single row (id 0) holds seven JSON objects; the test checks the list
    dtype, the element count, the JSON dtype of the elements, and the string
    form of every element.
    """
    sql = """
SELECT
0 AS id,
[
JSON_OBJECT('boolean', True),
JSON_OBJECT('int', 100),
JSON_OBJECT('float', 0.98),
JSON_OBJECT('string', 'hello world'),
JSON_OBJECT('array', [8, 9, 10]),
JSON_OBJECT('null', null),
JSON_OBJECT(
'dict',
JSON_OBJECT(
'int', 1,
'array', [JSON_OBJECT('bar', 'hello'), JSON_OBJECT('foo', 1)]
)
)
] AS array_col,
"""
    df = session.read_gbq(sql, index_col="id")

    # The column must be an Arrow-backed list.
    assert isinstance(df.dtypes["array_col"], pd.ArrowDtype)
    assert isinstance(df.dtypes["array_col"].pyarrow_dtype, pa.ListType)

    data = df["array_col"].list
    assert data.len()[0] == 7
    assert data[0].dtype == pd.ArrowDtype(db_dtypes.JSONArrowType())

    # Expected JSON string for each array position of row id 0.
    expected_json_by_position = [
        '{"boolean":true}',
        '{"int":100}',
        '{"float":0.98}',
        '{"string":"hello world"}',
        '{"array":[8,9,10]}',
        '{"null":null}',
        '{"dict":{"array":[{"bar":"hello"},{"foo":1}],"int":1}}',
    ]
    for position, expected_json in enumerate(expected_json_by_position):
        assert data[position][0] == expected_json


def test_to_pandas_batches_w_correct_dtypes(scalars_df_default_index):
"""Verify to_pandas_batches() APIs returns the expected dtypes."""
expected = scalars_df_default_index.dtypes
Expand Down
Loading