diff --git a/tests/system/small/bigquery/test_json.py b/tests/system/small/bigquery/test_json.py index 16f66dae57..57fc878643 100644 --- a/tests/system/small/bigquery/test_json.py +++ b/tests/system/small/bigquery/test_json.py @@ -22,13 +22,6 @@ import bigframes.pandas as bpd -@pytest.fixture(scope="module", autouse=True) -def use_large_query_path(): - # b/401630655 - with bpd.option_context("bigquery.allow_large_results", True): - yield - - @pytest.mark.parametrize( ("json_path", "expected_json"), [ @@ -39,12 +32,14 @@ def use_large_query_path(): def test_json_set_at_json_path(json_path, expected_json): original_json = ['{"a": {"b": {"c": "tester", "d": []}}}'] s = bpd.Series(original_json, dtype=dtypes.JSON_DTYPE) - actual = bbq.json_set(s, json_path_value_pairs=[(json_path, 10)]) + actual = bbq.json_set(s, json_path_value_pairs=[(json_path, 10)]) expected = bpd.Series(expected_json, dtype=dtypes.JSON_DTYPE) + + # TODO(b/401630655): JSON is not compatible with allow_large_results=False pd.testing.assert_series_equal( - actual.to_pandas(), - expected.to_pandas(), + actual.to_pandas(allow_large_results=True), + expected.to_pandas(allow_large_results=True), ) @@ -63,11 +58,12 @@ def test_json_set_at_json_value_type(json_value, expected_json): original_json = ['{"a": {"b": "dev"}}', '{"a": {"b": [1, 2]}}'] s = bpd.Series(original_json, dtype=dtypes.JSON_DTYPE) actual = bbq.json_set(s, json_path_value_pairs=[("$.a.b", json_value)]) - expected = bpd.Series(expected_json, dtype=dtypes.JSON_DTYPE) + + # TODO(b/401630655): JSON is not compatible with allow_large_results=False pd.testing.assert_series_equal( - actual.to_pandas(), - expected.to_pandas(), + actual.to_pandas(allow_large_results=True), + expected.to_pandas(allow_large_results=True), ) @@ -80,18 +76,14 @@ def test_json_set_w_more_pairs(): expected_json = ['{"a": 3, "b": 2}', '{"a": 4, "b": 2}', '{"a": 5, "b": 2, "c": 1}'] expected = bpd.Series(expected_json, dtype=dtypes.JSON_DTYPE) + + # TODO(b/401630655): 
JSON is not compatible with allow_large_results=False pd.testing.assert_series_equal( - actual.to_pandas(), - expected.to_pandas(), + actual.to_pandas(allow_large_results=True), + expected.to_pandas(allow_large_results=True), ) -def test_json_set_w_invalid_json_path_value_pairs(): - s = bpd.Series(['{"a": 10}'], dtype=dtypes.JSON_DTYPE) - with pytest.raises(ValueError): - bbq.json_set(s, json_path_value_pairs=[("$.a", 1, 100)]) # type: ignore - - def test_json_set_w_invalid_value_type(): s = bpd.Series(['{"a": 10}'], dtype=dtypes.JSON_DTYPE) with pytest.raises(TypeError): @@ -119,11 +111,13 @@ def test_json_extract_from_json(): ['{"a": {"b": [1, 2]}}', '{"a": {"c": 1}}', '{"a": {"b": 0}}'], dtype=dtypes.JSON_DTYPE, ) - actual = bbq.json_extract(s, "$.a.b").to_pandas() - expected = bpd.Series(["[1, 2]", None, "0"], dtype=dtypes.JSON_DTYPE).to_pandas() + actual = bbq.json_extract(s, "$.a.b") + expected = bpd.Series(["[1, 2]", None, "0"], dtype=dtypes.JSON_DTYPE) + + # TODO(b/401630655): JSON is not compatible with allow_large_results=False pd.testing.assert_series_equal( - actual, - expected, + actual.to_pandas(allow_large_results=True), + expected.to_pandas(allow_large_results=True), ) @@ -134,9 +128,11 @@ def test_json_extract_from_string(): ) actual = bbq.json_extract(s, "$.a.b") expected = bpd.Series(["[1,2]", None, "0"], dtype=pd.StringDtype(storage="pyarrow")) + + # TODO(b/401630655): JSON is not compatible with allow_large_results=False pd.testing.assert_series_equal( - actual.to_pandas(), - expected.to_pandas(), + actual.to_pandas(allow_large_results=True), + expected.to_pandas(allow_large_results=True), ) @@ -169,9 +165,10 @@ def test_json_extract_array_from_json(): expected.index.name = None expected.name = None + # TODO(b/401630655): JSON is not compatible with allow_large_results=False pd.testing.assert_series_equal( - actual.to_pandas(), - expected.to_pandas(), + actual.to_pandas(allow_large_results=True), + expected.to_pandas(allow_large_results=True), 
) @@ -185,9 +182,11 @@ def test_json_extract_array_from_json_strings(): [['"ab"', '"2"', '"3 xy"'], [], ['"4"', '"5"'], None], dtype=pd.ArrowDtype(pa.list_(pa.string())), ) + + # TODO(b/401630655): JSON is not compatible with allow_large_results=False pd.testing.assert_series_equal( - actual.to_pandas(), - expected.to_pandas(), + actual.to_pandas(allow_large_results=True), + expected.to_pandas(allow_large_results=True), ) @@ -201,9 +200,11 @@ def test_json_extract_array_from_json_array_strings(): [["1", "2", "3"], [], ["4", "5"]], dtype=pd.ArrowDtype(pa.list_(pa.string())), ) + + # TODO(b/401630655): JSON is not compatible with allow_large_results=False pd.testing.assert_series_equal( - actual.to_pandas(), - expected.to_pandas(), + actual.to_pandas(allow_large_results=True), + expected.to_pandas(allow_large_results=True), ) @@ -217,9 +218,11 @@ def test_json_extract_string_array_from_json_strings(): s = bpd.Series(['{"a": ["ab", "2", "3 xy"]}', '{"a": []}', '{"a": ["4","5"]}']) actual = bbq.json_extract_string_array(s, "$.a") expected = bpd.Series([["ab", "2", "3 xy"], [], ["4", "5"]]) + + # TODO(b/401630655): JSON is not compatible with allow_large_results=False pd.testing.assert_series_equal( - actual.to_pandas(), - expected.to_pandas(), + actual.to_pandas(allow_large_results=True), + expected.to_pandas(allow_large_results=True), ) @@ -227,9 +230,11 @@ def test_json_extract_string_array_from_array_strings(): s = bpd.Series(["[1, 2, 3]", "[]", "[4,5]"]) actual = bbq.json_extract_string_array(s) expected = bpd.Series([["1", "2", "3"], [], ["4", "5"]]) + + # TODO(b/401630655): JSON is not compatible with allow_large_results=False pd.testing.assert_series_equal( - actual.to_pandas(), - expected.to_pandas(), + actual.to_pandas(allow_large_results=True), + expected.to_pandas(allow_large_results=True), ) @@ -237,17 +242,21 @@ def test_json_extract_string_array_as_float_array_from_array_strings(): s = bpd.Series(["[1, 2.5, 3]", "[]", "[4,5]"]) actual = 
bbq.json_extract_string_array(s, value_dtype=dtypes.FLOAT_DTYPE) expected = bpd.Series([[1, 2.5, 3], [], [4, 5]]) + + # TODO(b/401630655): JSON is not compatible with allow_large_results=False pd.testing.assert_series_equal( - actual.to_pandas(), - expected.to_pandas(), + actual.to_pandas(allow_large_results=True), + expected.to_pandas(allow_large_results=True), ) def test_json_extract_string_array_w_invalid_series_type(): + s = bpd.Series([1, 2]) with pytest.raises(TypeError): - bbq.json_extract_string_array(bpd.Series([1, 2])) + bbq.json_extract_string_array(s) def test_parse_json_w_invalid_series_type(): + s = bpd.Series([1, 2]) with pytest.raises(TypeError): - bbq.parse_json(bpd.Series([1, 2])) + bbq.parse_json(s) diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index d2ba96b41f..cd21f5094c 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -14,7 +14,6 @@ from typing import Tuple -import db_dtypes # type:ignore import google.api_core.exceptions import pandas as pd import pandas.testing @@ -281,143 +280,6 @@ def test_to_arrow_override_global_option(scalars_df_index): assert scalars_df_index._query_job.destination.table_id == table_id -def test_load_json_w_json_string_items(session): - sql = """ - SELECT 0 AS id, JSON_OBJECT('boolean', True) AS json_col, - UNION ALL - SELECT 1, JSON_OBJECT('int', 100), - UNION ALL - SELECT 2, JSON_OBJECT('float', 0.98), - UNION ALL - SELECT 3, JSON_OBJECT('string', 'hello world'), - UNION ALL - SELECT 4, JSON_OBJECT('array', [8, 9, 10]), - UNION ALL - SELECT 5, JSON_OBJECT('null', null), - UNION ALL - SELECT 6, JSON_OBJECT('b', 2, 'a', 1), - UNION ALL - SELECT - 7, - JSON_OBJECT( - 'dict', - JSON_OBJECT( - 'int', 1, - 'array', [JSON_OBJECT('foo', 1), JSON_OBJECT('bar', 'hello')] - ) - ), - """ - df = session.read_gbq(sql, index_col="id") - - assert df.dtypes["json_col"] == pd.ArrowDtype(db_dtypes.JSONArrowType()) - - assert 
df["json_col"][0] == '{"boolean":true}' - assert df["json_col"][1] == '{"int":100}' - assert df["json_col"][2] == '{"float":0.98}' - assert df["json_col"][3] == '{"string":"hello world"}' - assert df["json_col"][4] == '{"array":[8,9,10]}' - assert df["json_col"][5] == '{"null":null}' - - # Verifies JSON strings preserve array order, regardless of dictionary key order. - assert df["json_col"][6] == '{"a":1,"b":2}' - assert df["json_col"][7] == '{"dict":{"array":[{"foo":1},{"bar":"hello"}],"int":1}}' - - -def test_load_json_to_pandas_has_correct_result(session): - df = session.read_gbq("SELECT JSON_OBJECT('foo', 10, 'bar', TRUE) AS json_col") - assert df.dtypes["json_col"] == pd.ArrowDtype(db_dtypes.JSONArrowType()) - result = df.to_pandas() - - # These JSON strings are compatible with BigQuery's JSON storage, - pd_df = pd.DataFrame( - {"json_col": ['{"bar":true,"foo":10}']}, - dtype=pd.ArrowDtype(db_dtypes.JSONArrowType()), - ) - pd_df.index = pd_df.index.astype("Int64") - pd.testing.assert_series_equal(result.dtypes, pd_df.dtypes) - pd.testing.assert_series_equal(result["json_col"], pd_df["json_col"]) - - -def test_load_json_in_struct(session): - """Avoid regressions for internal issue 381148539.""" - sql = """ - SELECT 0 AS id, STRUCT(JSON_OBJECT('boolean', True) AS data, 1 AS number) AS struct_col - UNION ALL - SELECT 1, STRUCT(JSON_OBJECT('int', 100), 2), - UNION ALL - SELECT 2, STRUCT(JSON_OBJECT('float', 0.98), 3), - UNION ALL - SELECT 3, STRUCT(JSON_OBJECT('string', 'hello world'), 4), - UNION ALL - SELECT 4, STRUCT(JSON_OBJECT('array', [8, 9, 10]), 5), - UNION ALL - SELECT 5, STRUCT(JSON_OBJECT('null', null), 6), - UNION ALL - SELECT - 6, - STRUCT(JSON_OBJECT( - 'dict', - JSON_OBJECT( - 'int', 1, - 'array', [JSON_OBJECT('foo', 1), JSON_OBJECT('bar', 'hello')] - ) - ), 7), - """ - df = session.read_gbq(sql, index_col="id") - - assert isinstance(df.dtypes["struct_col"], pd.ArrowDtype) - assert isinstance(df.dtypes["struct_col"].pyarrow_dtype, pa.StructType) - 
- data = df["struct_col"].struct.field("data") - assert data.dtype == pd.ArrowDtype(db_dtypes.JSONArrowType()) - - assert data[0] == '{"boolean":true}' - assert data[1] == '{"int":100}' - assert data[2] == '{"float":0.98}' - assert data[3] == '{"string":"hello world"}' - assert data[4] == '{"array":[8,9,10]}' - assert data[5] == '{"null":null}' - assert data[6] == '{"dict":{"array":[{"foo":1},{"bar":"hello"}],"int":1}}' - - -def test_load_json_in_array(session): - sql = """ - SELECT - 0 AS id, - [ - JSON_OBJECT('boolean', True), - JSON_OBJECT('int', 100), - JSON_OBJECT('float', 0.98), - JSON_OBJECT('string', 'hello world'), - JSON_OBJECT('array', [8, 9, 10]), - JSON_OBJECT('null', null), - JSON_OBJECT( - 'dict', - JSON_OBJECT( - 'int', 1, - 'array', [JSON_OBJECT('bar', 'hello'), JSON_OBJECT('foo', 1)] - ) - ) - ] AS array_col, - """ - df = session.read_gbq(sql, index_col="id") - - assert isinstance(df.dtypes["array_col"], pd.ArrowDtype) - assert isinstance(df.dtypes["array_col"].pyarrow_dtype, pa.ListType) - - data = df["array_col"].list - assert data.len()[0] == 7 - assert data[0].dtype == pd.ArrowDtype(db_dtypes.JSONArrowType()) - - assert data[0][0] == '{"boolean":true}' - assert data[1][0] == '{"int":100}' - assert data[2][0] == '{"float":0.98}' - assert data[3][0] == '{"string":"hello world"}' - assert data[4][0] == '{"array":[8,9,10]}' - assert data[5][0] == '{"null":null}' - assert data[6][0] == '{"dict":{"array":[{"bar":"hello"},{"foo":1}],"int":1}}' - - def test_to_pandas_batches_w_correct_dtypes(scalars_df_default_index): """Verify to_pandas_batches() APIs returns the expected dtypes.""" expected = scalars_df_default_index.dtypes diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index 663e5e2f10..e286c40450 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -22,6 +22,7 @@ import warnings import bigframes_vendored.pandas.io.gbq as vendored_pandas_gbq +import db_dtypes # type:ignore 
import google import google.cloud.bigquery as bigquery import numpy as np @@ -603,6 +604,154 @@ def test_read_gbq_external_table(session: bigframes.Session): assert df["i1"].max() == 99 +def test_read_gbq_w_json(session): + sql = """ + SELECT 0 AS id, JSON_OBJECT('boolean', True) AS json_col, + UNION ALL + SELECT 1, JSON_OBJECT('int', 100), + UNION ALL + SELECT 2, JSON_OBJECT('float', 0.98), + UNION ALL + SELECT 3, JSON_OBJECT('string', 'hello world'), + UNION ALL + SELECT 4, JSON_OBJECT('array', [8, 9, 10]), + UNION ALL + SELECT 5, JSON_OBJECT('null', null), + UNION ALL + SELECT 6, JSON_OBJECT('b', 2, 'a', 1), + UNION ALL + SELECT + 7, + JSON_OBJECT( + 'dict', + JSON_OBJECT( + 'int', 1, + 'array', [JSON_OBJECT('foo', 1), JSON_OBJECT('bar', 'hello')] + ) + ), + """ + # TODO(b/401630655): JSON is not compatible with allow_large_results=False + df = session.read_gbq(sql, index_col="id").to_pandas(allow_large_results=True) + + assert df.dtypes["json_col"] == pd.ArrowDtype(db_dtypes.JSONArrowType()) + + assert df["json_col"][0] == '{"boolean":true}' + assert df["json_col"][1] == '{"int":100}' + assert df["json_col"][2] == '{"float":0.98}' + assert df["json_col"][3] == '{"string":"hello world"}' + assert df["json_col"][4] == '{"array":[8,9,10]}' + assert df["json_col"][5] == '{"null":null}' + + # Verifies JSON strings preserve array order, regardless of dictionary key order. 
+ assert df["json_col"][6] == '{"a":1,"b":2}' + assert df["json_col"][7] == '{"dict":{"array":[{"foo":1},{"bar":"hello"}],"int":1}}' + + +def test_read_gbq_w_json_and_compare_w_pandas_json(session): + df = session.read_gbq("SELECT JSON_OBJECT('foo', 10, 'bar', TRUE) AS json_col") + assert df.dtypes["json_col"] == pd.ArrowDtype(db_dtypes.JSONArrowType()) + + # TODO(b/401630655): JSON is not compatible with allow_large_results=False + result = df.to_pandas(allow_large_results=True) + + # These JSON strings are compatible with BigQuery's JSON storage. + pd_df = pd.DataFrame( + {"json_col": ['{"bar":true,"foo":10}']}, + dtype=pd.ArrowDtype(db_dtypes.JSONArrowType()), + ) + pd_df.index = pd_df.index.astype("Int64") + pd.testing.assert_series_equal(result.dtypes, pd_df.dtypes) + pd.testing.assert_series_equal(result["json_col"], pd_df["json_col"]) + + +def test_read_gbq_w_json_in_struct(session): + """Avoid regressions for internal issue 381148539.""" + sql = """ + SELECT 0 AS id, STRUCT(JSON_OBJECT('boolean', True) AS data, 1 AS number) AS struct_col + UNION ALL + SELECT 1, STRUCT(JSON_OBJECT('int', 100), 2), + UNION ALL + SELECT 2, STRUCT(JSON_OBJECT('float', 0.98), 3), + UNION ALL + SELECT 3, STRUCT(JSON_OBJECT('string', 'hello world'), 4), + UNION ALL + SELECT 4, STRUCT(JSON_OBJECT('array', [8, 9, 10]), 5), + UNION ALL + SELECT 5, STRUCT(JSON_OBJECT('null', null), 6), + UNION ALL + SELECT + 6, + STRUCT(JSON_OBJECT( + 'dict', + JSON_OBJECT( + 'int', 1, + 'array', [JSON_OBJECT('foo', 1), JSON_OBJECT('bar', 'hello')] + ) + ), 7), + """ + df = session.read_gbq(sql, index_col="id") + + assert isinstance(df.dtypes["struct_col"], pd.ArrowDtype) + assert isinstance(df.dtypes["struct_col"].pyarrow_dtype, pa.StructType) + + data = df["struct_col"].struct.field("data") + assert data.dtype == pd.ArrowDtype(db_dtypes.JSONArrowType()) + + # TODO(b/401630655): JSON is not compatible with allow_large_results=False + data = data.to_pandas(allow_large_results=True) + + assert data[0]
== '{"boolean":true}' + assert data[1] == '{"int":100}' + assert data[2] == '{"float":0.98}' + assert data[3] == '{"string":"hello world"}' + assert data[4] == '{"array":[8,9,10]}' + assert data[5] == '{"null":null}' + assert data[6] == '{"dict":{"array":[{"foo":1},{"bar":"hello"}],"int":1}}' + + +def test_read_gbq_w_json_in_array(session): + sql = """ + SELECT + 0 AS id, + [ + JSON_OBJECT('boolean', True), + JSON_OBJECT('int', 100), + JSON_OBJECT('float', 0.98), + JSON_OBJECT('string', 'hello world'), + JSON_OBJECT('array', [8, 9, 10]), + JSON_OBJECT('null', null), + JSON_OBJECT( + 'dict', + JSON_OBJECT( + 'int', 1, + 'array', [JSON_OBJECT('bar', 'hello'), JSON_OBJECT('foo', 1)] + ) + ) + ] AS array_col, + """ + df = session.read_gbq(sql, index_col="id") + + assert isinstance(df.dtypes["array_col"], pd.ArrowDtype) + assert isinstance(df.dtypes["array_col"].pyarrow_dtype, pa.ListType) + + data = df["array_col"] + assert data.list.len()[0] == 7 + assert data.list[0].dtype == pd.ArrowDtype(db_dtypes.JSONArrowType()) + + # TODO(b/401630655): JSON is not compatible with allow_large_results=False + pd_data = data.to_pandas(allow_large_results=True) + + assert pd_data[0] == [ + '{"boolean":true}', + '{"int":100}', + '{"float":0.98}', + '{"string":"hello world"}', + '{"array":[8,9,10]}', + '{"null":null}', + '{"dict":{"array":[{"bar":"hello"},{"foo":1}],"int":1}}', + ] + + def test_read_gbq_model(session, penguins_linear_model_name): model = session.read_gbq_model(penguins_linear_model_name) assert isinstance(model, bigframes.ml.linear_model.LinearRegression) diff --git a/tests/unit/bigquery/__init__.py b/tests/unit/bigquery/__init__.py new file mode 100644 index 0000000000..0a2669d7a2 --- /dev/null +++ b/tests/unit/bigquery/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/unit/bigquery/test_json.py b/tests/unit/bigquery/test_json.py new file mode 100644 index 0000000000..d9beea26db --- /dev/null +++ b/tests/unit/bigquery/test_json.py @@ -0,0 +1,26 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest.mock as mock + +import pytest + +import bigframes.bigquery as bbq +import bigframes.pandas as bpd + + +def test_json_set_w_invalid_json_path_value_pairs(): + mock_series = mock.create_autospec(bpd.pandas.Series, instance=True) + with pytest.raises(ValueError, match="Incorrect format"): + bbq.json_set(mock_series, json_path_value_pairs=[("$.a", 1, 100)]) # type: ignore