From 4d3e4a30ef73df90c8d047843656c2abb6b2ac35 Mon Sep 17 00:00:00 2001 From: Daniela Date: Mon, 3 Mar 2025 22:06:19 +0000 Subject: [PATCH 01/15] feat: add Linear_Regression.global_explain() --- bigframes/ml/core.py | 11 +++++ bigframes/ml/linear_model.py | 45 +++++++++++++++++++ bigframes/ml/sql.py | 8 ++++ .../linear_regression_tutorial_test.py | 4 ++ 4 files changed, 68 insertions(+) diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index ad00ed3f2c..682fb6fcd7 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -134,6 +134,17 @@ def explain_predict( ), ) + def global_explain( + self, input_data: bpd.DataFrame, options: Mapping[str, bool] + ) -> bpd.DataFrame: + return self._apply_ml_tvf( + input_data, + lambda source_sql: self._model_manipulation_sql_generator.ml_global_explain( + source_sql=source_sql, + struct_options=options, + ), + ) + def transform(self, input_data: bpd.DataFrame) -> bpd.DataFrame: return self._apply_ml_tvf( input_data, diff --git a/bigframes/ml/linear_model.py b/bigframes/ml/linear_model.py index 46c5744a42..085ad89437 100644 --- a/bigframes/ml/linear_model.py +++ b/bigframes/ml/linear_model.py @@ -203,6 +203,51 @@ def predict_explain( X, options={"top_k_features": top_k_features} ) + def global_explain( + self, + X: utils.ArrayType, + *, + class_level_explain: bool = False, + ) -> bpd.DataFrame: + """ + Provide explanations for an entire linear regression model. + + .. note:: + Output matches that of the BigQuery ML.GLOBAL_PREDICT function. + See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-global-explain + + Args: + X (bigframes.dataframe.DataFrame or bigframes.series.Series or + pandas.core.frame.DataFrame or pandas.core.series.Series): + Series or a DataFrame to explain its predictions. + class_level_explain (bool, default False): + a BOOL value that specifies whether global feature importances + are returned for each class. 
Applies only to non-AutoML Tables + classification models. When set to FALSE, the global feature + importance of the entire model is returned rather than that of + each class. The default value is FALSE. + + Regression models and AutoML Tables classification models only + have model-level global feature importance. + + Returns: + bigframes.pandas.DataFrame: + The predicted DataFrames with feature and attribution columns. + """ + if class_level_explain is not True or False: + raise ValueError( + f"`class_level_explain` must be set to `True` or `False` but is currently {class_level_explain}" + ) + + if not self._bqml_model: + raise RuntimeError("A model must be fitted before predict") + + (X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session) + + return self._bqml_model.global_explain( + X, options={"class_level_explain": class_level_explain} + ) + def score( self, X: utils.ArrayType, diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index b662d4c22c..570c65d46c 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -312,6 +312,14 @@ def ml_explain_predict( return f"""SELECT * FROM ML.EXPLAIN_PREDICT(MODEL {self._model_ref_sql()}, ({source_sql}), {struct_options_sql})""" + def ml_global_explain( + self, source_sql: str, struct_options: Mapping[str, bool] + ) -> str: + """Encode ML.GLOBAL_EXPLAIN for BQML""" + struct_options_sql = self.struct_options(**struct_options) + return f"""SELECT * FROM ML.GLOBAL_EXPLAIN(MODEL {self._model_ref_sql()}, + ({source_sql}), {struct_options_sql})""" + def ml_forecast(self, struct_options: Mapping[str, Union[int, float]]) -> str: """Encode ML.FORECAST for BQML""" struct_options_sql = self.struct_options(**struct_options) diff --git a/samples/snippets/linear_regression_tutorial_test.py b/samples/snippets/linear_regression_tutorial_test.py index e4ace53a5c..03bce4ad93 100644 --- a/samples/snippets/linear_regression_tutorial_test.py +++ b/samples/snippets/linear_regression_tutorial_test.py @@ -92,6 
+92,10 @@ def test_linear_regression(random_model_id: str) -> None: # 3 5349.603734 [{'feature': 'island', 'attribution': 7348.877... -5320.222128 5349.603734 0.0 Gentoo penguin (Pygoscelis papua) Biscoe 46.4 15.6 221.0 5000.0 MALE # 4 4637.165037 [{'feature': 'island', 'attribution': 7348.877... -5320.222128 4637.165037 0.0 Gentoo penguin (Pygoscelis papua) Biscoe 46.1 13.2 211.0 4500.0 FEMALE # [END bigquery_dataframes_bqml_linear_predict_explain] + # [START bigquery_dataframes_bqml_linear_global_explain] + explain_model = model.global_explain(biscoe_data, class_level_explain=True) + # [END bigquery_dataframes_bqml_linear_global_explain] + assert explain_model is not None assert feature_columns is not None assert label_columns is not None assert model is not None From 87db2b72b505eb5729f7a6fd5c7f92e3c4b15877 Mon Sep 17 00:00:00 2001 From: Daniela Date: Tue, 4 Mar 2025 21:52:31 +0000 Subject: [PATCH 02/15] remove class_level_explain param --- bigframes/ml/core.py | 10 ++------ bigframes/ml/linear_model.py | 24 ++----------------- bigframes/ml/sql.py | 7 ++---- .../linear_regression_tutorial_test.py | 2 +- 4 files changed, 7 insertions(+), 36 deletions(-) diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index 682fb6fcd7..cc61554c29 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -134,15 +134,9 @@ def explain_predict( ), ) - def global_explain( - self, input_data: bpd.DataFrame, options: Mapping[str, bool] - ) -> bpd.DataFrame: + def global_explain(self, input_data: bpd.DataFrame) -> bpd.DataFrame: return self._apply_ml_tvf( - input_data, - lambda source_sql: self._model_manipulation_sql_generator.ml_global_explain( - source_sql=source_sql, - struct_options=options, - ), + input_data, self._model_manipulation_sql_generator.ml_global_explain ) def transform(self, input_data: bpd.DataFrame) -> bpd.DataFrame: diff --git a/bigframes/ml/linear_model.py b/bigframes/ml/linear_model.py index 085ad89437..bcf92f8a45 100644 --- 
a/bigframes/ml/linear_model.py +++ b/bigframes/ml/linear_model.py @@ -203,12 +203,7 @@ def predict_explain( X, options={"top_k_features": top_k_features} ) - def global_explain( - self, - X: utils.ArrayType, - *, - class_level_explain: bool = False, - ) -> bpd.DataFrame: + def global_explain(self, X: utils.ArrayType) -> bpd.DataFrame: """ Provide explanations for an entire linear regression model. @@ -220,33 +215,18 @@ def global_explain( X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): Series or a DataFrame to explain its predictions. - class_level_explain (bool, default False): - a BOOL value that specifies whether global feature importances - are returned for each class. Applies only to non-AutoML Tables - classification models. When set to FALSE, the global feature - importance of the entire model is returned rather than that of - each class. The default value is FALSE. - - Regression models and AutoML Tables classification models only - have model-level global feature importance. Returns: bigframes.pandas.DataFrame: The predicted DataFrames with feature and attribution columns. 
""" - if class_level_explain is not True or False: - raise ValueError( - f"`class_level_explain` must be set to `True` or `False` but is currently {class_level_explain}" - ) if not self._bqml_model: raise RuntimeError("A model must be fitted before predict") (X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session) - return self._bqml_model.global_explain( - X, options={"class_level_explain": class_level_explain} - ) + return self._bqml_model.global_explain(X) def score( self, diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index 570c65d46c..a750d22173 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -312,13 +312,10 @@ def ml_explain_predict( return f"""SELECT * FROM ML.EXPLAIN_PREDICT(MODEL {self._model_ref_sql()}, ({source_sql}), {struct_options_sql})""" - def ml_global_explain( - self, source_sql: str, struct_options: Mapping[str, bool] - ) -> str: + def ml_global_explain(self, source_sql: str) -> str: """Encode ML.GLOBAL_EXPLAIN for BQML""" - struct_options_sql = self.struct_options(**struct_options) return f"""SELECT * FROM ML.GLOBAL_EXPLAIN(MODEL {self._model_ref_sql()}, - ({source_sql}), {struct_options_sql})""" + ({source_sql}))""" def ml_forecast(self, struct_options: Mapping[str, Union[int, float]]) -> str: """Encode ML.FORECAST for BQML""" diff --git a/samples/snippets/linear_regression_tutorial_test.py b/samples/snippets/linear_regression_tutorial_test.py index 03bce4ad93..9e21c33046 100644 --- a/samples/snippets/linear_regression_tutorial_test.py +++ b/samples/snippets/linear_regression_tutorial_test.py @@ -93,7 +93,7 @@ def test_linear_regression(random_model_id: str) -> None: # 4 4637.165037 [{'feature': 'island', 'attribution': 7348.877... 
-5320.222128 4637.165037 0.0 Gentoo penguin (Pygoscelis papua) Biscoe 46.1 13.2 211.0 4500.0 FEMALE # [END bigquery_dataframes_bqml_linear_predict_explain] # [START bigquery_dataframes_bqml_linear_global_explain] - explain_model = model.global_explain(biscoe_data, class_level_explain=True) + explain_model = model.global_explain(label_columns["body_mass_g"]) # [END bigquery_dataframes_bqml_linear_global_explain] assert explain_model is not None assert feature_columns is not None From 82a234a8e99b33cd547c54ff82263d0ff5a46d09 Mon Sep 17 00:00:00 2001 From: Daniela Date: Tue, 11 Mar 2025 19:42:00 +0000 Subject: [PATCH 03/15] working global_explain() --- bigframes/ml/core.py | 9 ++++++--- bigframes/ml/linear_model.py | 8 ++++---- bigframes/ml/sql.py | 5 +++-- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index cc61554c29..5cfb457eb2 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -134,10 +134,13 @@ def explain_predict( ), ) - def global_explain(self, input_data: bpd.DataFrame) -> bpd.DataFrame: - return self._apply_ml_tvf( - input_data, self._model_manipulation_sql_generator.ml_global_explain + def global_explain(self, options: Mapping[str, bool]) -> bpd.DataFrame: + sql = self._model_manipulation_sql_generator.ml_global_explain( + struct_options=options ) + return self._session.read_gbq( + sql, + ).reset_index() def transform(self, input_data: bpd.DataFrame) -> bpd.DataFrame: return self._apply_ml_tvf( diff --git a/bigframes/ml/linear_model.py b/bigframes/ml/linear_model.py index bcf92f8a45..984b333d1c 100644 --- a/bigframes/ml/linear_model.py +++ b/bigframes/ml/linear_model.py @@ -203,7 +203,9 @@ def predict_explain( X, options={"top_k_features": top_k_features} ) - def global_explain(self, X: utils.ArrayType) -> bpd.DataFrame: + def global_explain( + self, + ) -> bpd.DataFrame: """ Provide explanations for an entire linear regression model. 
@@ -224,9 +226,7 @@ def global_explain(self, X: utils.ArrayType) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("A model must be fitted before predict") - (X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session) - - return self._bqml_model.global_explain(X) + return self._bqml_model.global_explain({}) def score( self, diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index a750d22173..e89f17bcaa 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -312,10 +312,11 @@ def ml_explain_predict( return f"""SELECT * FROM ML.EXPLAIN_PREDICT(MODEL {self._model_ref_sql()}, ({source_sql}), {struct_options_sql})""" - def ml_global_explain(self, source_sql: str) -> str: + def ml_global_explain(self, struct_options) -> str: """Encode ML.GLOBAL_EXPLAIN for BQML""" + struct_options_sql = self.struct_options(**struct_options) return f"""SELECT * FROM ML.GLOBAL_EXPLAIN(MODEL {self._model_ref_sql()}, - ({source_sql}))""" + {struct_options_sql})""" def ml_forecast(self, struct_options: Mapping[str, Union[int, float]]) -> str: """Encode ML.FORECAST for BQML""" From ed73f88066f0b260496ba63a8b4a928acf632f47 Mon Sep 17 00:00:00 2001 From: Daniela Date: Tue, 11 Mar 2025 20:44:21 +0000 Subject: [PATCH 04/15] begin adding tests --- tests/system/small/ml/conftest.py | 9 +++++++++ tests/system/small/ml/test_linear_model.py | 21 +++++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/tests/system/small/ml/conftest.py b/tests/system/small/ml/conftest.py index 0e8489c513..dd3fbbf37f 100644 --- a/tests/system/small/ml/conftest.py +++ b/tests/system/small/ml/conftest.py @@ -84,6 +84,15 @@ def ephemera_penguins_linear_model( return bf_model +@pytest.fixture(scope="function") +def global_penguins_linear_model( + penguins_bqml_linear_model: core.BqmlModel, +) -> linear_model.LinearRegression: + bf_model = linear_model.LinearRegression(enable_global_explain=True) + bf_model._bqml_model = penguins_bqml_linear_model + return bf_model + + 
@pytest.fixture(scope="session") def penguins_logistic_model( session, penguins_logistic_model_name diff --git a/tests/system/small/ml/test_linear_model.py b/tests/system/small/ml/test_linear_model.py index da9fc8e14f..c51935b7eb 100644 --- a/tests/system/small/ml/test_linear_model.py +++ b/tests/system/small/ml/test_linear_model.py @@ -228,6 +228,27 @@ def test_to_gbq_saved_linear_reg_model_scores( ) +def test_linear_reg_model_global_explain(global_penguins_linear_model, new_penguins_df): + training_data = new_penguins_df.dropna(subset=["body_mass_g"]) + X = training_data.drop(columns=["body_mass_g"]) + y = training_data[["body_mass_g"]] + global_penguins_linear_model.fit(X, y) + global_ex = global_penguins_linear_model.global_explain() + assert global_ex.shape == (6, 3) + # result = predictions[["predicted_body_mass_g"]] + # expected = pandas.DataFrame( + # {"predicted_body_mass_g": [4030.1, 3280.8, 3177.9]}, + # dtype="Float64", + # index=pandas.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + # ) + # pandas.testing.assert_frame_equal( + # result.sort_index(), + # expected, + # check_exact=False, + # rtol=0.1, + # ) + + def test_to_gbq_replace(penguins_linear_model, table_id_unique): penguins_linear_model.to_gbq(table_id_unique, replace=True) with pytest.raises(google.api_core.exceptions.Conflict): From 47b98627527099a8bcd02a1e783eea8b74bdc531 Mon Sep 17 00:00:00 2001 From: Daniela Date: Wed, 12 Mar 2025 12:59:41 +0000 Subject: [PATCH 05/15] update snippet --- samples/snippets/linear_regression_tutorial_test.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/samples/snippets/linear_regression_tutorial_test.py b/samples/snippets/linear_regression_tutorial_test.py index 9e21c33046..501dc8f446 100644 --- a/samples/snippets/linear_regression_tutorial_test.py +++ b/samples/snippets/linear_regression_tutorial_test.py @@ -93,7 +93,13 @@ def test_linear_regression(random_model_id: str) -> None: # 4 4637.165037 [{'feature': 
'island', 'attribution': 7348.877... -5320.222128 4637.165037 0.0 Gentoo penguin (Pygoscelis papua) Biscoe 46.1 13.2 211.0 4500.0 FEMALE # [END bigquery_dataframes_bqml_linear_predict_explain] # [START bigquery_dataframes_bqml_linear_global_explain] - explain_model = model.global_explain(label_columns["body_mass_g"]) + model = LinearRegression(enable_global_explain=True) + training_data = bq_df.dropna(subset=["body_mass_g"]) + X = training_data.drop(columns=["body_mass_g"]) + y = training_data[["body_mass_g"]] + model.fit(X, y) + model.to_gbq("bqml_tutorial.penguins_model", replace=True) + explain_model = model.global_explain() # [END bigquery_dataframes_bqml_linear_global_explain] assert explain_model is not None assert feature_columns is not None From 7046dc3fc7b90bfeeae3cde42a10bc00f217ec3e Mon Sep 17 00:00:00 2001 From: Daniela Date: Wed, 12 Mar 2025 21:14:30 +0000 Subject: [PATCH 06/15] complete snippet --- .../snippets/linear_regression_tutorial_test.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/samples/snippets/linear_regression_tutorial_test.py b/samples/snippets/linear_regression_tutorial_test.py index 501dc8f446..4cc385e97e 100644 --- a/samples/snippets/linear_regression_tutorial_test.py +++ b/samples/snippets/linear_regression_tutorial_test.py @@ -93,13 +93,28 @@ def test_linear_regression(random_model_id: str) -> None: # 4 4637.165037 [{'feature': 'island', 'attribution': 7348.877... -5320.222128 4637.165037 0.0 Gentoo penguin (Pygoscelis papua) Biscoe 46.1 13.2 211.0 4500.0 FEMALE # [END bigquery_dataframes_bqml_linear_predict_explain] # [START bigquery_dataframes_bqml_linear_global_explain] + # To use the `global_explain()` function, the model must be recreated with `enable_global_explain` set to `True`. model = LinearRegression(enable_global_explain=True) + + # The model must then be fitted before it can be saved to BigQuery and then explained. 
training_data = bq_df.dropna(subset=["body_mass_g"]) X = training_data.drop(columns=["body_mass_g"]) y = training_data[["body_mass_g"]] model.fit(X, y) model.to_gbq("bqml_tutorial.penguins_model", replace=True) + + # Explain the model explain_model = model.global_explain() + + # Expected results: + # index feature attribution + # 0 0 flipper_length_mm 193.612051 + # 1 1 sex 5139.35423 + # 2 2 culmen_depth_mm 117.084944 + # 3 3 species 4259.554372 + # 4 4 island 7330.53279 + # 5 5 culmen_length_mm 94.366793 + # [END bigquery_dataframes_bqml_linear_global_explain] assert explain_model is not None assert feature_columns is not None From b0b9552c63b49d734062b4af7f49c267be2991d7 Mon Sep 17 00:00:00 2001 From: Daniela Date: Wed, 12 Mar 2025 22:54:40 +0000 Subject: [PATCH 07/15] failing, near complete linear model test --- tests/system/small/ml/test_linear_model.py | 34 ++++++++++++++-------- 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/tests/system/small/ml/test_linear_model.py b/tests/system/small/ml/test_linear_model.py index c51935b7eb..dea1a38c74 100644 --- a/tests/system/small/ml/test_linear_model.py +++ b/tests/system/small/ml/test_linear_model.py @@ -235,18 +235,28 @@ def test_linear_reg_model_global_explain(global_penguins_linear_model, new_pengu global_penguins_linear_model.fit(X, y) global_ex = global_penguins_linear_model.global_explain() assert global_ex.shape == (6, 3) - # result = predictions[["predicted_body_mass_g"]] - # expected = pandas.DataFrame( - # {"predicted_body_mass_g": [4030.1, 3280.8, 3177.9]}, - # dtype="Float64", - # index=pandas.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), - # ) - # pandas.testing.assert_frame_equal( - # result.sort_index(), - # expected, - # check_exact=False, - # rtol=0.1, - # ) + global_columns = set(global_ex.columns) + expected_columns = {"index", "feature", "attribution"} + assert expected_columns <= global_columns + result = global_ex["attribution"].to_pandas() + expected = 
pandas.DataFrame( + { + "attribution": [ + 193.612051, + 5139.35423, + 117.084944, + 4259.554372, + 7330.53279, + 94.366793, + ] + }, + dtype="Float64", + ) + pandas.testing.assert_frame_equal( + result, + expected, + check_exact=False, + ) def test_to_gbq_replace(penguins_linear_model, table_id_unique): From 1ad520833c1daf9ab4234c8b29f3242031d9a6cf Mon Sep 17 00:00:00 2001 From: Daniela Date: Fri, 14 Mar 2025 17:34:11 +0000 Subject: [PATCH 08/15] passing system test --- tests/system/small/ml/test_linear_model.py | 41 +++++++++++++--------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/tests/system/small/ml/test_linear_model.py b/tests/system/small/ml/test_linear_model.py index dea1a38c74..a3add631d4 100644 --- a/tests/system/small/ml/test_linear_model.py +++ b/tests/system/small/ml/test_linear_model.py @@ -235,27 +235,34 @@ def test_linear_reg_model_global_explain(global_penguins_linear_model, new_pengu global_penguins_linear_model.fit(X, y) global_ex = global_penguins_linear_model.global_explain() assert global_ex.shape == (6, 3) - global_columns = set(global_ex.columns) - expected_columns = {"index", "feature", "attribution"} - assert expected_columns <= global_columns - result = global_ex["attribution"].to_pandas() - expected = pandas.DataFrame( - { - "attribution": [ - 193.612051, - 5139.35423, - 117.084944, - 4259.554372, - 7330.53279, - 94.366793, - ] - }, - dtype="Float64", + expected_columns = pandas.Index(["index", "feature", "attribution"]) + pandas.testing.assert_index_equal(global_ex.columns, expected_columns) + result = global_ex[["feature"]].to_pandas().set_index("feature").sort_index() + features = pandas.Series( + [ + "flipper_length_mm", + "species", + "sex", + "culmen_depth_mm", + "culmen_length_mm", + "island", + ], + dtype=pandas.StringDtype(storage="pyarrow"), + ) + expected_feature = ( + pandas.DataFrame( + { + "feature": features, + } + ) + .set_index("feature") + .sort_index() ) pandas.testing.assert_frame_equal( result, - 
expected, + expected_feature, check_exact=False, + check_index_type=False, ) From a600539353bc3a60d6b3b8d6fba39e767cee15b0 Mon Sep 17 00:00:00 2001 From: rey-esp Date: Mon, 17 Mar 2025 12:30:09 -0500 Subject: [PATCH 09/15] Update core.py - set index to have sorted by feature --- bigframes/ml/core.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index 5cfb457eb2..fd77f09282 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -138,9 +138,7 @@ def global_explain(self, options: Mapping[str, bool]) -> bpd.DataFrame: sql = self._model_manipulation_sql_generator.ml_global_explain( struct_options=options ) - return self._session.read_gbq( - sql, - ).reset_index() + return self._session.read_gbq(sql).sort_values(by='attribution', ascending=False).set_index("feature") def transform(self, input_data: bpd.DataFrame) -> bpd.DataFrame: return self._apply_ml_tvf( From 7fc0cc6d29c99a1d42ff5634f3730687c4791240 Mon Sep 17 00:00:00 2001 From: rey-esp Date: Mon, 17 Mar 2025 15:13:04 -0500 Subject: [PATCH 10/15] Update test_linear_model.py - remove set/set index --- tests/system/small/ml/test_linear_model.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/system/small/ml/test_linear_model.py b/tests/system/small/ml/test_linear_model.py index a3add631d4..ba0968c9bd 100644 --- a/tests/system/small/ml/test_linear_model.py +++ b/tests/system/small/ml/test_linear_model.py @@ -237,7 +237,7 @@ def test_linear_reg_model_global_explain(global_penguins_linear_model, new_pengu assert global_ex.shape == (6, 3) expected_columns = pandas.Index(["index", "feature", "attribution"]) pandas.testing.assert_index_equal(global_ex.columns, expected_columns) - result = global_ex[["feature"]].to_pandas().set_index("feature").sort_index() + result = global_ex[["feature"]].to_pandas() features = pandas.Series( [ "flipper_length_mm", @@ -255,8 +255,6 @@ def 
test_linear_reg_model_global_explain(global_penguins_linear_model, new_pengu "feature": features, } ) - .set_index("feature") - .sort_index() ) pandas.testing.assert_frame_equal( result, From 57c3d4a0e4fa854e9883be2c4e177f9f74ac58ba Mon Sep 17 00:00:00 2001 From: rey-esp Date: Mon, 17 Mar 2025 15:42:44 -0500 Subject: [PATCH 11/15] Update linear_model.py - fix doc section --- bigframes/ml/linear_model.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/bigframes/ml/linear_model.py b/bigframes/ml/linear_model.py index 984b333d1c..ae9fd8db13 100644 --- a/bigframes/ml/linear_model.py +++ b/bigframes/ml/linear_model.py @@ -210,14 +210,9 @@ def global_explain( Provide explanations for an entire linear regression model. .. note:: - Output matches that of the BigQuery ML.GLOBAL_PREDICT function. + Output matches that of the BigQuery ML.GLOBAL_EXPLAIN function. See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-global-explain - Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series or - pandas.core.frame.DataFrame or pandas.core.series.Series): - Series or a DataFrame to explain its predictions. - Returns: bigframes.pandas.DataFrame: The predicted DataFrames with feature and attribution columns. 
From c2c08377ede5dba1e10c2a8d2b8e1085fd6b3871 Mon Sep 17 00:00:00 2001 From: rey-esp Date: Mon, 17 Mar 2025 15:44:23 -0500 Subject: [PATCH 12/15] Update conftest.py - rename penguins w global explain --- tests/system/small/ml/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/system/small/ml/conftest.py b/tests/system/small/ml/conftest.py index dd3fbbf37f..2b9392f523 100644 --- a/tests/system/small/ml/conftest.py +++ b/tests/system/small/ml/conftest.py @@ -85,7 +85,7 @@ def ephemera_penguins_linear_model( @pytest.fixture(scope="function") -def global_penguins_linear_model( +def penguins_linear_model_w_global_explain( penguins_bqml_linear_model: core.BqmlModel, ) -> linear_model.LinearRegression: bf_model = linear_model.LinearRegression(enable_global_explain=True) From 3a0c6b97c0e622b52d8787f604e60ac688810dbd Mon Sep 17 00:00:00 2001 From: rey-esp Date: Mon, 17 Mar 2025 16:39:01 -0500 Subject: [PATCH 13/15] Update linear_model.py - complete doc --- bigframes/ml/linear_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/ml/linear_model.py b/bigframes/ml/linear_model.py index ae9fd8db13..3774a62c0c 100644 --- a/bigframes/ml/linear_model.py +++ b/bigframes/ml/linear_model.py @@ -215,7 +215,7 @@ def global_explain( Returns: bigframes.pandas.DataFrame: - The predicted DataFrames with feature and attribution columns. + Dataframes containing feature importance values and corresponding attributions, designed to provide a global explanation of feature influence. 
""" if not self._bqml_model: From 5dac41d21464d37571ad40f16c437f27cd5c2f6b Mon Sep 17 00:00:00 2001 From: Daniela Date: Mon, 17 Mar 2025 21:49:01 +0000 Subject: [PATCH 14/15] lint --- bigframes/ml/core.py | 6 +++++- tests/system/small/ml/test_linear_model.py | 10 ++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index fd77f09282..01917fd6d8 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -138,7 +138,11 @@ def global_explain(self, options: Mapping[str, bool]) -> bpd.DataFrame: sql = self._model_manipulation_sql_generator.ml_global_explain( struct_options=options ) - return self._session.read_gbq(sql).sort_values(by='attribution', ascending=False).set_index("feature") + return ( + self._session.read_gbq(sql) + .sort_values(by="attribution", ascending=False) + .set_index("feature") + ) def transform(self, input_data: bpd.DataFrame) -> bpd.DataFrame: return self._apply_ml_tvf( diff --git a/tests/system/small/ml/test_linear_model.py b/tests/system/small/ml/test_linear_model.py index ba0968c9bd..efb4c3c807 100644 --- a/tests/system/small/ml/test_linear_model.py +++ b/tests/system/small/ml/test_linear_model.py @@ -249,12 +249,10 @@ def test_linear_reg_model_global_explain(global_penguins_linear_model, new_pengu ], dtype=pandas.StringDtype(storage="pyarrow"), ) - expected_feature = ( - pandas.DataFrame( - { - "feature": features, - } - ) + expected_feature = pandas.DataFrame( + { + "feature": features, + } ) pandas.testing.assert_frame_equal( result, From e5f4aad904106841cf9b72a25d10d4aa105e7d22 Mon Sep 17 00:00:00 2001 From: Daniela Date: Tue, 18 Mar 2025 15:20:13 +0000 Subject: [PATCH 15/15] passing test and fixed expected results --- .../linear_regression_tutorial_test.py | 16 +++---- tests/system/small/ml/test_linear_model.py | 44 ++++++++++--------- 2 files changed, 31 insertions(+), 29 deletions(-) diff --git a/samples/snippets/linear_regression_tutorial_test.py 
b/samples/snippets/linear_regression_tutorial_test.py index 4cc385e97e..8fc1c5ad61 100644 --- a/samples/snippets/linear_regression_tutorial_test.py +++ b/samples/snippets/linear_regression_tutorial_test.py @@ -107,14 +107,14 @@ def test_linear_regression(random_model_id: str) -> None: explain_model = model.global_explain() # Expected results: - # index feature attribution - # 0 0 flipper_length_mm 193.612051 - # 1 1 sex 5139.35423 - # 2 2 culmen_depth_mm 117.084944 - # 3 3 species 4259.554372 - # 4 4 island 7330.53279 - # 5 5 culmen_length_mm 94.366793 - + # attribution + # feature + # island 5737.315921 + # species 4073.280549 + # sex 622.070896 + # flipper_length_mm 193.612051 + # culmen_depth_mm 117.084944 + # culmen_length_mm 94.366793 # [END bigquery_dataframes_bqml_linear_global_explain] assert explain_model is not None assert feature_columns is not None diff --git a/tests/system/small/ml/test_linear_model.py b/tests/system/small/ml/test_linear_model.py index efb4c3c807..8b04d55e61 100644 --- a/tests/system/small/ml/test_linear_model.py +++ b/tests/system/small/ml/test_linear_model.py @@ -228,31 +228,33 @@ def test_to_gbq_saved_linear_reg_model_scores( ) -def test_linear_reg_model_global_explain(global_penguins_linear_model, new_penguins_df): +def test_linear_reg_model_global_explain( + penguins_linear_model_w_global_explain, new_penguins_df +): training_data = new_penguins_df.dropna(subset=["body_mass_g"]) X = training_data.drop(columns=["body_mass_g"]) y = training_data[["body_mass_g"]] - global_penguins_linear_model.fit(X, y) - global_ex = global_penguins_linear_model.global_explain() - assert global_ex.shape == (6, 3) - expected_columns = pandas.Index(["index", "feature", "attribution"]) + penguins_linear_model_w_global_explain.fit(X, y) + global_ex = penguins_linear_model_w_global_explain.global_explain() + assert global_ex.shape == (6, 1) + expected_columns = pandas.Index(["attribution"]) pandas.testing.assert_index_equal(global_ex.columns, 
expected_columns) - result = global_ex[["feature"]].to_pandas() - features = pandas.Series( - [ - "flipper_length_mm", - "species", - "sex", - "culmen_depth_mm", - "culmen_length_mm", - "island", - ], - dtype=pandas.StringDtype(storage="pyarrow"), - ) - expected_feature = pandas.DataFrame( - { - "feature": features, - } + result = global_ex.to_pandas().drop(["attribution"], axis=1).sort_index() + expected_feature = ( + pandas.DataFrame( + { + "feature": [ + "island", + "species", + "sex", + "flipper_length_mm", + "culmen_depth_mm", + "culmen_length_mm", + ] + }, + ) + .set_index("feature") + .sort_index() ) pandas.testing.assert_frame_equal( result,