From 78f3576aaa94c5b643c3c6bcdb5922789923c758 Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Sun, 22 Oct 2023 23:36:57 -0700 Subject: [PATCH 1/8] Fix breaking kwarg --- modelbuilders_bench/lgbm_mb.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/modelbuilders_bench/lgbm_mb.py b/modelbuilders_bench/lgbm_mb.py index f263d419c..7ddc6a68c 100644 --- a/modelbuilders_bench/lgbm_mb.py +++ b/modelbuilders_bench/lgbm_mb.py @@ -118,8 +118,7 @@ t_train, model_lgbm = bench.measure_function_time(lgbm.train, lgbm_params, lgbm_train, params=params, num_boost_round=params.n_estimators, - valid_sets=lgbm_train, - verbose_eval=False) + valid_sets=lgbm_train) train_metric = None if not X_train.equals(X_test): y_train_pred = model_lgbm.predict(X_train) From b3b3a538ad6e49b7221aca15ff49456de59f16df Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Sun, 22 Oct 2023 23:37:13 -0700 Subject: [PATCH 2/8] Add SHAP calculation measurements --- modelbuilders_bench/xgb_mb.py | 391 +++++++++++++++++++++++----------- 1 file changed, 269 insertions(+), 122 deletions(-) diff --git a/modelbuilders_bench/xgb_mb.py b/modelbuilders_bench/xgb_mb.py index 75da615b8..b3199901e 100644 --- a/modelbuilders_bench/xgb_mb.py +++ b/modelbuilders_bench/xgb_mb.py @@ -1,5 +1,5 @@ -# =============================================================================== -# Copyright 2020-2021 Intel Corporation +# ============================================================================== +# Copyright 2020-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# =============================================================================== +# ============================================================================== import argparse @@ -27,120 +27,189 @@ def convert_probs_to_classes(y_prob): def convert_xgb_predictions(y_pred, objective): - if objective == 'multi:softprob': + if objective == "multi:softprob": y_pred = convert_probs_to_classes(y_pred) - elif objective == 'binary:logistic': + elif objective == "binary:logistic": y_pred = (y_pred >= 0.5).astype(np.int32) return y_pred +def shap_accuracy(new, ref, threshold=1e-5): + new_sh = new.reshape(-1, ) + ref_sh = ref.reshape(-1, ) + diff = np.abs(new_sh - ref_sh) + return (diff < threshold).sum() / float(len(ref_sh)) + + parser = argparse.ArgumentParser( - description='xgboost gbt + model transform + daal predict benchmark') - -parser.add_argument('--colsample-bytree', type=float, default=1, - help='Subsample ratio of columns ' - 'when constructing each tree') -parser.add_argument('--count-dmatrix', default=False, action='store_true', - help='Count DMatrix creation in time measurements') -parser.add_argument('--enable-experimental-json-serialization', default=True, - choices=('True', 'False'), help='Use JSON to store memory snapshots') -parser.add_argument('--grow-policy', type=str, default='depthwise', - help='Controls a way new nodes are added to the tree') -parser.add_argument('--inplace-predict', default=False, action='store_true', - help='Perform inplace_predict instead of default') -parser.add_argument('--learning-rate', '--eta', type=float, default=0.3, - help='Step size shrinkage used in update ' - 'to prevents overfitting') -parser.add_argument('--max-bin', type=int, default=256, - help='Maximum number of discrete bins to ' - 'bucket continuous features') -parser.add_argument('--max-delta-step', type=float, default=0, - help='Maximum delta step we allow each leaf output to be') -parser.add_argument('--max-depth', type=int, default=6, - help='Maximum depth of a tree') -parser.add_argument('--max-leaves', type=int, default=0, - help='Maximum number of nodes to be added') -parser.add_argument('--min-child-weight', type=float, default=1, - help='Minimum sum of instance weight needed in a child') -parser.add_argument('--min-split-loss', '--gamma', type=float, default=0, - help='Minimum loss reduction required to make' - ' partition on a leaf node') -parser.add_argument('--n-estimators', type=int, default=100, - help='Number of gradient boosted trees') -parser.add_argument('--objective', type=str, required=True, - choices=('reg:squarederror', 'binary:logistic', - 'multi:softmax', 'multi:softprob'), - help='Control a balance of positive and negative weights') -parser.add_argument('--reg-alpha', type=float, default=0, - help='L1 regularization term on weights') -parser.add_argument('--reg-lambda', type=float, default=1, - help='L2 regularization term on weights') -parser.add_argument('--scale-pos-weight', type=float, default=1, - help='Controls a balance of positive and negative weights') -parser.add_argument('--single-precision-histogram', default=False, action='store_true', - help='Build histograms instead of double precision') -parser.add_argument('--subsample', type=float, default=1, - help='Subsample ratio of the training instances') -parser.add_argument('--tree-method', type=str, required=True, - help='The tree construction algorithm used in XGBoost') + description="xgboost gbt + model transform + daal predict benchmark" +) + +parser.add_argument( + "--colsample-bytree", + type=float, + default=1, + help="Subsample ratio of columns " "when constructing each tree", +) +parser.add_argument( + "--count-dmatrix", + default=False, + action="store_true", + help="Count DMatrix creation in time measurements", +) +parser.add_argument( + "--enable-experimental-json-serialization", + default=True, + choices=("True", "False"), + help="Use JSON to store memory snapshots", +) +parser.add_argument( + "--grow-policy", + type=str, + default="depthwise", + help="Controls a way new nodes are added to the tree", +) +parser.add_argument( + "--inplace-predict", + default=False, + action="store_true", + help="Perform inplace_predict instead of default", +) +parser.add_argument( + "--learning-rate", + "--eta", + type=float, + default=0.3, + help="Step size shrinkage used in update " "to prevents overfitting", +) +parser.add_argument( + "--max-bin", + type=int, + default=256, + help="Maximum number of discrete bins to " "bucket continuous features", +) +parser.add_argument( + "--max-delta-step", + type=float, + default=0, + help="Maximum delta step we allow each leaf output to be", +) +parser.add_argument("--max-depth", type=int, default=6, help="Maximum depth of a tree") +parser.add_argument( + "--max-leaves", type=int, default=0, help="Maximum number of nodes to be added" +) +parser.add_argument( + "--min-child-weight", + type=float, + default=1, + help="Minimum sum of instance weight needed in a child", +) +parser.add_argument( + "--min-split-loss", + "--gamma", + type=float, + default=0, + help="Minimum loss reduction required to make" " partition on a leaf node", +) +parser.add_argument( + "--n-estimators", type=int, default=100, help="Number of gradient boosted trees" +) +parser.add_argument( + "--objective", + type=str, + required=True, + choices=("reg:squarederror", "binary:logistic", "multi:softmax", "multi:softprob"), + help="Control a balance of positive and negative weights", +) +parser.add_argument( + "--reg-alpha", type=float, default=0, help="L1 regularization term on weights" +) +parser.add_argument( + "--reg-lambda", type=float, default=1, help="L2 regularization term on weights" +) +parser.add_argument( + "--scale-pos-weight", + type=float, + default=1, + help="Controls a balance of positive and negative weights", +) +parser.add_argument( + "--single-precision-histogram", + default=False, + action="store_true", + help="Build histograms instead of double precision", +) +parser.add_argument( + "--subsample", + type=float, + default=1, + help="Subsample ratio of the training instances", +) +parser.add_argument( + "--tree-method", + type=str, + required=True, + help="The tree construction algorithm used in XGBoost", +) params = bench.parse_args(parser) X_train, X_test, y_train, y_test = bench.load_data(params) xgb_params = { - 'booster': 'gbtree', - 'verbosity': 0, - 'learning_rate': params.learning_rate, - 'min_split_loss': params.min_split_loss, - 'max_depth': params.max_depth, - 'min_child_weight': params.min_child_weight, - 'max_delta_step': params.max_delta_step, - 'subsample': params.subsample, - 'sampling_method': 'uniform', - 'colsample_bytree': params.colsample_bytree, - 'colsample_bylevel': 1, - 'colsample_bynode': 1, - 'reg_lambda': params.reg_lambda, - 'reg_alpha': params.reg_alpha, - 'tree_method': params.tree_method, - 'scale_pos_weight': params.scale_pos_weight, - 'grow_policy': params.grow_policy, - 'max_leaves': params.max_leaves, - 'max_bin': params.max_bin, - 'objective': params.objective, - 'seed': params.seed, - 'single_precision_histogram': params.single_precision_histogram, - 'enable_experimental_json_serialization': - params.enable_experimental_json_serialization + "booster": "gbtree", + "verbosity": 0, + "learning_rate": params.learning_rate, + "min_split_loss": params.min_split_loss, + "max_depth": params.max_depth, + "min_child_weight": params.min_child_weight, + "max_delta_step": params.max_delta_step, + "subsample": params.subsample, + "sampling_method": "uniform", + "colsample_bytree": params.colsample_bytree, + "colsample_bylevel": 1, + "colsample_bynode": 1, + "reg_lambda": params.reg_lambda, + "reg_alpha": params.reg_alpha, + "tree_method": params.tree_method, + "scale_pos_weight": params.scale_pos_weight, + "grow_policy": params.grow_policy, + "max_leaves": params.max_leaves, + "max_bin": params.max_bin, + "objective": params.objective, + "seed": params.seed, + "single_precision_histogram": params.single_precision_histogram, + "enable_experimental_json_serialization": params.enable_experimental_json_serialization, } if params.threads != -1: - xgb_params.update({'nthread': params.threads}) + xgb_params.update({"nthread": params.threads}) -if params.objective.startswith('reg'): - task = 'regression' - metric_name, metric_func = 'rmse', bench.rmse_score +if params.objective.startswith("reg"): + task = "regression" + metric_name, metric_func = "rmse", bench.rmse_score else: - task = 'classification' - metric_name = 'accuracy' + task = "classification" + metric_name = "accuracy" metric_func = bench.accuracy_score - if 'cudf' in str(type(y_train)): + if "cudf" in str(type(y_train)): params.n_classes = y_train[y_train.columns[0]].nunique() else: params.n_classes = len(np.unique(y_train)) # Covtype has one class more than there is in train - if params.dataset_name == 'covtype': + if params.dataset_name == "covtype": params.n_classes += 1 if params.n_classes > 2: - xgb_params['num_class'] = params.n_classes + xgb_params["num_class"] = params.n_classes -t_creat_train, dtrain = bench.measure_function_time(xgb.DMatrix, X_train, - params=params, label=y_train) +t_creat_train, dtrain = bench.measure_function_time( + xgb.DMatrix, X_train, params=params, label=y_train +) t_creat_test, dtest = bench.measure_function_time( - xgb.DMatrix, X_test, params=params, label=y_test) + xgb.DMatrix, X_test, params=params, label=y_test +) def fit(dmatrix): @@ -150,52 +219,130 @@ def fit(dmatrix): if params.inplace_predict: + def predict(*args): - return booster.inplace_predict(np.ascontiguousarray(X_test.values, - dtype=np.float32)) + return booster.inplace_predict( + np.ascontiguousarray(X_test.values, dtype=np.float32) + ) + else: - def predict(dmatrix): # type: ignore + + def predict(dmatrix, **kwargs): # type: ignore if dmatrix is None: dmatrix = xgb.DMatrix(X_test, y_test) - return booster.predict(dmatrix) + return booster.predict(dmatrix, **kwargs) fit_time, booster = bench.measure_function_time( - fit, None if params.count_dmatrix else dtrain, params=params) + fit, None if params.count_dmatrix else dtrain, params=params +) train_metric = metric_func( - convert_xgb_predictions( - booster.predict(dtrain), - params.objective), - y_train) + convert_xgb_predictions(booster.predict(dtrain), params.objective), y_train +) predict_time, y_pred = bench.measure_function_time( - predict, None if params.inplace_predict or params.count_dmatrix else dtest, params=params) + predict, + None if params.inplace_predict or params.count_dmatrix else dtest, + params=params, +) test_metric = metric_func(convert_xgb_predictions(y_pred, params.objective), y_test) +shap_contrib_time, shap_contribs = bench.measure_function_time( + predict, dtest, pred_contribs=True, params=params +) + +shap_interaction_time, shap_interactions = bench.measure_function_time( + predict, dtest, pred_interactions=True, params=params +) + transform_time, model_daal = bench.measure_function_time( - daal4py.get_gbt_model_from_xgboost, booster, params=params) - -if hasattr(params, 'n_classes'): - predict_algo = daal4py.gbt_classification_prediction( - nClasses=params.n_classes, resultsToEvaluate='computeClassLabels', fptype='float') - predict_time_daal, daal_pred = bench.measure_function_time( - predict_algo.compute, X_test, model_daal, params=params) - test_metric_daal = metric_func(y_test, daal_pred.prediction) -else: - predict_algo = daal4py.gbt_regression_prediction() - predict_time_daal, daal_pred = bench.measure_function_time( - predict_algo.compute, X_test, model_daal, params=params) - test_metric_daal = metric_func(y_test, daal_pred.prediction) + daal4py.mb.convert_model, booster, params=params +) + +predict_time_daal, daal_pred = bench.measure_function_time( + model_daal.predict, X_test, params=params +) +test_metric_daal = metric_func(y_test, daal_pred) + +shap_contrib_time_daal, daal_contribs = bench.measure_function_time( + model_daal.predict, X_test, pred_contribs=True, params=params +) + +shap_interaction_time_daal, daal_interactions = bench.measure_function_time( + model_daal.predict, X_test, pred_interactions=True, params=params +) + +contrib_accuracy = shap_accuracy(shap_contribs, daal_contribs) + +interaction_accuracy = shap_accuracy(shap_interactions, daal_interactions) + bench.print_output( - library='modelbuilders', algorithm=f'xgboost_{task}_and_modelbuilder', - stages=['training_preparation', 'training', 'prediction_preparation', 'prediction', - 'transformation', 'alternative_prediction'], + library="modelbuilders", + algorithm=f"xgboost_{task}_and_modelbuilder", + stages=[ + "training_preparation", + "training", + "prediction_preparation", + "prediction", + "transformation", + "alternative_prediction", + "shap_contrib_prediction", + "alternative_shap_contrib_prediction", + "shap_interaction_prediction", + "alternative_shap_interaction_prediction", + ], params=params, - functions=['xgb.dmatrix.train', 'xgb.train', 'xgb.dmatrix.test', 'xgb.predict', - 'daal4py.get_gbt_model_from_xgboost', 'daal4py.compute'], - times=[t_creat_train, fit_time, t_creat_test, predict_time, transform_time, - predict_time_daal], - metric_type=metric_name, - metrics=[None, train_metric, None, test_metric, None, test_metric_daal], - data=[X_train, X_train, X_test, X_test, X_test, X_test]) + functions=[ + "xgb.dmatrix.train", + "xgb.train", + "xgb.dmatrix.test", + "xgb.predict", + "daal4py.get_gbt_model_from_xgboost", + "daal4py.predict", + "xgb.predict(pred_contribs=True)", + "daal4py.predict(pred_contribs=True)", + "xgb.predict(pred_interactions=True)", + "daal4py.predict(pred_interactions=True)", + ], + times=[ + t_creat_train, + fit_time, + t_creat_test, + predict_time, + transform_time, + predict_time_daal, + shap_contrib_time, + shap_contrib_time_daal, + shap_interaction_time, + shap_interaction_time_daal, + ], + metric_type=[metric_name, "accuracy"], + metrics=[ + [ + None, + train_metric, + None, + test_metric, + None, + test_metric_daal, + None, + None, + None, + None, + ], + [ + None, + None, + None, + None, + None, + None, + None, + contrib_accuracy, + None, + interaction_accuracy, + ], + ], + data=[X_train] * 2 + [X_test] * 8, +) From ca03b57cbd1ecafc451246f023c427a5e5797bba Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Sun, 22 Oct 2023 23:46:16 -0700 Subject: [PATCH 3/8] provide lgmb_mb converter script to fix result files --- report_generator/fix-lgbm-mb-results.py | 93 +++++++++++++++++++++++++ 1 file changed, 93 insertions(+) create mode 100644 report_generator/fix-lgbm-mb-results.py diff --git a/report_generator/fix-lgbm-mb-results.py b/report_generator/fix-lgbm-mb-results.py new file mode 100644 index 000000000..35b1689cf --- /dev/null +++ b/report_generator/fix-lgbm-mb-results.py @@ -0,0 +1,93 @@ +# ============================================================================== +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +""" +Temporary solution to fix the .json result files created from lgbm_mb.py. +The result files are in an incompatible format for report_generator.py. +Attempts to produce xlsx reports fail and create empty files. + +After running this script on my-file.json, a new file my-file-fixed.json will be +produced, containing a JSON version of the results in a compatible format. + +Usage: + + python fix-lgbm-mb-results.py my-file.json [another-file.json ...] + + +Note: This is just a quick and dirty hack that does not fix the underlying + issue. Rather than changing this file (if something breaks again), the + original script lgbm_mb.py should be updated such that it produces valid + JSON dumps again. +""" + +from argparse import ArgumentParser +import json +from pathlib import Path + +def fix_file(fname: Path): + with open(fname) as fp: + data = json.load(fp) + + # copy all data (aux info etc) + fixed = {} + for key, val in data.items(): + fixed[key] = val + + # reset the results - we'll fix them + fixed["results"] = [] + + current_result = {} + for result in data["results"]: + if "algorithm" in result: + # found a new algo / measurement + current_result = result + continue + + if "stage" in result: + comb = current_result | result + if "device" not in comb: + comb["device"] = "none" + + if "time[s]" not in comb: + comb["time[s]"] = result.get("training_time") or result["prediction_time"] + + if "algorithm_parameters" not in comb: + comb["algorithm_paramters"] = {} + + if "accuracy[%]" in comb: + comb["accuracy"] = comb["accuracy[%]"] + + replace_pairs = ( + ("lgbm_train", "training"), + ("lgbm_predict", "prediction"), + ("daal4py_predict", "alternative_prediction"), + ) + for s, r in replace_pairs: + comb["stage"] = comb["stage"].replace(s, r) + + fixed["results"].append(comb) + + out_fname = fname.stem + "-fixed.json" + with open(out_fname, "w") as fp: + json.dump(fixed, fp, indent=4) + + +if __name__ == "__main__": + parser = ArgumentParser() + parser.add_argument("filenames", nargs="+") + args = parser.parse_args() + for fname in args.filenames: + fix_file(Path(fname)) From f9d6257b46a0b9a7657c2ff9d7cc3cfe52512540 Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Mon, 23 Oct 2023 01:02:48 -0700 Subject: [PATCH 4/8] Use RMSE for SHAP accuracy --- modelbuilders_bench/xgb_mb.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/modelbuilders_bench/xgb_mb.py b/modelbuilders_bench/xgb_mb.py index b3199901e..749f9ee16 100644 --- a/modelbuilders_bench/xgb_mb.py +++ b/modelbuilders_bench/xgb_mb.py @@ -34,11 +34,9 @@ def convert_xgb_predictions(y_pred, objective): return y_pred -def shap_accuracy(new, ref, threshold=1e-5): - new_sh = new.reshape(-1, ) - ref_sh = ref.reshape(-1, ) - diff = np.abs(new_sh - ref_sh) - return (diff < threshold).sum() / float(len(ref_sh)) +def shap_accuracy(new, ref): + # broadcast all values into single column and calculate RMSE + return bench.rmse_score(new.reshape(-1, ), ref.reshape(-1, )) parser = argparse.ArgumentParser( @@ -317,7 +315,7 @@ def predict(dmatrix, **kwargs): # type: ignore shap_interaction_time, shap_interaction_time_daal, ], - metric_type=[metric_name, "accuracy"], + metric_type=[metric_name, "RMSE"], metrics=[ [ None, From 2e354927ad699602b6ad279d9a4773000e079ae9 Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Fri, 27 Oct 2023 01:55:20 -0700 Subject: [PATCH 5/8] auto-format --- report_generator/fix-lgbm-mb-results.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/report_generator/fix-lgbm-mb-results.py b/report_generator/fix-lgbm-mb-results.py index 35b1689cf..c5dbc9702 100644 --- a/report_generator/fix-lgbm-mb-results.py +++ b/report_generator/fix-lgbm-mb-results.py @@ -37,6 +37,7 @@ import json from pathlib import Path + def fix_file(fname: Path): with open(fname) as fp: data = json.load(fp) @@ -62,7 +63,9 @@ def fix_file(fname: Path): comb["device"] = "none" if "time[s]" not in comb: - comb["time[s]"] = result.get("training_time") or result["prediction_time"] + comb["time[s]"] = ( + result.get("training_time") or result["prediction_time"] + ) if "algorithm_parameters" not in comb: comb["algorithm_paramters"] = {} From a16531741372187df32480d8c40323d88e4ee3ba Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Fri, 27 Oct 2023 03:19:49 -0700 Subject: [PATCH 6/8] Revert "Fix breaking kwarg" This reverts commit 78f3576aaa94c5b643c3c6bcdb5922789923c758. --- modelbuilders_bench/lgbm_mb.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modelbuilders_bench/lgbm_mb.py b/modelbuilders_bench/lgbm_mb.py index 7ddc6a68c..f263d419c 100644 --- a/modelbuilders_bench/lgbm_mb.py +++ b/modelbuilders_bench/lgbm_mb.py @@ -118,7 +118,8 @@ t_train, model_lgbm = bench.measure_function_time(lgbm.train, lgbm_params, lgbm_train, params=params, num_boost_round=params.n_estimators, - valid_sets=lgbm_train) + valid_sets=lgbm_train, + verbose_eval=False) train_metric = None if not X_train.equals(X_test): y_train_pred = model_lgbm.predict(X_train) From 99b4e93580602da5aa3e6a13ac6cb9f55fce8008 Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Fri, 27 Oct 2023 03:21:42 -0700 Subject: [PATCH 7/8] Revert "provide lgmb_mb converter script to fix result files" This reverts commit ca03b57cbd1ecafc451246f023c427a5e5797bba. --- report_generator/fix-lgbm-mb-results.py | 96 ------------------------- 1 file changed, 96 deletions(-) delete mode 100644 report_generator/fix-lgbm-mb-results.py diff --git a/report_generator/fix-lgbm-mb-results.py b/report_generator/fix-lgbm-mb-results.py deleted file mode 100644 index c5dbc9702..000000000 --- a/report_generator/fix-lgbm-mb-results.py +++ /dev/null @@ -1,96 +0,0 @@ -# ============================================================================== -# Copyright 2020-2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -""" -Temporary solution to fix the .json result files created from lgbm_mb.py. -The result files are in an incompatible format for report_generator.py. -Attempts to produce xlsx reports fail and create empty files. - -After running this script on my-file.json, a new file my-file-fixed.json will be -produced, containing a JSON version of the results in a compatible format. - -Usage: - - python fix-lgbm-mb-results.py my-file.json [another-file.json ...] - - -Note: This is just a quick and dirty hack that does not fix the underlying - issue. Rather than changing this file (if something breaks again), the - original script lgbm_mb.py should be updated such that it produces valid - JSON dumps again. -""" - -from argparse import ArgumentParser -import json -from pathlib import Path - - -def fix_file(fname: Path): - with open(fname) as fp: - data = json.load(fp) - - # copy all data (aux info etc) - fixed = {} - for key, val in data.items(): - fixed[key] = val - - # reset the results - we'll fix them - fixed["results"] = [] - - current_result = {} - for result in data["results"]: - if "algorithm" in result: - # found a new algo / measurement - current_result = result - continue - - if "stage" in result: - comb = current_result | result - if "device" not in comb: - comb["device"] = "none" - - if "time[s]" not in comb: - comb["time[s]"] = ( - result.get("training_time") or result["prediction_time"] - ) - - if "algorithm_parameters" not in comb: - comb["algorithm_paramters"] = {} - - if "accuracy[%]" in comb: - comb["accuracy"] = comb["accuracy[%]"] - - replace_pairs = ( - ("lgbm_train", "training"), - ("lgbm_predict", "prediction"), - ("daal4py_predict", "alternative_prediction"), - ) - for s, r in replace_pairs: - comb["stage"] = comb["stage"].replace(s, r) - - fixed["results"].append(comb) - - out_fname = fname.stem + "-fixed.json" - with open(out_fname, "w") as fp: - json.dump(fixed, fp, indent=4) - - -if __name__ == "__main__": - parser = ArgumentParser() - parser.add_argument("filenames", nargs="+") - args = parser.parse_args() - for fname in args.filenames: - fix_file(Path(fname)) From 8deeebd9185ced373101f2305237ff88bc95157e Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Mon, 30 Oct 2023 16:36:34 +0000 Subject: [PATCH 8/8] Update modelbuilders_bench/xgb_mb.py Co-authored-by: Nikolay Petrov --- modelbuilders_bench/xgb_mb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modelbuilders_bench/xgb_mb.py b/modelbuilders_bench/xgb_mb.py index 749f9ee16..67b35b0a3 100644 --- a/modelbuilders_bench/xgb_mb.py +++ b/modelbuilders_bench/xgb_mb.py @@ -1,5 +1,5 @@ # ============================================================================== -# Copyright 2020-2023 Intel Corporation +# Copyright 2020 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License.