From f77bf7f0bf261f72005f44361af28347091f3b23 Mon Sep 17 00:00:00 2001 From: Dmitry Razdoburdin <> Date: Tue, 4 Jul 2023 07:38:31 -0700 Subject: [PATCH 1/5] initial --- configs/daal4py/daal4py_gbt_config.json | 280 ++++++++++++++++++++++++ daal4py_bench/gbt.py | 172 +++++++++++++++ 2 files changed, 452 insertions(+) create mode 100644 configs/daal4py/daal4py_gbt_config.json create mode 100644 daal4py_bench/gbt.py diff --git a/configs/daal4py/daal4py_gbt_config.json b/configs/daal4py/daal4py_gbt_config.json new file mode 100644 index 000000000..55a25c0d1 --- /dev/null +++ b/configs/daal4py/daal4py_gbt_config.json @@ -0,0 +1,280 @@ +{ + "common": { + "lib": "daal4py", + "data-format": "pandas", + "data-order": "F", + "fptype": "float", + "algorithm": "gbt" + }, + "cases": [ + { + "dataset": [ + { + "source": "npy", + "name": "abalone", + "training": { + "x": "data/abalone_x_train.npy", + "y": "data/abalone_y_train.npy" + }, + "testing": { + "x": "data/abalone_x_test.npy", + "y": "data/abalone_y_test.npy" + } + } + ], + "max_tree_depth": 6, + "n_estimators": 1000, + "objective": "reg:squarederror" + }, + { + "dataset": [ + { + "source": "npy", + "name": "airline-ohe", + "training": { + "x": "data/airline-ohe_x_train.npy", + "y": "data/airline-ohe_y_train.npy" + }, + "testing": { + "x": "data/airline-ohe_x_test.npy", + "y": "data/airline-ohe_y_test.npy" + } + } + ], + "reg_lambda": 1, + "max_tree_depth": 8, + "n_estimators": 1000, + "objective": "binary:logistic" + }, + { + "dataset": [ + { + "source": "npy", + "name": "higgs1m", + "training": { + "x": "data/higgs1m_x_train.npy", + "y": "data/higgs1m_y_train.npy" + }, + "testing": { + "x": "data/higgs1m_x_test.npy", + "y": "data/higgs1m_y_test.npy" + } + } + ], + "max_bins": 256, + "reg_lambda": 1, + "max_tree_depth": 8, + "n_estimators": [100, 300, 1000, 3000, 10000, 30000], + "objective": "binary:logistic" + }, + { + "dataset": [ + { + "source": "npy", + "name": "letters", + "training": { + "x": "data/letters_x_train.npy", + "y": "data/letters_y_train.npy" + }, + "testing": { + "x": "data/letters_x_test.npy", + "y": "data/letters_y_test.npy" + } + } + ], + "max_tree_depth": 6, + "n_estimators": 1000, + "objective": "multi:softprob" + }, + { + "dataset": [ + { + "source": "npy", + "name": "mlsr", + "training": { + "x": "data/mlsr_x_train.npy", + "y": "data/mlsr_y_train.npy" + } + } + ], + "max_bins": 256, + "reg_lambda": 2, + "max_tree_depth": 8, + "n_estimators": 200, + "objective": "multi:softprob" + }, + { + "dataset": [ + { + "source": "npy", + "name": "mortgage1Q", + "training": { + "x": "data/mortgage1Q_x_train.npy", + "y": "data/mortgage1Q_y_train.npy" + } + } + ], + "n_estimators": 100, + "objective": "reg:squarederror", + "max_tree_depth": 8, + "reg_lambda": 1 + }, + { + "dataset": [ + { + "source": "npy", + "name": "plasticc", + "training": { + "x": "data/plasticc_x_train.npy", + "y": "data/plasticc_y_train.npy" + }, + "testing": { + "x": "data/plasticc_x_test.npy", + "y": "data/plasticc_y_test.npy" + } + } + ], + "n_estimators": 60, + "objective": "multi:softprob", + "max_tree_depth": 7 + }, + { + "dataset": [ + { + "source": "npy", + "name": "santander", + "training": { + "x": "data/santander_x_train.npy", + "y": "data/santander_y_train.npy" + }, + "testing": { + "x": "data/santander_x_test.npy", + "y": "data/santander_y_test.npy" + } + } + ], + "n_estimators": 10000, + "objective": "binary:logistic", + "max_tree_depth": 1 + }, + { + "objective": "binary:logistic", + "dataset": [ + { + "source": "npy", + "name": "airline", + "training": { + "x": "data/airline_x_train.npy", + "y": "data/airline_y_train.npy" + }, + "testing": { + "x": "data/airline_x_test.npy", + "y": "data/airline_y_test.npy" + } + } + ] + }, + { + "objective": "binary:logistic", + "dataset": [ + { + "source": "npy", + "name": "bosch", + "training": { + "x": "data/bosch_x_train.npy", + "y": "data/bosch_y_train.npy" + }, + "testing": { + "x": "data/bosch_x_test.npy", + "y": "data/bosch_y_test.npy" + } + } + ] + }, + { + "objective": "multi:softmax", + "dataset": [ + { + "source": "npy", + "name": "covtype", + "training": { + "x": "data/covtype_x_train.npy", + "y": "data/covtype_y_train.npy" + }, + "testing": { + "x": "data/covtype_x_test.npy", + "y": "data/covtype_y_test.npy" + } + } + ] + }, + { + "objective": "binary:logistic", + "dataset": [ + { + "source": "npy", + "name": "epsilon", + "training": { + "x": "data/epsilon_x_train.npy", + "y": "data/epsilon_y_train.npy" + }, + "testing": { + "x": "data/epsilon_x_test.npy", + "y": "data/epsilon_y_test.npy" + } + } + ] + }, + { + "objective": "binary:logistic", + "dataset": [ + { + "source": "npy", + "name": "fraud", + "training": { + "x": "data/fraud_x_train.npy", + "y": "data/fraud_y_train.npy" + }, + "testing": { + "x": "data/fraud_x_test.npy", + "y": "data/fraud_y_test.npy" + } + } + ] + }, + { + "objective": "binary:logistic", + "dataset": [ + { + "source": "npy", + "name": "higgs", + "training": { + "x": "data/higgs_x_train.npy", + "y": "data/higgs_y_train.npy" + }, + "testing": { + "x": "data/higgs_x_test.npy", + "y": "data/higgs_y_test.npy" + } + } + ] + }, + { + "objective": "reg:squarederror", + "dataset": [ + { + "source": "npy", + "name": "year_prediction_msd", + "training": { + "x": "data/year_prediction_msd_x_train.npy", + "y": "data/year_prediction_msd_y_train.npy" + }, + "testing": { + "x": "data/year_prediction_msd_x_test.npy", + "y": "data/year_prediction_msd_y_test.npy" + } + } + ] + } + ] +} \ No newline at end of file diff --git a/daal4py_bench/gbt.py b/daal4py_bench/gbt.py new file mode 100644 index 000000000..666a787f6 --- /dev/null +++ b/daal4py_bench/gbt.py @@ -0,0 +1,172 @@ +# =============================================================================== +# Copyright 2020-2021 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# =============================================================================== + +import argparse +import logging +import sys + +import bench +import numpy as np +from sklearn.utils import check_random_state +from sklearn import preprocessing +import daal4py as d4p + + +parser = argparse.ArgumentParser(description='daal4py gradient boosted trees benchmark') + +parser.add_argument('--max_bins', type=int, default=256, + help='Maximum number of discrete bins to ' + 'bucket continuous features') +parser.add_argument('--min_bin_size', type=int, default=5, + help='Minimum size of discrete bins') +parser.add_argument('--max_tree_depth', type=int, default=8, + help='Maximum depth of a tree') +parser.add_argument('--min-split-loss', '--gamma', type=float, default=0, + help='Minimum loss reduction required to make' + ' partition on a leaf node') +parser.add_argument('--n_estimators', type=int, default=100, + help='The number of gradient boosted trees') +parser.add_argument('--reg_lambda', type=float, default=1, + help='L2 regularization term on weights') +parser.add_argument('--split_method', type=str, required=False, default='inexact', + help='The split algorithm used in daal4py') +parser.add_argument('--shrinkage', type=float, default=0.3, + help='Shrinkage rate') +parser.add_argument('--min_split_loss', type=float, default=0, + help='Minimal spilit loss') +parser.add_argument('--observations_per_tree_fraction', type=int, default=1, + help='Observations per tree fraction') +parser.add_argument('--features_per_node', type=int, default=0, + help='Features per node') +parser.add_argument('--min_observations_in_leaf_node', type=int, default=5, + help='Min observations in leaf node') +parser.add_argument('--memory_saving_mode', type=bool, default=False, + help='Enable memory saving mode') +parser.add_argument('--random_state', type=str, default=None, + help='Pass random state') +parser.add_argument('--objective', type=str, default="reg:squarederror", + help='Objective function') +parser.add_argument('--fptype', type=str, default="float", + help='FPType to use') + +params = bench.parse_args(parser) + +# Load and convert data +X_train, X_test, y_train, y_test = bench.load_data(params) + +if np.isnan(X_test.values).any(): + logging.warning('Nan values aren not supported in GBT DAAL fit yet') + sys.exit(1) + +# Get random seed +rs_ = check_random_state(params.random_state) +seed_ = rs_.randint(0, np.iinfo('i').max) + +d4p_params = { + 'split_method': params.split_method, + 'n_estimators': params.n_estimators, + 'max_tree_depth': params.max_tree_depth, + 'shrinkage': params.shrinkage, + 'min_split_loss': params.min_split_loss, + 'reg_lambda': params.reg_lambda, + 'objective': params.objective, + 'fptype': params.fptype, + 'observations_per_tree_fraction': params.observations_per_tree_fraction, + 'features_per_node': params.features_per_node, + 'min_observations_in_leaf_node': params.min_observations_in_leaf_node, + 'memory_saving_mode': params.memory_saving_mode, + 'max_bins': params.max_bins, + 'min_bin_size': params.min_bin_size, + 'random_state': params.random_state} + +if d4p_params["objective"].startswith('reg'): + task = "regression" + metric_name, metric_func = 'rmse', bench.rmse_score + train_algo = d4p.gbt_regression_training( + fptype=d4p_params["fptype"], + splitMethod=d4p_params["split_method"], + maxIterations=d4p_params["n_estimators"], + maxTreeDepth=d4p_params["max_tree_depth"], + shrinkage=d4p_params["shrinkage"], + minSplitLoss=d4p_params["min_split_loss"], + lambda_=d4p_params["reg_lambda"], + observationsPerTreeFraction=d4p_params["observations_per_tree_fraction"], + featuresPerNode=d4p_params["features_per_node"], + minObservationsInLeafNode=d4p_params["min_observations_in_leaf_node"], + memorySavingMode=d4p_params["memory_saving_mode"], + maxBins=d4p_params["max_bins"], + minBinSize=d4p_params["min_bin_size"], + engine=d4p.engines_mcg59(seed=seed_)) +else: + task = "classification" + metric_name = 'accuracy' + metric_func = bench.accuracy_score + le = preprocessing.LabelEncoder() + le.fit(y_train) + + n_classes = len(le.classes_) + # Covtype has one class more than there is in train + if params.dataset_name == 'covtype': + n_classes += 1 + # n_iterations = int(d4p_params["n_estimators"] / (n_classes if n_classes > 2 else 1)) + n_iterations = d4p_params["n_estimators"] + train_algo = d4p.gbt_classification_training( + fptype=d4p_params["fptype"], + nClasses=n_classes, + splitMethod=d4p_params["split_method"], + maxIterations=n_iterations, + maxTreeDepth=d4p_params["max_tree_depth"], + shrinkage=d4p_params["shrinkage"], + minSplitLoss=d4p_params["min_split_loss"], + lambda_=d4p_params["reg_lambda"], + observationsPerTreeFraction=d4p_params["observations_per_tree_fraction"], + featuresPerNode=d4p_params["features_per_node"], + minObservationsInLeafNode=d4p_params["min_observations_in_leaf_node"], + memorySavingMode=d4p_params["memory_saving_mode"], + maxBins=d4p_params["max_bins"], + minBinSize=d4p_params["min_bin_size"], + engine=d4p.engines_mcg59(seed=seed_)) + + +def fit(X_train, y_train): + return train_algo.compute(X_train, y_train).model + +def predict(X_test): # type: ignore + if task == "regression": + predict_algo = d4p.gbt_regression_prediction(fptype=d4p_params["fptype"]) + else: + predict_algo = d4p.gbt_classification_prediction(fptype=d4p_params["fptype"], + nClasses=n_classes, + resultsToEvaluate="computeClassLabels") + return predict_algo.compute(X_test, booster).prediction.ravel() + + +for i in range(5): + fit_time, booster = bench.measure_function_time( + fit, X_train, y_train, params=params) + train_metric = metric_func( + predict(X_train), y_train) + + predict_time, y_pred = bench.measure_function_time( + predict, X_test, params=params) + test_metric = metric_func(y_pred, y_test) + + bench.print_output(library='daal4py', algorithm=f'gradient_boosted_trees_{task}', + stages=['training', 'prediction'], + params=params, functions=['gbt.fit', 'gbt.predict'], + times=[fit_time, predict_time], metric_type=metric_name, + metrics=[train_metric, test_metric], data=[X_train, X_test], + alg_params=d4p_params) From 5d9cc5a53f8071d3d6cca7e0f208f4bf1ead727d Mon Sep 17 00:00:00 2001 From: Dmitry Razdoburdin <> Date: Tue, 4 Jul 2023 08:09:43 -0700 Subject: [PATCH 2/5] pep8 --- daal4py_bench/gbt.py | 114 +++++++++++++++++++++++-------------------- 1 file changed, 60 insertions(+), 54 deletions(-) diff --git a/daal4py_bench/gbt.py b/daal4py_bench/gbt.py index 666a787f6..60928bfee 100644 --- a/daal4py_bench/gbt.py +++ b/daal4py_bench/gbt.py @@ -25,7 +25,8 @@ import daal4py as d4p -parser = argparse.ArgumentParser(description='daal4py gradient boosted trees benchmark') +parser = argparse.ArgumentParser( + description='daal4py gradient boosted trees benchmark') parser.add_argument('--max_bins', type=int, default=256, help='Maximum number of discrete bins to ' @@ -41,7 +42,8 @@ help='The number of gradient boosted trees') parser.add_argument('--reg_lambda', type=float, default=1, help='L2 regularization term on weights') -parser.add_argument('--split_method', type=str, required=False, default='inexact', +parser.add_argument('--split_method', type=str, required=False, + default='inexact', help='The split algorithm used in daal4py') parser.add_argument('--shrinkage', type=float, default=0.3, help='Shrinkage rate') @@ -96,20 +98,20 @@ task = "regression" metric_name, metric_func = 'rmse', bench.rmse_score train_algo = d4p.gbt_regression_training( - fptype=d4p_params["fptype"], - splitMethod=d4p_params["split_method"], - maxIterations=d4p_params["n_estimators"], - maxTreeDepth=d4p_params["max_tree_depth"], - shrinkage=d4p_params["shrinkage"], - minSplitLoss=d4p_params["min_split_loss"], - lambda_=d4p_params["reg_lambda"], - observationsPerTreeFraction=d4p_params["observations_per_tree_fraction"], - featuresPerNode=d4p_params["features_per_node"], - minObservationsInLeafNode=d4p_params["min_observations_in_leaf_node"], - memorySavingMode=d4p_params["memory_saving_mode"], - maxBins=d4p_params["max_bins"], - minBinSize=d4p_params["min_bin_size"], - engine=d4p.engines_mcg59(seed=seed_)) + fptype=d4p_params["fptype"], + splitMethod=d4p_params["split_method"], + maxIterations=d4p_params["n_estimators"], + maxTreeDepth=d4p_params["max_tree_depth"], + shrinkage=d4p_params["shrinkage"], + minSplitLoss=d4p_params["min_split_loss"], + lambda_=d4p_params["reg_lambda"], + observationsPerTreeFraction=d4p_params["observations_per_tree_fraction"], + featuresPerNode=d4p_params["features_per_node"], + minObservationsInLeafNode=d4p_params["min_observations_in_leaf_node"], + memorySavingMode=d4p_params["memory_saving_mode"], + maxBins=d4p_params["max_bins"], + minBinSize=d4p_params["min_bin_size"], + engine=d4p.engines_mcg59(seed=seed_)) else: task = "classification" metric_name = 'accuracy' @@ -121,52 +123,56 @@ # Covtype has one class more than there is in train if params.dataset_name == 'covtype': n_classes += 1 - # n_iterations = int(d4p_params["n_estimators"] / (n_classes if n_classes > 2 else 1)) n_iterations = d4p_params["n_estimators"] train_algo = d4p.gbt_classification_training( - fptype=d4p_params["fptype"], - nClasses=n_classes, - splitMethod=d4p_params["split_method"], - maxIterations=n_iterations, - maxTreeDepth=d4p_params["max_tree_depth"], - shrinkage=d4p_params["shrinkage"], - minSplitLoss=d4p_params["min_split_loss"], - lambda_=d4p_params["reg_lambda"], - observationsPerTreeFraction=d4p_params["observations_per_tree_fraction"], - featuresPerNode=d4p_params["features_per_node"], - minObservationsInLeafNode=d4p_params["min_observations_in_leaf_node"], - memorySavingMode=d4p_params["memory_saving_mode"], - maxBins=d4p_params["max_bins"], - minBinSize=d4p_params["min_bin_size"], - engine=d4p.engines_mcg59(seed=seed_)) + fptype=d4p_params["fptype"], + nClasses=n_classes, + splitMethod=d4p_params["split_method"], + maxIterations=n_iterations, + maxTreeDepth=d4p_params["max_tree_depth"], + shrinkage=d4p_params["shrinkage"], + minSplitLoss=d4p_params["min_split_loss"], + lambda_=d4p_params["reg_lambda"], + observationsPerTreeFraction=d4p_params["observations_per_tree_fraction"], + featuresPerNode=d4p_params["features_per_node"], + minObservationsInLeafNode=d4p_params["min_observations_in_leaf_node"], + memorySavingMode=d4p_params["memory_saving_mode"], + maxBins=d4p_params["max_bins"], + minBinSize=d4p_params["min_bin_size"], + engine=d4p.engines_mcg59(seed=seed_)) def fit(X_train, y_train): return train_algo.compute(X_train, y_train).model + def predict(X_test): # type: ignore if task == "regression": - predict_algo = d4p.gbt_regression_prediction(fptype=d4p_params["fptype"]) + predict_algo = d4p.gbt_regression_prediction( + fptype=d4p_params["fptype"]) else: - predict_algo = d4p.gbt_classification_prediction(fptype=d4p_params["fptype"], - nClasses=n_classes, - resultsToEvaluate="computeClassLabels") + predict_algo = d4p.gbt_classification_prediction( + fptype=d4p_params["fptype"], + nClasses=n_classes, + resultsToEvaluate="computeClassLabels") return predict_algo.compute(X_test, booster).prediction.ravel() - - -for i in range(5): - fit_time, booster = bench.measure_function_time( - fit, X_train, y_train, params=params) - train_metric = metric_func( - predict(X_train), y_train) - - predict_time, y_pred = bench.measure_function_time( - predict, X_test, params=params) - test_metric = metric_func(y_pred, y_test) - - bench.print_output(library='daal4py', algorithm=f'gradient_boosted_trees_{task}', - stages=['training', 'prediction'], - params=params, functions=['gbt.fit', 'gbt.predict'], - times=[fit_time, predict_time], metric_type=metric_name, - metrics=[train_metric, test_metric], data=[X_train, X_test], - alg_params=d4p_params) + + +fit_time, booster = bench.measure_function_time( + fit, X_train, y_train, params=params) +train_metric = metric_func( + predict(X_train), y_train) + +predict_time, y_pred = bench.measure_function_time( + predict, X_test, params=params) +test_metric = metric_func(y_pred, y_test) + +bench.print_output( + library='daal4py', + algorithm=f'gradient_boosted_trees_{task}', + stages=['training', 'prediction'], + params=params, functions=['gbt.fit', 'gbt.predict'], + times=[fit_time, predict_time], metric_type=metric_name, + metrics=[train_metric, test_metric], + data=[X_train, X_test], + alg_params=d4p_params) From a449cf02cd4be1a73b1d9951a2da83ec9fd48ff0 Mon Sep 17 00:00:00 2001 From: Dmitry Razdoburdin <> Date: Thu, 6 Jul 2023 06:04:07 -0700 Subject: [PATCH 3/5] Replace np.iinfo(np.int32).max by 2**31 --- daal4py_bench/gbt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/daal4py_bench/gbt.py b/daal4py_bench/gbt.py index 60928bfee..e781d3432 100644 --- a/daal4py_bench/gbt.py +++ b/daal4py_bench/gbt.py @@ -75,7 +75,7 @@ # Get random seed rs_ = check_random_state(params.random_state) -seed_ = rs_.randint(0, np.iinfo('i').max) +seed_ = rs_.randint(0, 2**31) d4p_params = { 'split_method': params.split_method, From a623c016874ca1f4ba1c6ef37e64259ee886b2c8 Mon Sep 17 00:00:00 2001 From: Dmitry Razdoburdin Date: Fri, 8 Dec 2023 14:42:57 +0100 Subject: [PATCH 4/5] Update configs/daal4py/daal4py_gbt_config.json Co-authored-by: Nikolay Petrov --- configs/daal4py/daal4py_gbt_config.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/daal4py/daal4py_gbt_config.json b/configs/daal4py/daal4py_gbt_config.json index 55a25c0d1..422208b32 100644 --- a/configs/daal4py/daal4py_gbt_config.json +++ b/configs/daal4py/daal4py_gbt_config.json @@ -277,4 +277,4 @@ ] } ] -} \ No newline at end of file +} From 27d8f49550809744980d1af6f2becceb93ec1fec Mon Sep 17 00:00:00 2001 From: Dmitry Razdoburdin Date: Fri, 8 Dec 2023 14:43:05 +0100 Subject: [PATCH 5/5] Update daal4py_bench/gbt.py Co-authored-by: Nikolay Petrov --- daal4py_bench/gbt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/daal4py_bench/gbt.py b/daal4py_bench/gbt.py index e781d3432..02aa60514 100644 --- a/daal4py_bench/gbt.py +++ b/daal4py_bench/gbt.py @@ -1,5 +1,5 @@ # =============================================================================== -# Copyright 2020-2021 Intel Corporation +# Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License.