From 075217ff81466b7183521e1873beca4ab5ca0b5c Mon Sep 17 00:00:00 2001 From: icfaust Date: Mon, 25 Sep 2023 02:24:28 -0700 Subject: [PATCH 1/6] initial files for extra trees support --- configs/sklearn/performance/et_clsf.json | 165 +++++++++++++++ configs/sklearn/performance/et_regr.json | 251 +++++++++++++++++++++++ sklearn_bench/et_clsf.py | 98 +++++++++ sklearn_bench/et_regr.py | 90 ++++++++ 4 files changed, 604 insertions(+) create mode 100644 configs/sklearn/performance/et_clsf.json create mode 100644 configs/sklearn/performance/et_regr.json create mode 100644 sklearn_bench/et_clsf.py create mode 100644 sklearn_bench/et_regr.py diff --git a/configs/sklearn/performance/et_clsf.json b/configs/sklearn/performance/et_clsf.json new file mode 100644 index 000000000..a5943f1f3 --- /dev/null +++ b/configs/sklearn/performance/et_clsf.json @@ -0,0 +1,165 @@ +{ + "common": { + "lib": "sklearn", + "algorithm": "et_clsf", + "data-format": "pandas", + "data-order": "F", + "dtype": ["float32", "float64"], + "max-features": "sqrt", + "device": ["host", "cpu", "gpu", "none"] + }, + "cases": [ + { + "dataset": [ + { + "source": "npy", + "name": "higgs1m", + "training": + { + "x": "data/higgs1m_x_train.npy", + "y": "data/higgs1m_y_train.npy" + }, + "testing": + { + "x": "data/higgs1m_x_test.npy", + "y": "data/higgs1m_y_test.npy" + } + } + ], + "workload-size": "medium", + "num-trees": 50, + "max-depth": 16, + "max-leaf-nodes": 131072, + "max-features": 0.2 + }, + { + "device": "none", + "dataset": [ + { + "source": "npy", + "name": "airline-ohe", + "training": + { + "x": "data/airline-ohe_x_train.npy", + "y": "data/airline-ohe_y_train.npy" + }, + "testing": + { + "x": "data/airline-ohe_x_test.npy", + "y": "data/airline-ohe_y_test.npy" + } + } + ], + "workload-size": "medium", + "num-trees": 50, + "max-depth": 16, + "max-leaf-nodes": 131072, + "max-features": 0.2 + }, + { + "dataset": [ + { + "source": "npy", + "name": "susy", + "training": + { + "x": "data/susy_x_train.npy", + 
"y": "data/susy_y_train.npy" + }, + "testing": + { + "x": "data/susy_x_test.npy", + "y": "data/susy_y_test.npy" + } + } + ], + "workload-size": "medium", + "num-trees": 10, + "max-depth": 5 + }, + { + "dataset": [ + { + "source": "npy", + "name": "susy", + "training": + { + "x": "data/susy_x_train.npy", + "y": "data/susy_y_train.npy" + }, + "testing": + { + "x": "data/susy_x_test.npy", + "y": "data/susy_y_test.npy" + } + } + ], + "workload-size": "large", + "num-trees": 100, + "max-depth": 8 + }, + { + "dataset": [ + { + "source": "npy", + "name": "susy", + "training": + { + "x": "data/susy_x_train.npy", + "y": "data/susy_y_train.npy" + }, + "testing": + { + "x": "data/susy_x_test.npy", + "y": "data/susy_y_test.npy" + } + } + ], + "workload-size": "medium", + "num-trees": 20, + "max-depth": 16 + }, + { + "dataset": [ + { + "source": "npy", + "name": "mnist", + "training": + { + "x": "data/mnist_x_train.npy", + "y": "data/mnist_y_train.npy" + }, + "testing": + { + "x": "data/mnist_x_test.npy", + "y": "data/mnist_y_test.npy" + } + } + ], + "workload-size": "large", + "num-trees": 100, + "max-depth": 10 + }, + { + "dataset": [ + { + "source": "npy", + "name": "hepmass_150K", + "training": + { + "x": "data/hepmass_150K_x_train.npy", + "y": "data/hepmass_150K_y_train.npy" + }, + "testing": + { + "x": "data/hepmass_150K_x_test.npy", + "y": "data/hepmass_150K_y_test.npy" + } + } + ], + "workload-size": "medium", + "num-trees": 50, + "max-depth": 15 + } + ] +} diff --git a/configs/sklearn/performance/et_regr.json b/configs/sklearn/performance/et_regr.json new file mode 100644 index 000000000..d41813ee8 --- /dev/null +++ b/configs/sklearn/performance/et_regr.json @@ -0,0 +1,251 @@ +{ + "common": { + "lib": "sklearn", + "algorithm": "et_regr", + "data-format": "pandas", + "data-order": "F", + "dtype": ["float32", "float64"], + "device": ["host", "cpu", "gpu", "none"] + }, + "cases": [ + { + "device": "none", + "dataset": [ + { + "source": "npy", + "name": 
"airline_regression", + "training": + { + "x": "data/airline_regression_x_train.npy", + "y": "data/airline_regression_y_train.npy" + }, + "testing": + { + "x": "data/airline_regression_x_test.npy", + "y": "data/airline_regression_y_test.npy" + } + } + ], + "workload-size": "large", + "num-trees": 100 + }, + { + "dataset": [ + { + "source": "npy", + "name": "year_prediction_msd", + "training": + { + "x": "data/year_prediction_msd_x_train.npy", + "y": "data/year_prediction_msd_y_train.npy" + }, + "testing": + { + "x": "data/year_prediction_msd_x_test.npy", + "y": "data/year_prediction_msd_y_test.npy" + } + } + ], + "workload-size": "large", + "num-trees": 100 + }, + { + "max-features": 0.33, + "dataset": [ + { + "source": "npy", + "name": "year_prediction_msd", + "training": + { + "x": "data/year_prediction_msd_x_train.npy", + "y": "data/year_prediction_msd_y_train.npy" + }, + "testing": + { + "x": "data/year_prediction_msd_x_test.npy", + "y": "data/year_prediction_msd_y_test.npy" + } + } + ], + "workload-size": "medium", + "num-trees": 10, + "max-depth": 5 + }, + { + "max-features": 0.33, + "dataset": [ + { + "source": "npy", + "name": "year_prediction_msd", + "training": + { + "x": "data/year_prediction_msd_x_train.npy", + "y": "data/year_prediction_msd_y_train.npy" + }, + "testing": + { + "x": "data/year_prediction_msd_x_test.npy", + "y": "data/year_prediction_msd_y_test.npy" + } + } + ], + "workload-size": "large", + "num-trees": 100, + "max-depth": 5 + }, + { + "max-features": 0.33, + "dataset": [ + { + "source": "npy", + "name": "year_prediction_msd", + "training": + { + "x": "data/year_prediction_msd_x_train.npy", + "y": "data/year_prediction_msd_y_train.npy" + }, + "testing": + { + "x": "data/year_prediction_msd_x_test.npy", + "y": "data/year_prediction_msd_y_test.npy" + } + } + ], + "workload-size": "medium", + "num-trees": 20, + "max-depth": 8 + }, + { + "max-features": 0.33, + "dataset": [ + { + "source": "npy", + "name": "year_prediction_msd", + 
"training": + { + "x": "data/year_prediction_msd_x_train.npy", + "y": "data/year_prediction_msd_y_train.npy" + }, + "testing": + { + "x": "data/year_prediction_msd_x_test.npy", + "y": "data/year_prediction_msd_y_test.npy" + } + } + ], + "workload-size": "large", + "num-trees": 100, + "max-depth": 8 + }, + { + "max-features": 0.33, + "dataset": [ + { + "source": "npy", + "name": "year_prediction_msd", + "training": + { + "x": "data/year_prediction_msd_x_train.npy", + "y": "data/year_prediction_msd_y_train.npy" + }, + "testing": + { + "x": "data/year_prediction_msd_x_test.npy", + "y": "data/year_prediction_msd_y_test.npy" + } + } + ], + "workload-size": "large", + "num-trees": 20, + "max-depth": 16 + }, + { + "max-features": 0.33, + "dataset": [ + { + "source": "npy", + "name": "higgs1m", + "training": + { + "x": "data/higgs1m_x_train.npy", + "y": "data/higgs1m_y_train.npy" + }, + "testing": + { + "x": "data/higgs1m_x_test.npy", + "y": "data/higgs1m_y_test.npy" + } + } + ], + "workload-size": "medium", + "num-trees": [15, 20], + "max-depth": 8 + }, + { + "max-features": 0.33, + "dataset": [ + { + "source": "npy", + "name": "higgs1m", + "training": + { + "x": "data/higgs1m_x_train.npy", + "y": "data/higgs1m_y_train.npy" + }, + "testing": + { + "x": "data/higgs1m_x_test.npy", + "y": "data/higgs1m_y_test.npy" + } + } + ], + "workload-size": "large", + "num-trees": 100, + "max-depth": 8 + }, + { + "max-features": 0.33, + "dataset": [ + { + "source": "npy", + "name": "higgs_10500K", + "training": + { + "x": "data/higgs_10500K_x_train.npy", + "y": "data/higgs_10500K_y_train.npy" + }, + "testing": + { + "x": "data/higgs_10500K_x_test.npy", + "y": "data/higgs_10500K_y_test.npy" + } + } + ], + "workload-size": "large", + "num-trees": 100, + "max-depth": 8 + }, + { + "max-features": 0.33, + "dataset": [ + { + "source": "npy", + "name": "higgs_10500K", + "training": + { + "x": "data/higgs_10500K_x_train.npy", + "y": "data/higgs_10500K_y_train.npy" + }, + "testing": + { + "x": 
"data/higgs_10500K_x_test.npy", + "y": "data/higgs_10500K_y_test.npy" + } + } + ], + "workload-size": "large", + "num-trees": 20, + "max-depth": 16 + } + ] +} diff --git a/sklearn_bench/et_clsf.py b/sklearn_bench/et_clsf.py new file mode 100644 index 000000000..366453216 --- /dev/null +++ b/sklearn_bench/et_clsf.py @@ -0,0 +1,98 @@ +# =============================================================================== +# Copyright 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# =============================================================================== + +import argparse + +import bench +import numpy as np + + +def main(): + from sklearn.ensemble import ExtraTreesClassifier + + # Load and convert data + X_train, X_test, y_train, y_test = bench.load_data(params) + + # Create our extra trees classifier + clf = ExtraTreesClassifier(criterion=params.criterion, + n_estimators=params.num_trees, + max_depth=params.max_depth, + max_features=params.max_features, + min_samples_split=params.min_samples_split, + max_leaf_nodes=params.max_leaf_nodes, + min_impurity_decrease=params.min_impurity_decrease, + bootstrap=params.bootstrap, + random_state=params.seed, + n_jobs=params.n_jobs) + + params.n_classes = len(np.unique(y_train)) + + fit_time, _ = bench.measure_function_time(clf.fit, X_train, y_train, params=params) + y_pred = clf.predict(X_train) + y_proba = clf.predict_proba(X_train) + train_acc = bench.accuracy_score(y_train, y_pred) + train_log_loss = bench.log_loss(y_train, y_proba) + train_roc_auc = bench.roc_auc_score(y_train, y_proba) + + predict_time, y_pred = bench.measure_function_time( + clf.predict, X_test, params=params) + y_proba = clf.predict_proba(X_test) + test_acc = bench.accuracy_score(y_test, y_pred) + test_log_loss = bench.log_loss(y_test, y_proba) + test_roc_auc = bench.roc_auc_score(y_test, y_proba) + + bench.print_output( + library='sklearn', + algorithm='et_clsf', + stages=['training', 'prediction'], + params=params, + functions=['et_clsf.fit', 'et_clsf.predict'], + times=[fit_time, predict_time], + metric_type=['accuracy', 'log_loss', 'roc_auc'], + metrics=[ + [train_acc, test_acc], + [train_log_loss, test_log_loss], + [train_roc_auc, test_roc_auc], + ], + data=[X_train, X_test], + alg_instance=clf, + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='scikit-learn extra trees ' + 'classification benchmark') + + parser.add_argument('--criterion', type=str, default='gini', + 
choices=('gini', 'entropy'), + help='The function to measure the quality of a split') + parser.add_argument('--num-trees', type=int, default=100, + help='Number of trees in the forest') + parser.add_argument('--max-features', type=bench.float_or_int_or_str, default=None, + help='Upper bound on features used at each split') + parser.add_argument('--max-depth', type=int, default=None, + help='Upper bound on depth of constructed trees') + parser.add_argument('--min-samples-split', type=bench.float_or_int, default=2, + help='Minimum samples number for node splitting') + parser.add_argument('--max-leaf-nodes', type=int, default=None, + help='Maximum leaf nodes per tree') + parser.add_argument('--min-impurity-decrease', type=float, default=0., + help='Needed impurity decrease for node splitting') + parser.add_argument('--no-bootstrap', dest='bootstrap', default=False, + action='store_false', help="Don't control bootstraping") + + params = bench.parse_args(parser) + bench.run_with_context(params, main) diff --git a/sklearn_bench/et_regr.py b/sklearn_bench/et_regr.py new file mode 100644 index 000000000..b9bcef691 --- /dev/null +++ b/sklearn_bench/et_regr.py @@ -0,0 +1,90 @@ +# =============================================================================== +# Copyright 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# =============================================================================== + +import argparse +import bench + + +def main(): + from sklearn.ensemble import ExtraTreesRegressor + + # Load and convert data + X_train, X_test, y_train, y_test = bench.load_data(params) + y_train = y_train.values.ravel() + y_test = y_test.values.ravel() + + # Create our extra trees regressor + regr = ExtraTreesRegressor(criterion=params.criterion, + n_estimators=params.num_trees, + max_depth=params.max_depth, + max_features=params.max_features, + min_samples_split=params.min_samples_split, + max_leaf_nodes=params.max_leaf_nodes, + min_impurity_decrease=params.min_impurity_decrease, + bootstrap=params.bootstrap, + random_state=params.seed, + n_jobs=params.n_jobs) + + fit_time, _ = bench.measure_function_time(regr.fit, X_train, y_train, params=params) + + y_pred = regr.predict(X_train) + train_rmse = bench.rmse_score(y_train, y_pred) + train_r2 = bench.r2_score(y_train, y_pred) + + predict_time, y_pred = bench.measure_function_time( + regr.predict, X_test, params=params) + test_rmse = bench.rmse_score(y_test, y_pred) + test_r2 = bench.r2_score(y_test, y_pred) + + bench.print_output( + library='sklearn', + algorithm='et_regr', + stages=['training', 'prediction'], + params=params, + functions=['et_regr.fit', 'et_regr.predict'], + times=[fit_time, predict_time], + metric_type=['rmse', 'r2_score'], + metrics=[[train_rmse, test_rmse], [train_r2, test_r2]], + data=[X_train, X_test], + alg_instance=regr, + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='scikit-learn extra trees ' + 'regression benchmark') + + parser.add_argument('--criterion', type=str, default='mse', + choices=('mse', 'mae'), + help='The function to measure the quality of a split') + parser.add_argument('--num-trees', type=int, default=100, + help='Number of trees in the forest') + parser.add_argument('--max-features', type=bench.float_or_int_or_str, default=None, + help='Upper bound on 
features used at each split') + parser.add_argument('--max-depth', type=int, default=None, + help='Upper bound on depth of constructed trees') + parser.add_argument('--min-samples-split', type=bench.float_or_int, default=2, + help='Minimum samples number for node splitting') + parser.add_argument('--max-leaf-nodes', type=int, default=None, + help='Grow trees with max_leaf_nodes in best-first fashion' + 'if it is not None') + parser.add_argument('--min-impurity-decrease', type=float, default=0., + help='Needed impurity decrease for node splitting') + parser.add_argument('--no-bootstrap', dest='bootstrap', default=False, + action='store_false', help="Don't control bootstraping") + + params = bench.parse_args(parser) + bench.run_with_context(params, main) From b60b68cf41521b29b7fabe440d7fda5110278b25 Mon Sep 17 00:00:00 2001 From: icfaust Date: Mon, 25 Sep 2023 03:42:18 -0700 Subject: [PATCH 2/6] change readmes for et --- README.md | 6 ++++-- sklearn_bench/README.md | 29 +++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 5eef0bc2e..a7b2e22b6 100755 --- a/README.md +++ b/README.md @@ -33,7 +33,7 @@ We publish blogs on Medium, so [follow us](https://medium.com/intel-analytics-so - [How to create conda environment for benchmarking](#how-to-create-conda-environment-for-benchmarking) - [Running Python benchmarks with runner script](#running-python-benchmarks-with-runner-script) - [Benchmark supported algorithms](#benchmark-supported-algorithms) - - [Scikit-learn benchmakrs](#scikit-learn-benchmakrs) +- [Scikit-learn benchmarks](#scikit-learn-benchmarks) - [Algorithm parameters](#algorithm-parameters) ## How to create conda environment for benchmarking @@ -105,6 +105,8 @@ The configuration of benchmarks allows you to select the frameworks to run, sele 
|**[DBSCAN](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html)**|dbscan|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:x:| |**[RandomForestClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html)**|df_clfs|:white_check_mark:|:x:|:white_check_mark:|:white_check_mark:|:x:| |**[RandomForestRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html)**|df_regr|:white_check_mark:|:x:|:white_check_mark:|:white_check_mark:|:x:| +|**[ExtraTreesClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html)**|et_clsf|:white_check_mark:|:x:|:x:|:x:|:x:| +|**[ExtraTreesRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesRegressor.html)**|et_regr|:white_check_mark:|:x:|:x:|:x:|:x:| |**[pairwise_distances](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise_distances.html)**|distances|:white_check_mark:|:x:|:white_check_mark:|:x:|:x:| |**[KMeans](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html)**|kmeans|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:x:| |**[KNeighborsClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html)**|knn_clsf|:white_check_mark:|:x:|:x:|:white_check_mark:|:x:| @@ -118,7 +120,7 @@ The configuration of benchmarks allows you to select the frameworks to run, sele |**[GradientBoostingClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html)**|gbt|:x:|:x:|:x:|:x:|:white_check_mark:| |**[GradientBoostingRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html)**|gbt|:x:|:x:|:x:|:x:|:white_check_mark:| -### Scikit-learn benchmakrs +### Scikit-learn benchmarks When you run scikit-learn benchmarks on
CPU, [Intel(R) Extension for Scikit-learn](https://github.com/intel/scikit-learn-intelex) is used by default. Use the ``--no-intel-optimized`` option to run the benchmarks without the extension. diff --git a/sklearn_bench/README.md b/sklearn_bench/README.md index bde10c3f9..7d70de27a 100644 --- a/sklearn_bench/README.md +++ b/sklearn_bench/README.md @@ -16,6 +16,8 @@ You can launch benchmarks for each algorithm separately. The tables below list a - [DBSCAN](#dbscan) - [RandomForestClassifier](#randomforestclassifier) - [RandomForestRegressor](#randomforestregressor) +- [ExtraTreesClassifier](#extratreesclassifier) +- [ExtraTreesRegressor](#extratreesregressor) - [pairwise_distances](#pairwise_distances) - [KMeans](#kmeans) - [KNeighborsClassifier](#kneighborsclassifier) @@ -85,6 +87,33 @@ You can launch benchmarks for each algorithm separately. The tables below list a | no-bootstrap | action | True | Don't control bootstraping | | use-sklearn-class | action | | Force use of sklearn.ensemble.RandomForestClassifier | +### ExtraTreesClassifier + +| parameter Name | Type | default value | description | +| ----- | ---- |---- |---- | +| criterion | str | gini | *gini* or *entropy*. The function to measure the quality of a split | +| num-trees | int | 100 | The number of trees in the forest | +| max-features | float_or_int | None | Upper bound on features used at each split | +| max-depth | int | None | Upper bound on depth of constructed trees | +| min-samples-split | float_or_int | 2 | Minimum samples number for node splitting | +| max-leaf-nodes | int | None | Maximum leaf nodes per tree | +| min-impurity-decrease | float | 0 | Needed impurity decrease for node splitting | +| no-bootstrap | store_false | False | Don't control bootstraping | + +### ExtraTreesRegressor + +| parameter Name | Type | default value | description | +| ----- | ---- |---- |---- | +| criterion | str | gini | *gini* or *entropy*. 
The function to measure the quality of a split | +| num-trees | int | 100 | The number of trees in the forest | +| max-features | float_or_int | None | Upper bound on features used at each split | +| max-depth | int | None | Upper bound on depth of constructed trees | +| min-samples-split | float_or_int | 2 | Minimum samples number for node splitting | +| max-leaf-nodes | int | None | Maximum leaf nodes per tree | +| min-impurity-decrease | float | 0 | Needed impurity decrease for node splitting | +| no-bootstrap | action | False | Don't control bootstraping | +| use-sklearn-class | action | | Force use of sklearn.ensemble.ExtraTreesRegressor | + ### pairwise_distances | parameter Name | Type | default value | description | From 4839a3d162b8b542ddf7e94916b4d039fdcff1a8 Mon Sep 17 00:00:00 2001 From: icfaust Date: Mon, 25 Sep 2023 03:44:53 -0700 Subject: [PATCH 3/6] correcting mistakes in master --- sklearn_bench/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn_bench/README.md b/sklearn_bench/README.md index 7d70de27a..8bdc3c29e 100644 --- a/sklearn_bench/README.md +++ b/sklearn_bench/README.md @@ -77,7 +77,7 @@ You can launch benchmarks for each algorithm separately. The tables below list a | parameter Name | Type | default value | description | | ----- | ---- |---- |---- | -| criterion | str | gini | *gini* or *entropy*. The function to measure the quality of a split | +| criterion | str | mse | *mse* or *mae*. The function to measure the quality of a split | | num-trees | int | 100 | The number of trees in the forest | | max-features | float_or_int | None | Upper bound on features used at each split | | max-depth | int | None | Upper bound on depth of constructed trees | @@ -104,7 +104,7 @@ You can launch benchmarks for each algorithm separately. The tables below list a | parameter Name | Type | default value | description | | ----- | ---- |---- |---- | -| criterion | str | gini | *gini* or *entropy*.
The function to measure the quality of a split | +| criterion | str | mse | *mse* or *mae*. The function to measure the quality of a split | | num-trees | int | 100 | The number of trees in the forest | | max-features | float_or_int | None | Upper bound on features used at each split | | max-depth | int | None | Upper bound on depth of constructed trees | From 04114e99bc4c14aa70eccf18cd4deec2425f6115 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Mon, 25 Sep 2023 12:58:20 +0200 Subject: [PATCH 4/6] Update sklearn.json --- configs/testing/sklearn.json | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/configs/testing/sklearn.json b/configs/testing/sklearn.json index f114ef793..83b1e5133 100644 --- a/configs/testing/sklearn.json +++ b/configs/testing/sklearn.json @@ -57,6 +57,42 @@ } ], "num-trees": 10 + }, + { + "algorithm": "et_clsf", + "dataset": [ + { + "source": "synthetic", + "type": "classification", + "n_classes": 5, + "n_features": 10, + "training": { + "n_samples": 1000 + }, + "testing": { + "n_samples": 20 + } + } + ], + "num-trees": 10 + }, + { + "algorithm": "et_regr", + "dataset": [ + { + "source": "synthetic", + "type": "regression", + "n_classes": 5, + "n_features": 10, + "training": { + "n_samples": 100 + }, + "testing": { + "n_samples": 20 + } + } + ], + "num-trees": 10 }, { "algorithm": "ridge", From 09385d6f58ea40fab22f1d150403ca51490d6677 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Mon, 25 Sep 2023 13:02:09 +0200 Subject: [PATCH 5/6] Update skl_config.json --- configs/skl_config.json | 72 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/configs/skl_config.json b/configs/skl_config.json index f3f1fa93f..150735b00 100644 --- a/configs/skl_config.json +++ b/configs/skl_config.json @@ -182,6 +182,78 @@ } ] }, + { + "algorithm": "et_clsf", + "dtype": "float32", + "dataset": [ + { + "source": "npy", + "name": "higgs1m", + "training": + { + "x": "data/higgs1m_x_train.npy", + 
"y": "data/higgs1m_y_train.npy" + }, + "testing": + { + "x": "data/higgs1m_x_test.npy", + "y": "data/higgs1m_y_test.npy" + } + }, + { + "source": "npy", + "name": "airline-ohe", + "training": + { + "x": "data/airline-ohe_x_train.npy", + "y": "data/airline-ohe_y_train.npy" + }, + "testing": + { + "x": "data/airline-ohe_x_test.npy", + "y": "data/airline-ohe_y_test.npy" + } + } + ], + "num-trees": 50, + "max-depth": 16, + "max-leaf-nodes": 131072, + "max-features": 0.2 + }, + { + "algorithm": "et_regr", + "dtype": "float32", + "dataset": [ + { + "source": "npy", + "name": "year_prediction_msd", + "training": + { + "x": "data/year_prediction_msd_x_train.npy", + "y": "data/year_prediction_msd_y_train.npy" + }, + "testing": + { + "x": "data/year_prediction_msd_x_test.npy", + "y": "data/year_prediction_msd_y_test.npy" + } + }, + { + "source": "npy", + "name": "airline_regression", + "training": + { + "x": "data/airline_regression_x_train.npy", + "y": "data/airline_regression_y_train.npy" + }, + "testing": + { + "x": "data/airline_regression_x_test.npy", + "y": "data/airline_regression_y_test.npy" + } + } + ] + }, { "algorithm": "ridge", "dataset": [ From 092de2c23d47b680c0a99759a4f51a6ecbae4346 Mon Sep 17 00:00:00 2001 From: icfaust Date: Mon, 25 Sep 2023 04:11:02 -0700 Subject: [PATCH 6/6] mse -> squared_error --- sklearn_bench/README.md | 4 ++-- sklearn_bench/df_regr.py | 4 ++-- sklearn_bench/et_regr.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn_bench/README.md b/sklearn_bench/README.md index 8bdc3c29e..a2353fd9c 100644 --- a/sklearn_bench/README.md +++ b/sklearn_bench/README.md @@ -77,7 +77,7 @@ You can launch benchmarks for each algorithm separately. The tables below list a | parameter Name | Type | default value | description | | ----- | ---- |---- |---- | -| criterion | str | mse | *mse* or *mae*. The function to measure the quality of a split | +| criterion | str | squared_error | *squared_error* or *absolute_error*.
The function to measure the quality of a split | | num-trees | int | 100 | The number of trees in the forest | | max-features | float_or_int | None | Upper bound on features used at each split | | max-depth | int | None | Upper bound on depth of constructed trees | @@ -104,7 +104,7 @@ You can launch benchmarks for each algorithm separately. The tables below list a | parameter Name | Type | default value | description | | ----- | ---- |---- |---- | -| criterion | str | mse | *mse* or *mae*. The function to measure the quality of a split | +| criterion | str | squared_error | *squared_error* or *absolute_error*. The function to measure the quality of a split | | num-trees | int | 100 | The number of trees in the forest | | max-features | float_or_int | None | Upper bound on features used at each split | | max-depth | int | None | Upper bound on depth of constructed trees | diff --git a/sklearn_bench/df_regr.py b/sklearn_bench/df_regr.py index 4c7491af3..baa5bb475 100644 --- a/sklearn_bench/df_regr.py +++ b/sklearn_bench/df_regr.py @@ -67,8 +67,8 @@ def main(): parser = argparse.ArgumentParser(description='scikit-learn random forest ' 'regression benchmark') - parser.add_argument('--criterion', type=str, default='mse', - choices=('mse', 'mae'), + parser.add_argument('--criterion', type=str, default='squared_error', + choices=('squared_error', 'absolute_error'), help='The function to measure the quality of a split') parser.add_argument('--num-trees', type=int, default=100, help='Number of trees in the forest') diff --git a/sklearn_bench/et_regr.py b/sklearn_bench/et_regr.py index b9bcef691..29bcc8f02 100644 --- a/sklearn_bench/et_regr.py +++ b/sklearn_bench/et_regr.py @@ -67,8 +67,8 @@ def main(): parser = argparse.ArgumentParser(description='scikit-learn extra trees ' 'regression benchmark') - parser.add_argument('--criterion', type=str, default='mse', - choices=('mse', 'mae'), + parser.add_argument('--criterion', type=str, default='squared_error', +
choices=('squared_error', 'absolute_error'), help='The function to measure the quality of a split') parser.add_argument('--num-trees', type=int, default=100, help='Number of trees in the forest')