From 075217ff81466b7183521e1873beca4ab5ca0b5c Mon Sep 17 00:00:00 2001
From: icfaust <icfaust@gmail.com>
Date: Mon, 25 Sep 2023 02:24:28 -0700
Subject: [PATCH 1/6] initial files for extra trees support

---
 configs/sklearn/performance/et_clsf.json | 165 +++++++++++++++
 configs/sklearn/performance/et_regr.json | 251 +++++++++++++++++++++++
 sklearn_bench/et_clsf.py                 |  98 +++++++++
 sklearn_bench/et_regr.py                 |  90 ++++++++
 4 files changed, 604 insertions(+)
 create mode 100644 configs/sklearn/performance/et_clsf.json
 create mode 100644 configs/sklearn/performance/et_regr.json
 create mode 100644 sklearn_bench/et_clsf.py
 create mode 100644 sklearn_bench/et_regr.py

diff --git a/configs/sklearn/performance/et_clsf.json b/configs/sklearn/performance/et_clsf.json
new file mode 100644
index 000000000..a5943f1f3
--- /dev/null
+++ b/configs/sklearn/performance/et_clsf.json
@@ -0,0 +1,165 @@
+{
+    "common": {
+        "lib": "sklearn",
+        "algorithm": "et_clsf",
+        "data-format": "pandas",
+        "data-order": "F",
+        "dtype": ["float32", "float64"],
+        "max-features": "sqrt",
+        "device": ["host", "cpu", "gpu", "none"]
+    },
+    "cases": [
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "higgs1m",
+                    "training":
+                    {
+                        "x": "data/higgs1m_x_train.npy",
+                        "y": "data/higgs1m_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/higgs1m_x_test.npy",
+                        "y": "data/higgs1m_y_test.npy"
+                    }
+                }
+            ],
+            "workload-size": "medium",
+            "num-trees": 50,
+            "max-depth": 16,
+            "max-leaf-nodes": 131072,
+            "max-features": 0.2
+        },
+        {
+            "device": "none",
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "airline-ohe",
+                    "training":
+                    {
+                        "x": "data/airline-ohe_x_train.npy",
+                        "y": "data/airline-ohe_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/airline-ohe_x_test.npy",
+                        "y": "data/airline-ohe_y_test.npy"
+                    }
+                }
+            ],
+            "workload-size": "medium",
+            "num-trees": 50,
+            "max-depth": 16,
+            "max-leaf-nodes": 131072,
+            "max-features": 0.2
+        },
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "susy",
+                    "training":
+                    {
+                        "x": "data/susy_x_train.npy",
+                        "y": "data/susy_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/susy_x_test.npy",
+                        "y": "data/susy_y_test.npy"
+                    }
+                }
+            ],
+            "workload-size": "medium",
+            "num-trees": 10,
+            "max-depth": 5
+        },
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "susy",
+                    "training":
+                    {
+                        "x": "data/susy_x_train.npy",
+                        "y": "data/susy_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/susy_x_test.npy",
+                        "y": "data/susy_y_test.npy"
+                    }
+                }
+            ],
+            "workload-size": "large",
+            "num-trees": 100,
+            "max-depth": 8
+        },
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "susy",
+                    "training":
+                    {
+                        "x": "data/susy_x_train.npy",
+                        "y": "data/susy_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/susy_x_test.npy",
+                        "y": "data/susy_y_test.npy"
+                    }
+                }
+            ],
+            "workload-size": "medium",
+            "num-trees": 20,
+            "max-depth": 16
+        },
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "mnist",
+                    "training":
+                    {
+                        "x": "data/mnist_x_train.npy",
+                        "y": "data/mnist_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/mnist_x_test.npy",
+                        "y": "data/mnist_y_test.npy"
+                    }
+                }
+            ],
+            "workload-size": "large",
+            "num-trees": 100,
+            "max-depth": 10
+        },
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "hepmass_150K",
+                    "training":
+                    {
+                        "x": "data/hepmass_150K_x_train.npy",
+                        "y": "data/hepmass_150K_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/hepmass_150K_x_test.npy",
+                        "y": "data/hepmass_150K_y_test.npy"
+                    }
+                }
+            ],
+            "workload-size": "medium",
+            "num-trees": 50,
+            "max-depth": 15
+        }
+    ]
+}
diff --git a/configs/sklearn/performance/et_regr.json b/configs/sklearn/performance/et_regr.json
new file mode 100644
index 000000000..d41813ee8
--- /dev/null
+++ b/configs/sklearn/performance/et_regr.json
@@ -0,0 +1,251 @@
+{
+    "common": {
+        "lib": "sklearn",
+        "algorithm": "et_regr",
+        "data-format": "pandas",
+        "data-order": "F",
+        "dtype": ["float32", "float64"],
+        "device": ["host", "cpu", "gpu", "none"]
+    },
+    "cases": [
+        {
+            "device": "none",
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "airline_regression",
+                    "training":
+                    {
+                        "x": "data/airline_regression_x_train.npy",
+                        "y": "data/airline_regression_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/airline_regression_x_test.npy",
+                        "y": "data/airline_regression_y_test.npy"
+                    }
+                }
+            ],
+            "workload-size": "large",
+            "num-trees": 100
+        },
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "year_prediction_msd",
+                    "training":
+                    {
+                        "x": "data/year_prediction_msd_x_train.npy",
+                        "y": "data/year_prediction_msd_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/year_prediction_msd_x_test.npy",
+                        "y": "data/year_prediction_msd_y_test.npy"
+                    }
+                }
+            ],
+            "workload-size": "large",
+            "num-trees": 100
+        },
+        {
+            "max-features": 0.33,
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "year_prediction_msd",
+                    "training":
+                    {
+                        "x": "data/year_prediction_msd_x_train.npy",
+                        "y": "data/year_prediction_msd_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/year_prediction_msd_x_test.npy",
+                        "y": "data/year_prediction_msd_y_test.npy"
+                    }
+                }
+            ],
+            "workload-size": "medium",
+            "num-trees": 10,
+            "max-depth": 5
+        },
+        {
+            "max-features": 0.33,
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "year_prediction_msd",
+                    "training":
+                    {
+                        "x": "data/year_prediction_msd_x_train.npy",
+                        "y": "data/year_prediction_msd_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/year_prediction_msd_x_test.npy",
+                        "y": "data/year_prediction_msd_y_test.npy"
+                    }
+                }
+            ],
+            "workload-size": "large",
+            "num-trees": 100,
+            "max-depth": 5
+        },
+        {
+            "max-features": 0.33,
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "year_prediction_msd",
+                    "training":
+                    {
+                        "x": "data/year_prediction_msd_x_train.npy",
+                        "y": "data/year_prediction_msd_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/year_prediction_msd_x_test.npy",
+                        "y": "data/year_prediction_msd_y_test.npy"
+                    }
+                }
+            ],
+            "workload-size": "medium",
+            "num-trees": 20,
+            "max-depth": 8
+        },
+        {
+            "max-features": 0.33,
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "year_prediction_msd",
+                    "training":
+                    {
+                        "x": "data/year_prediction_msd_x_train.npy",
+                        "y": "data/year_prediction_msd_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/year_prediction_msd_x_test.npy",
+                        "y": "data/year_prediction_msd_y_test.npy"
+                    }
+                }
+            ],
+            "workload-size": "large",
+            "num-trees": 100,
+            "max-depth": 8
+        },
+        {
+            "max-features": 0.33,
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "year_prediction_msd",
+                    "training":
+                    {
+                        "x": "data/year_prediction_msd_x_train.npy",
+                        "y": "data/year_prediction_msd_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/year_prediction_msd_x_test.npy",
+                        "y": "data/year_prediction_msd_y_test.npy"
+                    }
+                }
+            ],
+            "workload-size": "large",
+            "num-trees": 20,
+            "max-depth": 16
+        },
+        {
+            "max-features": 0.33,
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "higgs1m",
+                    "training":
+                    {
+                        "x": "data/higgs1m_x_train.npy",
+                        "y": "data/higgs1m_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/higgs1m_x_test.npy",
+                        "y": "data/higgs1m_y_test.npy"
+                    }
+                }
+            ],
+            "workload-size": "medium",
+            "num-trees": [15, 20],
+            "max-depth": 8
+        },
+        {
+            "max-features": 0.33,
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "higgs1m",
+                    "training":
+                    {
+                        "x": "data/higgs1m_x_train.npy",
+                        "y": "data/higgs1m_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/higgs1m_x_test.npy",
+                        "y": "data/higgs1m_y_test.npy"
+                    }
+                }
+            ],
+            "workload-size": "large",
+            "num-trees": 100,
+            "max-depth": 8
+        },
+        {
+            "max-features": 0.33,
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "higgs_10500K",
+                    "training":
+                    {
+                        "x": "data/higgs_10500K_x_train.npy",
+                        "y": "data/higgs_10500K_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/higgs_10500K_x_test.npy",
+                        "y": "data/higgs_10500K_y_test.npy"
+                    }
+                }
+            ],
+            "workload-size": "large",
+            "num-trees": 100,
+            "max-depth": 8
+        },
+        {
+            "max-features": 0.33,
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "higgs_10500K",
+                    "training":
+                    {
+                        "x": "data/higgs_10500K_x_train.npy",
+                        "y": "data/higgs_10500K_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/higgs_10500K_x_test.npy",
+                        "y": "data/higgs_10500K_y_test.npy"
+                    }
+                }
+            ],
+            "workload-size": "large",
+            "num-trees": 20,
+            "max-depth": 16
+        }
+    ]
+}
diff --git a/sklearn_bench/et_clsf.py b/sklearn_bench/et_clsf.py
new file mode 100644
index 000000000..366453216
--- /dev/null
+++ b/sklearn_bench/et_clsf.py
@@ -0,0 +1,98 @@
+# ===============================================================================
+# Copyright 2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ===============================================================================
+
+import argparse
+
+import bench
+import numpy as np
+
+
+def main():
+    from sklearn.ensemble import ExtraTreesClassifier
+
+    # Load and convert data
+    X_train, X_test, y_train, y_test = bench.load_data(params)
+
+    # Create our extra trees classifier
+    clf = ExtraTreesClassifier(criterion=params.criterion,
+                               n_estimators=params.num_trees,
+                               max_depth=params.max_depth,
+                               max_features=params.max_features,
+                               min_samples_split=params.min_samples_split,
+                               max_leaf_nodes=params.max_leaf_nodes,
+                               min_impurity_decrease=params.min_impurity_decrease,
+                               bootstrap=params.bootstrap,
+                               random_state=params.seed,
+                               n_jobs=params.n_jobs)
+
+    params.n_classes = len(np.unique(y_train))
+
+    fit_time, _ = bench.measure_function_time(clf.fit, X_train, y_train, params=params)
+    y_pred = clf.predict(X_train)
+    y_proba = clf.predict_proba(X_train)
+    train_acc = bench.accuracy_score(y_train, y_pred)
+    train_log_loss = bench.log_loss(y_train, y_proba)
+    train_roc_auc = bench.roc_auc_score(y_train, y_proba)
+
+    predict_time, y_pred = bench.measure_function_time(
+        clf.predict, X_test, params=params)
+    y_proba = clf.predict_proba(X_test)
+    test_acc = bench.accuracy_score(y_test, y_pred)
+    test_log_loss = bench.log_loss(y_test, y_proba)
+    test_roc_auc = bench.roc_auc_score(y_test, y_proba)
+
+    bench.print_output(
+        library='sklearn',
+        algorithm='et_clsf',
+        stages=['training', 'prediction'],
+        params=params,
+        functions=['et_clsf.fit', 'et_clsf.predict'],
+        times=[fit_time, predict_time],
+        metric_type=['accuracy', 'log_loss', 'roc_auc'],
+        metrics=[
+            [train_acc, test_acc],
+            [train_log_loss, test_log_loss],
+            [train_roc_auc, test_roc_auc],
+        ],
+        data=[X_train, X_test],
+        alg_instance=clf,
+    )
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='scikit-learn extra trees '
+                                                 'classification benchmark')
+
+    parser.add_argument('--criterion', type=str, default='gini',
+                        choices=('gini', 'entropy'),
+                        help='The function to measure the quality of a split')
+    parser.add_argument('--num-trees', type=int, default=100,
+                        help='Number of trees in the forest')
+    parser.add_argument('--max-features', type=bench.float_or_int_or_str, default=None,
+                        help='Upper bound on features used at each split')
+    parser.add_argument('--max-depth', type=int, default=None,
+                        help='Upper bound on depth of constructed trees')
+    parser.add_argument('--min-samples-split', type=bench.float_or_int, default=2,
+                        help='Minimum samples number for node splitting')
+    parser.add_argument('--max-leaf-nodes', type=int, default=None,
+                        help='Maximum leaf nodes per tree')
+    parser.add_argument('--min-impurity-decrease', type=float, default=0.,
+                        help='Needed impurity decrease for node splitting')
+    parser.add_argument('--no-bootstrap', dest='bootstrap', default=False,
+                        action='store_false', help="Don't control bootstraping")
+
+    params = bench.parse_args(parser)
+    bench.run_with_context(params, main)
diff --git a/sklearn_bench/et_regr.py b/sklearn_bench/et_regr.py
new file mode 100644
index 000000000..b9bcef691
--- /dev/null
+++ b/sklearn_bench/et_regr.py
@@ -0,0 +1,90 @@
+# ===============================================================================
+# Copyright 2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ===============================================================================
+
+import argparse
+import bench
+
+
+def main():
+    from sklearn.ensemble import ExtraTreesRegressor
+
+    # Load and convert data
+    X_train, X_test, y_train, y_test = bench.load_data(params)
+    y_train = y_train.values.ravel()
+    y_test = y_test.values.ravel()
+
+    # Create our extra trees regressor
+    regr = ExtraTreesRegressor(criterion=params.criterion,
+                               n_estimators=params.num_trees,
+                               max_depth=params.max_depth,
+                               max_features=params.max_features,
+                               min_samples_split=params.min_samples_split,
+                               max_leaf_nodes=params.max_leaf_nodes,
+                               min_impurity_decrease=params.min_impurity_decrease,
+                               bootstrap=params.bootstrap,
+                               random_state=params.seed,
+                               n_jobs=params.n_jobs)
+
+    fit_time, _ = bench.measure_function_time(regr.fit, X_train, y_train, params=params)
+
+    y_pred = regr.predict(X_train)
+    train_rmse = bench.rmse_score(y_train, y_pred)
+    train_r2 = bench.r2_score(y_train, y_pred)
+
+    predict_time, y_pred = bench.measure_function_time(
+        regr.predict, X_test, params=params)
+    test_rmse = bench.rmse_score(y_test, y_pred)
+    test_r2 = bench.r2_score(y_test, y_pred)
+
+    bench.print_output(
+        library='sklearn',
+        algorithm='et_regr',
+        stages=['training', 'prediction'],
+        params=params,
+        functions=['et_regr.fit', 'et_regr.predict'],
+        times=[fit_time, predict_time],
+        metric_type=['rmse', 'r2_score'],
+        metrics=[[train_rmse, test_rmse], [train_r2, test_r2]],
+        data=[X_train, X_test],
+        alg_instance=regr,
+    )
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='scikit-learn extra trees '
+                                     'regression benchmark')
+
+    parser.add_argument('--criterion', type=str, default='mse',
+                        choices=('mse', 'mae'),
+                        help='The function to measure the quality of a split')
+    parser.add_argument('--num-trees', type=int, default=100,
+                        help='Number of trees in the forest')
+    parser.add_argument('--max-features', type=bench.float_or_int_or_str, default=None,
+                        help='Upper bound on features used at each split')
+    parser.add_argument('--max-depth', type=int, default=None,
+                        help='Upper bound on depth of constructed trees')
+    parser.add_argument('--min-samples-split', type=bench.float_or_int, default=2,
+                        help='Minimum samples number for node splitting')
+    parser.add_argument('--max-leaf-nodes', type=int, default=None,
+                        help='Grow trees with max_leaf_nodes in best-first fashion '
+                        'if it is not None')
+    parser.add_argument('--min-impurity-decrease', type=float, default=0.,
+                        help='Needed impurity decrease for node splitting')
+    parser.add_argument('--no-bootstrap', dest='bootstrap', default=False,
+                        action='store_false', help="Don't control bootstraping")
+
+    params = bench.parse_args(parser)
+    bench.run_with_context(params, main)

From b60b68cf41521b29b7fabe440d7fda5110278b25 Mon Sep 17 00:00:00 2001
From: icfaust <icfaust@gmail.com>
Date: Mon, 25 Sep 2023 03:42:18 -0700
Subject: [PATCH 2/6] change readmes for et

---
 README.md               |  6 ++++--
 sklearn_bench/README.md | 29 +++++++++++++++++++++++++++++
 2 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 5eef0bc2e..a7b2e22b6 100755
--- a/README.md
+++ b/README.md
@@ -33,7 +33,7 @@ We publish blogs on Medium, so [follow us](https://medium.com/intel-analytics-so
 - [How to create conda environment for benchmarking](#how-to-create-conda-environment-for-benchmarking)
 - [Running Python benchmarks with runner script](#running-python-benchmarks-with-runner-script)
 - [Benchmark supported algorithms](#benchmark-supported-algorithms)
-  - [Scikit-learn benchmakrs](#scikit-learn-benchmakrs)
+  - [Scikit-learn benchmarks](#scikit-learn-benchmarks)
 - [Algorithm parameters](#algorithm-parameters)
 
 ## How to create conda environment for benchmarking
@@ -105,6 +105,8 @@ The configuration of benchmarks allows you to select the frameworks to run, sele
 |**[DBSCAN](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html)**|dbscan|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:x:|
 |**[RandomForestClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html)**|df_clfs|:white_check_mark:|:x:|:white_check_mark:|:white_check_mark:|:x:|
 |**[RandomForestRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html)**|df_regr|:white_check_mark:|:x:|:white_check_mark:|:white_check_mark:|:x:|
+|**[ExtraTreesClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html)**|et_clsf|:white_check_mark:|:x:|:x:|:x:|:x:|
+|**[ExtraTreesRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesRegressor.html)**|et_regr|:white_check_mark:|:x:|:x:|:x:|:x:|
 |**[pairwise_distances](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise_distances.html)**|distances|:white_check_mark:|:x:|:white_check_mark:|:x:|:x:|
 |**[KMeans](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html)**|kmeans|:white_check_mark:|:white_check_mark:|:white_check_mark:|:white_check_mark:|:x:|
 |**[KNeighborsClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html)**|knn_clsf|:white_check_mark:|:x:|:x:|:white_check_mark:|:x:|
@@ -118,7 +120,7 @@ The configuration of benchmarks allows you to select the frameworks to run, sele
 |**[GradientBoostingClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html)**|gbt|:x:|:x:|:x:|:x:|:white_check_mark:|
 |**[GradientBoostingRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html)**|gbt|:x:|:x:|:x:|:x:|:white_check_mark:|
 
-### Scikit-learn benchmakrs
+### Scikit-learn benchmarks
 
 When you run scikit-learn benchmarks on CPU, [Intel(R) Extension for Scikit-learn](https://github.com/intel/scikit-learn-intelex) is used by default. Use the ``--no-intel-optimized`` option to run the benchmarks without the extension.
 
diff --git a/sklearn_bench/README.md b/sklearn_bench/README.md
index bde10c3f9..7d70de27a 100644
--- a/sklearn_bench/README.md
+++ b/sklearn_bench/README.md
@@ -16,6 +16,8 @@ You can launch benchmarks for each algorithm separately. The tables below list a
 - [DBSCAN](#dbscan)
 - [RandomForestClassifier](#randomforestclassifier)
 - [RandomForestRegressor](#randomforestregressor)
+- [ExtraTreesClassifier](#extratreesclassifier)
+- [ExtraTreesRegressor](#extratreesregressor)
 - [pairwise_distances](#pairwise_distances)
 - [KMeans](#kmeans)
 - [KNeighborsClassifier](#kneighborsclassifier)
@@ -85,6 +87,33 @@ You can launch benchmarks for each algorithm separately. The tables below list a
 | no-bootstrap | action | True | Don't control bootstraping |
 | use-sklearn-class | action |  | Force use of sklearn.ensemble.RandomForestClassifier |
 
+### ExtraTreesClassifier
+
+| parameter Name  | Type | default value | description |
+| ----- | ---- |---- |---- |
+| criterion | str | gini | *gini* or *entropy*. The function to measure the quality of a split |
+| num-trees | int | 100 | The number of trees in the forest |
+| max-features | float_or_int | None | Upper bound on features used at each split |
+| max-depth | int | None | Upper bound on depth of constructed trees |
+| min-samples-split | float_or_int | 2 | Minimum samples number for node splitting |
+| max-leaf-nodes | int | None | Maximum leaf nodes per tree |
+| min-impurity-decrease | float | 0 | Needed impurity decrease for node splitting |
+| no-bootstrap | store_false | False | Don't control bootstraping |
+
+### ExtraTreesRegressor
+
+| parameter Name  | Type | default value | description |
+| ----- | ---- |---- |---- |
+| criterion | str | gini | *gini* or *entropy*. The function to measure the quality of a split |
+| num-trees | int | 100 | The number of trees in the forest |
+| max-features | float_or_int | None | Upper bound on features used at each split |
+| max-depth | int | None | Upper bound on depth of constructed trees |
+| min-samples-split | float_or_int | 2 | Minimum samples number for node splitting |
+| max-leaf-nodes | int | None | Maximum leaf nodes per tree |
+| min-impurity-decrease | float | 0 | Needed impurity decrease for node splitting |
+| no-bootstrap | action | False | Don't control bootstraping |
+| use-sklearn-class | action |  | Force use of sklearn.ensemble.ExtraTreesRegressor |
+
 ### pairwise_distances
 
 | parameter Name  | Type | default value | description |

From 4839a3d162b8b542ddf7e94916b4d039fdcff1a8 Mon Sep 17 00:00:00 2001
From: icfaust <icfaust@gmail.com>
Date: Mon, 25 Sep 2023 03:44:53 -0700
Subject: [PATCH 3/6] correcting mistakes in master

---
 sklearn_bench/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn_bench/README.md b/sklearn_bench/README.md
index 7d70de27a..8bdc3c29e 100644
--- a/sklearn_bench/README.md
+++ b/sklearn_bench/README.md
@@ -77,7 +77,7 @@ You can launch benchmarks for each algorithm separately. The tables below list a
 
 | parameter Name  | Type | default value | description |
 | ----- | ---- |---- |---- |
-| criterion | str | gini | *gini* or *entropy*. The function to measure the quality of a split |
+| criterion | str | mse | *mse* or *mae*. The function to measure the quality of a split |
 | num-trees | int | 100 | The number of trees in the forest |
 | max-features | float_or_int | None | Upper bound on features used at each split |
 | max-depth | int | None | Upper bound on depth of constructed trees |
@@ -104,7 +104,7 @@ You can launch benchmarks for each algorithm separately. The tables below list a
 
 | parameter Name  | Type | default value | description |
 | ----- | ---- |---- |---- |
-| criterion | str | gini | *gini* or *entropy*. The function to measure the quality of a split |
+| criterion | str | mse | *mse* or *mae*. The function to measure the quality of a split |
 | num-trees | int | 100 | The number of trees in the forest |
 | max-features | float_or_int | None | Upper bound on features used at each split |
 | max-depth | int | None | Upper bound on depth of constructed trees |

From 04114e99bc4c14aa70eccf18cd4deec2425f6115 Mon Sep 17 00:00:00 2001
From: Ian Faust <icfaust@gmail.com>
Date: Mon, 25 Sep 2023 12:58:20 +0200
Subject: [PATCH 4/6] Update sklearn.json

---
 configs/testing/sklearn.json | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/configs/testing/sklearn.json b/configs/testing/sklearn.json
index f114ef793..83b1e5133 100644
--- a/configs/testing/sklearn.json
+++ b/configs/testing/sklearn.json
@@ -57,6 +57,42 @@
                 }
             ],
             "num-trees": 10
+        },
+        {
+            "algorithm": "et_clsf",
+            "dataset": [
+                {
+                    "source": "synthetic",
+                    "type": "classification",
+                    "n_classes": 5,
+                    "n_features": 10,
+                    "training": {
+                        "n_samples": 1000
+                    },
+                    "testing": {
+                        "n_samples": 20
+                    }
+                }
+            ],
+            "num-trees": 10
+        },
+        {
+            "algorithm": "et_regr",
+            "dataset": [
+                {
+                    "source": "synthetic",
+                    "type": "regression",
+                    "n_classes": 5,
+                    "n_features": 10,
+                    "training": {
+                        "n_samples": 100
+                    },
+                    "testing": {
+                        "n_samples": 20
+                    }
+                }
+            ],
+            "num-trees": 10
         },
         {
             "algorithm": "ridge",

From 09385d6f58ea40fab22f1d150403ca51490d6677 Mon Sep 17 00:00:00 2001
From: Ian Faust <icfaust@gmail.com>
Date: Mon, 25 Sep 2023 13:02:09 +0200
Subject: [PATCH 5/6] Update skl_config.json

---
 configs/skl_config.json | 72 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 72 insertions(+)

diff --git a/configs/skl_config.json b/configs/skl_config.json
index f3f1fa93f..150735b00 100644
--- a/configs/skl_config.json
+++ b/configs/skl_config.json
@@ -182,6 +182,78 @@
                 }
             ]
         },
+        {
+            "algorithm": "et_clsf",
+            "dtype": "float32",
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "higgs1m",
+                    "training":
+                    {
+                        "x": "data/higgs1m_x_train.npy",
+                        "y": "data/higgs1m_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/higgs1m_x_test.npy",
+                        "y": "data/higgs1m_y_test.npy"
+                    }
+                },
+                {
+                    "source": "npy",
+                    "name": "airline-ohe",
+                    "training":
+                    {
+                        "x": "data/airline-ohe_x_train.npy",
+                        "y": "data/airline-ohe_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/airline-ohe_x_test.npy",
+                        "y": "data/airline-ohe_y_test.npy"
+                    }
+                }
+            ],
+            "num-trees": 50,
+            "max-depth": 16,
+            "max-leaf-nodes": 131072,
+            "max-features": 0.2
+        },
+        {
+            "algorithm": "et_regr",
+            "dtype": "float32",
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "year_prediction_msd",
+                    "training":
+                    {
+                        "x": "data/year_prediction_msd_x_train.npy",
+                        "y": "data/year_prediction_msd_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/year_prediction_msd_x_test.npy",
+                        "y": "data/year_prediction_msd_y_test.npy"
+                    }
+                },
+                {
+                    "source": "npy",
+                    "name": "airline_regression",
+                    "training":
+                    {
+                        "x": "data/airline_regression_x_train.npy",
+                        "y": "data/airline_regression_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/airline_regression_x_test.npy",
+                        "y": "data/airline_regression_y_test.npy"
+                    }
+                }
+            ]
+        },
         {
             "algorithm": "ridge",
             "dataset": [

From 092de2c23d47b680c0a99759a4f51a6ecbae4346 Mon Sep 17 00:00:00 2001
From: icfaust <icfaust@gmail.com>
Date: Mon, 25 Sep 2023 04:11:02 -0700
Subject: [PATCH 6/6] mse -> squared_error

---
 sklearn_bench/README.md  | 4 ++--
 sklearn_bench/df_regr.py | 4 ++--
 sklearn_bench/et_regr.py | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/sklearn_bench/README.md b/sklearn_bench/README.md
index 8bdc3c29e..a2353fd9c 100644
--- a/sklearn_bench/README.md
+++ b/sklearn_bench/README.md
@@ -77,7 +77,7 @@ You can launch benchmarks for each algorithm separately. The tables below list a
 
 | parameter Name  | Type | default value | description |
 | ----- | ---- |---- |---- |
-| criterion | str | mse | *mse* or *mae*. The function to measure the quality of a split |
+| criterion | str | squared_error | *squared_error* or *absolute_error*. The function to measure the quality of a split |
 | num-trees | int | 100 | The number of trees in the forest |
 | max-features | float_or_int | None | Upper bound on features used at each split |
 | max-depth | int | None | Upper bound on depth of constructed trees |
@@ -104,7 +104,7 @@ You can launch benchmarks for each algorithm separately. The tables below list a
 
 | parameter Name  | Type | default value | description |
 | ----- | ---- |---- |---- |
-| criterion | str | mse | *mse* or *mae*. The function to measure the quality of a split |
+| criterion | str | squared_error | *squared_error* or *absolute_error*. The function to measure the quality of a split |
 | num-trees | int | 100 | The number of trees in the forest |
 | max-features | float_or_int | None | Upper bound on features used at each split |
 | max-depth | int | None | Upper bound on depth of constructed trees |
diff --git a/sklearn_bench/df_regr.py b/sklearn_bench/df_regr.py
index 4c7491af3..baa5bb475 100644
--- a/sklearn_bench/df_regr.py
+++ b/sklearn_bench/df_regr.py
@@ -67,8 +67,8 @@ def main():
     parser = argparse.ArgumentParser(description='scikit-learn random forest '
                                      'regression benchmark')
 
-    parser.add_argument('--criterion', type=str, default='mse',
-                        choices=('mse', 'mae'),
+    parser.add_argument('--criterion', type=str, default='squared_error',
+                        choices=('squared_error', 'absolute_error'),
                         help='The function to measure the quality of a split')
     parser.add_argument('--num-trees', type=int, default=100,
                         help='Number of trees in the forest')
diff --git a/sklearn_bench/et_regr.py b/sklearn_bench/et_regr.py
index b9bcef691..29bcc8f02 100644
--- a/sklearn_bench/et_regr.py
+++ b/sklearn_bench/et_regr.py
@@ -67,8 +67,8 @@ def main():
     parser = argparse.ArgumentParser(description='scikit-learn extra trees '
                                      'regression benchmark')
 
-    parser.add_argument('--criterion', type=str, default='mse',
-                        choices=('mse', 'mae'),
+    parser.add_argument('--criterion', type=str, default='squared_error',
+                        choices=('squared_error', 'absolute_error'),
                         help='The function to measure the quality of a split')
     parser.add_argument('--num-trees', type=int, default=100,
                         help='Number of trees in the forest')