Update CI and minor code rework

Alexsandruss · Alexsandruss · commit 1c6bd6693f3a · 2024-05-21T10:00:24.000-07:00
diff --git a/configs/regular/pca.json b/configs/regular/pca.json
@@ -46,7 +46,7 @@
     "TEMPLATES": {
         "sklearn pca": {
             "SETS": [
-                "sklearn-ex[preview] implementations",
+                "sklearn-ex[cpu,gpu] implementations",
                 "pca parameters",
                 "pca datasets"
             ]
diff --git a/configs/testing/ci.json b/configs/testing/ci.json
@@ -0,0 +1,119 @@
+{
+    "INCLUDE": ["../common/sklearn.json"],
+    "PARAMETERS_SETS": {
+        "common parameters": {
+            "data": {
+                "format": ["numpy", "pandas"],
+                "dtype": ["float32", "float64"],
+                "order": ["C", "F"],
+                "split_kwargs": {
+                    "train_size": 400,
+                    "test_size": 100,
+                    "shuffle": true,
+                    "random_state": 42
+                },
+                "preprocessing_kwargs": {
+                    "normalize": true
+                }
+            },
+            "bench": { "n_runs": 5 },
+            "algorithm": { "device": "default" }
+        },
+        "datasets": { 
+            "data":
+            [
+                {
+                    "source": "fetch_openml",
+                    "id": 1430
+                },
+                {
+                    "source": "make_classification",
+                    "generation_kwargs": {
+                        "n_classes": 2,
+                        "n_samples": 500,
+                        "n_features": 16,
+                        "n_informative": "[SPECIAL_VALUE]0.5"
+                    }
+                }
+            ]
+        },
+        "algorithms": [
+            {
+                "algorithm": {
+                    "estimator": "DBSCAN",
+                    "estimator_params": { "algorithm": "brute" }
+                }
+            },
+            {
+                "algorithm": {
+                    "estimator": "KMeans",
+                    "estimator_params": { "init": "random", "algorithm": "lloyd" }
+                }
+            },
+            {
+                "algorithm": {
+                    "estimator": "PCA",
+                    "estimator_params": { "svd_solver": "full" }
+                }
+            },
+            {
+                "algorithm": {
+                    "estimator": "TSNE"
+                }
+            },
+            {
+                "algorithm": {
+                    "estimator": [
+                        "RandomForestClassifier", "ExtraTreesClassifier",
+                        "RandomForestRegressor", "ExtraTreesRegressor"
+                    ],
+                    "estimator_params": { "n_estimators": 20 }
+                }
+            },
+            {
+                "algorithm": {
+                    "estimator": [
+                        "KNeighborsClassifier", "KNeighborsRegressor"
+                    ],
+                    "estimator_params": { "algorithm": ["brute", "kd_tree"] }
+                }
+            },
+            {
+                "algorithm": {
+                    "estimator": ["LinearRegression", "Ridge", "Lasso", "ElasticNet"]
+                }
+            },
+            {
+                "algorithm": {
+                    "estimator": ["SVC", "SVR"]
+                }
+            },
+            {
+                "algorithm": {
+                    "estimator": ["NuSVC", "NuSVR"],
+                    "estimator_params": { "nu": 0.1 }
+                }
+            },
+            {
+                "algorithm": {
+                    "function": "train_test_split",
+                    "args_order": "x_train|y_train",
+                    "kwargs": {
+                        "random_state": 42,
+                        "shuffle": true
+                    }
+                }
+            }
+        ]
+    },
+    "TEMPLATES": {
+        "test": {
+            "SETS": [
+                "common parameters",
+                "datasets",
+                "sklearn-ex[cpu] implementations",
+                "algorithms"
+            ]
+        }
+    }
+}
diff --git a/envs/requirements-sklearn.txt b/envs/requirements-sklearn.txt
@@ -12,3 +12,6 @@ xgboost
 catboost
 lightgbm
 scikit-learn-intelex
+# oneapi components
+dpctl
+dpnp
diff --git a/sklbench/benchmarks/sklearn_estimator.py b/sklbench/benchmarks/sklearn_estimator.py
@@ -40,7 +40,7 @@
 from ..datasets import load_data
 from ..datasets.transformer import split_and_transform_data
 from ..utils.bench_case import get_bench_case_value, get_data_name
-from ..utils.common import convert_to_ndarray, custom_format, get_module_members
+from ..utils.common import convert_to_numpy, custom_format, get_module_members
 from ..utils.config import bench_case_filter
 from ..utils.custom_types import BenchCase, Numeric, NumpyNumeric
 from ..utils.logger import logger
@@ -121,7 +121,7 @@ def get_subset_metrics_of_estimator(
     metrics = dict()
     # Note: use data[0, 1] when calling estimator methods,
     # x, y are numpy ndarrays for compatibility with sklearn metrics
-    x, y = list(map(convert_to_ndarray, data))
+    x, y = list(map(convert_to_numpy, data))
     if stage == "training":
         if hasattr(estimator_instance, "n_iter_"):
             iterations = estimator_instance.n_iter_
@@ -134,7 +134,7 @@ def get_subset_metrics_of_estimator(
             ):
                 metrics.update({"iterations": int(iterations[0])})
     if task == "classification":
-        y_pred = convert_to_ndarray(estimator_instance.predict(x))
+        y_pred = convert_to_numpy(estimator_instance.predict(x))
         metrics.update(
             {
                 "accuracy": float(accuracy_score(y, y_pred)),
@@ -145,7 +145,7 @@ def get_subset_metrics_of_estimator(
             hasattr(estimator_instance, "probability")
             and getattr(estimator_instance, "probability") == False
         ):
-            y_pred_proba = convert_to_ndarray(estimator_instance.predict_proba(x))
+            y_pred_proba = convert_to_numpy(estimator_instance.predict_proba(x))
             metrics.update(
                 {
                     "ROC AUC": float(
@@ -163,7 +163,7 @@ def get_subset_metrics_of_estimator(
                 }
             )
     elif task == "regression":
-        y_pred = convert_to_ndarray(estimator_instance.predict(x))
+        y_pred = convert_to_numpy(estimator_instance.predict(x))
         metrics.update(
             {
                 "RMSE": float(mean_squared_error(y, y_pred) ** 0.5),
@@ -194,16 +194,14 @@ def get_subset_metrics_of_estimator(
                 {
                     "inertia": float(
                         np.power(
-                            convert_to_ndarray(estimator_instance.transform(x)).min(
-                                axis=1
-                            ),
+                            convert_to_numpy(estimator_instance.transform(x)).min(axis=1),
                             2,
                         ).sum()
                     )
                 }
             )
         if hasattr(estimator_instance, "predict"):
-            y_pred = convert_to_ndarray(estimator_instance.predict(x))
+            y_pred = convert_to_numpy(estimator_instance.predict(x))
             metrics.update(
                 {
                     "Davies-Bouldin score": float(davies_bouldin_score(x, y_pred)),
@@ -212,7 +210,7 @@ def get_subset_metrics_of_estimator(
                 }
             )
         if "DBSCAN" in str(estimator_instance) and stage == "training":
-            labels = convert_to_ndarray(estimator_instance.labels_)
+            labels = convert_to_numpy(estimator_instance.labels_)
             clusters = len(np.unique(labels[labels != -1]))
             metrics.update({"clusters": clusters})
             if clusters > 1:
@@ -245,7 +243,7 @@ def get_subset_metrics_of_estimator(
             ground_truth_neighbors = _brute_knn.kneighbors(
                 x, recall_degree, return_distance=False
             )
-            predicted_neighbors = convert_to_ndarray(
+            predicted_neighbors = convert_to_numpy(
                 estimator_instance.kneighbors(
                     data[0], recall_degree, return_distance=False
                 )
diff --git a/sklbench/datasets/transformer.py b/sklbench/datasets/transformer.py
@@ -15,6 +15,7 @@
 # ===============================================================================
 
 import os
+
 import numpy as np
 import pandas as pd
 from scipy.sparse import csr_matrix
diff --git a/sklbench/emulators/common/__init__.py b/sklbench/emulators/common/__init__.py
@@ -16,5 +16,4 @@
 
 from .neighbors import NearestNeighborsBase
 
-
 __all__ = ["NearestNeighborsBase"]
diff --git a/sklbench/emulators/common/neighbors.py b/sklbench/emulators/common/neighbors.py
@@ -16,6 +16,7 @@
 
 
 from warnings import warn
+
 import numpy as np
 
 
diff --git a/sklbench/emulators/faiss/__init__.py b/sklbench/emulators/faiss/__init__.py
@@ -16,5 +16,4 @@
 
 from .neighbors import NearestNeighbors
 
-
 __all__ = ["NearestNeighbors"]
diff --git a/sklbench/emulators/faiss/neighbors.py b/sklbench/emulators/faiss/neighbors.py
@@ -16,6 +16,7 @@
 
 
 import faiss
+
 from ..common import NearestNeighborsBase
 
 
diff --git a/sklbench/emulators/raft/__init__.py b/sklbench/emulators/raft/__init__.py
@@ -16,5 +16,4 @@
 
 from .neighbors import NearestNeighbors
 
-
 __all__ = ["NearestNeighbors"]
diff --git a/sklbench/emulators/raft/neighbors.py b/sklbench/emulators/raft/neighbors.py
@@ -16,7 +16,8 @@
 
 import cupy as cp
 from pylibraft.common import DeviceResources
-from pylibraft.neighbors import brute_force, ivf_flat, ivf_pq, cagra
+from pylibraft.neighbors import brute_force, cagra, ivf_flat, ivf_pq
+
 from ..common import NearestNeighborsBase
 
 
diff --git a/sklbench/emulators/svs/__init__.py b/sklbench/emulators/svs/__init__.py
@@ -16,5 +16,4 @@
 
 from .neighbors import NearestNeighbors
 
-
 __all__ = ["NearestNeighbors"]
diff --git a/sklbench/emulators/svs/neighbors.py b/sklbench/emulators/svs/neighbors.py
@@ -16,6 +16,7 @@
 
 import pysvs
 from psutil import cpu_count
+
 from ..common.neighbors import NearestNeighborsBase
 
 
diff --git a/sklbench/report/implementation.py b/sklbench/report/implementation.py
@@ -360,3 +360,4 @@ def generate_report(args: argparse.Namespace):
     # remove default sheet
     wb.remove(wb["Sheet"])
     wb.save(args.report_file)
+    return 0
diff --git a/sklbench/runner/arguments.py b/sklbench/runner/arguments.py
@@ -22,32 +22,31 @@
 from ..report import add_report_generator_arguments
 
 
-def get_argument_actions(parser):
-    arg_actions = []
-
-    for action in parser._actions:
-        if isinstance(action, argparse._ArgumentGroup):
-            for subaction in action._group_actions:
-                arg_actions.append(subaction)
-        else:
-            arg_actions.append(action)
-    return arg_actions
+def get_parser_description(parser: argparse.ArgumentParser):
+    """Convert parser description to Markdown-style table."""
 
+    def get_argument_actions(parser):
+        arg_actions = []
 
-def parse_action(action: argparse.Action) -> Dict:
-    return {
-        "Name": "</br>".join(map(lambda x: f"`{x}`", action.option_strings)),
-        "Type": action.type.__name__ if action.type is not None else None,
-        "Default value": (
-            action.default if action.default is not argparse.SUPPRESS else None
-        ),
-        "Choices": action.choices,
-        "Description": action.help,
-    }
+        for action in parser._actions:
+            if isinstance(action, argparse._ArgumentGroup):
+                for subaction in action._group_actions:
+                    arg_actions.append(subaction)
+            else:
+                arg_actions.append(action)
+        return arg_actions
 
+    def parse_action(action: argparse.Action) -> Dict:
+        return {
+            "Name": "</br>".join(map(lambda x: f"`{x}`", action.option_strings)),
+            "Type": action.type.__name__ if action.type is not None else None,
+            "Default value": (
+                action.default if action.default is not argparse.SUPPRESS else None
+            ),
+            "Choices": action.choices,
+            "Description": action.help,
+        }
 
-def get_parser_description(parser: argparse.ArgumentParser):
-    """Convert parser description to Markdown-style table."""
     return pd.DataFrame(map(parse_action, get_argument_actions(parser))).to_markdown(
         index=False
     )
@@ -76,7 +75,7 @@ def add_runner_arguments(parser: argparse.ArgumentParser) -> argparse.ArgumentPa
         type=str,
         choices=("ERROR", "WARNING", "INFO", "DEBUG"),
         help="Global logging level for benchmarks: "
-        "overwrites runner, bench and report log levels.",
+        "overwrites runner, bench and report logging levels.",
     )
     # benchmarking cases finding, overwriting and filtering
     parser.add_argument(
@@ -106,7 +105,7 @@ def add_runner_arguments(parser: argparse.ArgumentParser) -> argparse.ArgumentPa
         default="",
         type=str,
         nargs="+",
-        help="Filters benchmark bench_cases by config parameters. "
+        help="Filters benchmarking cases by parameter values. "
         "For example: `-f data:dtype=float32 data:order=F`.",
     )
 
@@ -120,6 +119,7 @@ def add_runner_arguments(parser: argparse.ArgumentParser) -> argparse.ArgumentPa
     parser.add_argument(
         "--environment-alias",
         "--env-alias",
+        "-e",
         type=str,
         default=None,
         help="Environment alias to use instead of it's configuration hash.",
diff --git a/sklbench/runner/benchmark_commands.py b/sklbench/runner/benchmark_commands.py
@@ -87,15 +87,16 @@ def run_benchmark_from_case(
     bench_case: BenchCase, filters: List[BenchCase], log_level: str
 ) -> Tuple[int, List[Dict]]:
     command = generate_benchmark_command(bench_case, filters, log_level)
-    logger.debug(f"Benchmark wrapper call command: {command}")
+    logger.debug(f"Benchmark wrapper call command:\n{command}")
     return_code, stdout, stderr = read_output_from_command(command)
 
     # filter stdout warnings
+    prefixes_to_skip = ["[W]", "[I]"]
     stdout = "\n".join(
         [
             line
             for line in stdout.split("\n")
-            if not (line.startswith("[W]") or line.startswith("[I]"))
+            if not any(map(lambda x: line.startswith(x), prefixes_to_skip))
         ]
     )
 
diff --git a/sklbench/runner/implementation.py b/sklbench/runner/implementation.py
diff --git a/sklbench/utils/common.py b/sklbench/utils/common.py
diff --git a/sklbench/utils/special_params.py b/sklbench/utils/special_params.py
diff --git a/test-configuration.yml b/test-configuration.yml

Original file line number	Diff line number	Diff line change
`@@ -46,7 +46,7 @@`
`46`	`46`	`"TEMPLATES": {`
`47`	`47`	`"sklearn pca": {`
`48`	`48`	`"SETS": [`
`49`		`- "sklearn-ex[preview] implementations",`
	`49`	`+ "sklearn-ex[cpu,gpu] implementations",`
`50`	`50`	`"pca parameters",`
`51`	`51`	`"pca datasets"`
`52`	`52`	`]`
Original file line number	Diff line number	Diff line change
`@@ -16,5 +16,4 @@`
`16`	`16`
`17`	`17`	`from .neighbors import NearestNeighborsBase`
`18`	`18`
`19`		`-`
`20`	`19`	`__all__ = ["NearestNeighborsBase"]`
Original file line number	Diff line number	Diff line change
`@@ -16,6 +16,7 @@`
`16`	`16`
`17`	`17`
`18`	`18`	`from warnings import warn`
	`19`	`+`
`19`	`20`	`import numpy as np`
`20`	`21`
`21`	`22`
Original file line number	Diff line number	Diff line change
`@@ -87,15 +87,16 @@ def run_benchmark_from_case(`
`87`	`87`	`bench_case: BenchCase, filters: List[BenchCase], log_level: str`
`88`	`88`	`) -> Tuple[int, List[Dict]]:`
`89`	`89`	`command = generate_benchmark_command(bench_case, filters, log_level)`
`90`		`- logger.debug(f"Benchmark wrapper call command: {command}")`
	`90`	`+ logger.debug(f"Benchmark wrapper call command:\n{command}")`
`91`	`91`	`return_code, stdout, stderr = read_output_from_command(command)`
`92`	`92`
`93`	`93`	`# filter stdout warnings`
	`94`	`+ prefixes_to_skip = ["[W]", "[I]"]`
`94`	`95`	`stdout = "\n".join(`
`95`	`96`	`[`
`96`	`97`	`line`
`97`	`98`	`for line in stdout.split("\n")`
`98`		`- if not (line.startswith("[W]") or line.startswith("[I]"))`
	`99`	`+ if not any(map(lambda x: line.startswith(x), prefixes_to_skip))`
`99`	`100`	`]`
`100`	`101`	`)`
`101`	`102`