Skip to content

Commit c546e37

Browse files
Support incremental benchmarking of datasets larger than memory + final config/logic alignment (#180)
* Reduce config * Add covariance module to incremental config * Rename example config * Remove bs mentioning in config (need to be added later) * Fix num_batches and batch_size reading from config * Revert accidentally pushed changes * remove batch_size logic from incremental benchmarking for num_batches * Support incremental benchmarking of datasets larger than memory * black * fix logreg strong * align pca and knn bf16 configs * more knn alignment bf16 * minor followup --------- Co-authored-by: Kruglov, Oleg <oleg.kruglov@intel.com>
1 parent 2edb597 commit c546e37

File tree

6 files changed

+34
-60
lines changed

6 files changed

+34
-60
lines changed

configs/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,7 @@ Configs have the three highest parameter keys:
117117
|:---------------|:--------------|:--------|:------------|
118118
| `algorithm`:`estimator` | None | | Name of measured estimator. |
119119
| `algorithm`:`estimator_params` | Empty `dict` | | Parameters for estimator constructor. |
120+
| `algorithm`:`num_batches`:`training` | 5 | | Number of batches used to benchmark the `partial_fit` function; each batch contains the full number of samples specified (not the sample count divided by `num_batches`). For incremental estimators only. |
120121
| `algorithm`:`online_inference_mode` | False | | Enables online mode for inference methods of estimator (separate call for each sample). |
121122
| `algorithm`:`sklearn_context` | None | | Parameters for sklearn `config_context` used over estimator. |
122123
| `algorithm`:`sklearnex_context` | None | | Parameters for sklearnex `config_context` used over estimator. Updated by `sklearn_context` if set. |

configs/regular/bf16/knn.json

Lines changed: 3 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
"common knn parameters": {
55
"algorithm": {
66
"estimator_params": {
7-
"n_neighbors": [10, 100],
7+
"n_neighbors": 100,
88
"weights": "uniform"
99
}
1010
},
@@ -19,19 +19,10 @@
1919
"synthetic classification data": {
2020
"algorithm": {
2121
"estimator": "KNeighborsClassifier",
22-
"estimator_params": { "algorithm": "brute", "metric": "minkowski", "p": [1, 2] }
22+
"estimator_params": { "algorithm": "brute", "metric": "minkowski", "p": 2 }
2323
},
2424
"data": [
25-
{ "source": "make_classification", "split_kwargs": { "train_size": 5000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 5001000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }
26-
]
27-
},
28-
"synthetic regression data": {
29-
"algorithm": {
30-
"estimator": "KNeighborsRegressor",
31-
"estimator_params": { "algorithm": "brute", "metric": "minkowski", "p": [1, 2] }
32-
},
33-
"data": [
34-
{ "source": "make_regression", "split_kwargs": { "train_size": 5000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 5001000, "n_features": 100, "noise":1.5 } }
25+
{ "source": "make_classification", "split_kwargs": { "train_size": 50000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 51000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }
3526
]
3627
}
3728
},
@@ -43,14 +34,6 @@
4334
"sklearn knn parameters",
4435
"synthetic classification data"
4536
]
46-
},
47-
"sklearn brute knn reg": {
48-
"SETS": [
49-
"sklearn-ex[gpu] implementations",
50-
"common knn parameters",
51-
"sklearn knn parameters",
52-
"synthetic regression data"
53-
]
5437
}
5538
}
5639
}

configs/regular/bf16/pca.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
},
2121
"synthetic data": {
2222
"data": [
23-
{ "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "centers": 1 } }
23+
{ "source": "make_blobs", "generation_kwargs": { "n_samples": 3000000, "n_features": 10, "centers": 1 } }
2424
]
2525
}
2626
},

configs/spmd/large_scale/logreg_strong.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
"logreg": {
2020
"SETS": [
2121
"sklearnex spmd implementation",
22-
"large scale strong 64 parameters",
22+
"large scale strong <=64 parameters",
2323
"spmd logreg parameters",
2424
"synthetic data",
2525
"spmd logreg2 parameters"

sklbench/benchmarks/sklearn_estimator.py

Lines changed: 26 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -324,41 +324,33 @@ def verify_patching(stream: io.StringIO, function_name) -> bool:
324324
return acceleration_lines > 0 and fallback_lines == 0
325325

326326

327-
def create_online_function(
328-
estimator_instance, method_instance, data_args, num_batches, batch_size
329-
):
327+
def create_online_function(estimator_instance, method_instance, data_args, num_batches):
330328

331329
if "y" in list(inspect.signature(method_instance).parameters):
332330

333331
def ndarray_function(x, y):
334332
for i in range(num_batches):
335-
method_instance(
336-
x[i * batch_size : (i + 1) * batch_size],
337-
y[i * batch_size : (i + 1) * batch_size],
338-
)
333+
method_instance(x, y)
339334
if hasattr(estimator_instance, "_onedal_finalize_fit"):
340335
estimator_instance._onedal_finalize_fit()
341336

342337
def dataframe_function(x, y):
343338
for i in range(num_batches):
344-
method_instance(
345-
x.iloc[i * batch_size : (i + 1) * batch_size],
346-
y.iloc[i * batch_size : (i + 1) * batch_size],
347-
)
339+
method_instance(x, y)
348340
if hasattr(estimator_instance, "_onedal_finalize_fit"):
349341
estimator_instance._onedal_finalize_fit()
350342

351343
else:
352344

353345
def ndarray_function(x):
354346
for i in range(num_batches):
355-
method_instance(x[i * batch_size : (i + 1) * batch_size])
347+
method_instance(x)
356348
if hasattr(estimator_instance, "_onedal_finalize_fit"):
357349
estimator_instance._onedal_finalize_fit()
358350

359351
def dataframe_function(x):
360352
for i in range(num_batches):
361-
method_instance(x.iloc[i * batch_size : (i + 1) * batch_size])
353+
method_instance(x)
362354
if hasattr(estimator_instance, "_onedal_finalize_fit"):
363355
estimator_instance._onedal_finalize_fit()
364356

@@ -413,28 +405,17 @@ def measure_sklearn_estimator(
413405
data_args = (x_train,)
414406
else:
415407
data_args = (x_test,)
408+
batch_size = get_bench_case_value(
409+
bench_case, f"algorithm:batch_size:{stage}"
410+
)
416411

417412
if method == "partial_fit":
418-
num_batches = get_bench_case_value(bench_case, "data:num_batches")
419-
batch_size = get_bench_case_value(bench_case, "data:batch_size")
420-
421-
if batch_size is None:
422-
if num_batches is None:
423-
num_batches = 5
424-
batch_size = (
425-
data_args[0].shape[0] + num_batches - 1
426-
) // num_batches
427-
if num_batches is None:
428-
num_batches = (
429-
data_args[0].shape[0] + batch_size - 1
430-
) // batch_size
413+
num_batches = get_bench_case_value(
414+
bench_case, f"algorithm:num_batches:{stage}", 5
415+
)
431416

432417
method_instance = create_online_function(
433-
estimator_instance,
434-
method_instance,
435-
data_args,
436-
num_batches,
437-
batch_size,
418+
estimator_instance, method_instance, data_args, num_batches
438419
)
439420
# daal4py model builders enabling branch
440421
if enable_modelbuilders and stage == "inference":
@@ -452,6 +433,10 @@ def measure_sklearn_estimator(
452433
metrics[method]["box filter mean[ms]"],
453434
metrics[method]["box filter std[ms]"],
454435
) = measure_case(bench_case, method_instance, *data_args)
436+
if batch_size is not None:
437+
metrics[method]["throughput[samples/ms]"] = (
438+
(data_args[0].shape[0] // batch_size) * batch_size
439+
) / metrics[method]["time[ms]"]
455440
if ensure_sklearnex_patching:
456441
full_method_name = f"{estimator_class.__name__}.{method}"
457442
sklearnex_logging_stream.seek(0)
@@ -559,9 +544,16 @@ def main(bench_case: BenchCase, filters: List[BenchCase]):
559544
for stage in estimator_methods.keys():
560545
data_descs[stage].update(
561546
{
562-
"batch_size": get_bench_case_value(
563-
bench_case, f"algorithm:batch_size:{stage}"
564-
)
547+
key: val
548+
for key, val in {
549+
"batch_size": get_bench_case_value(
550+
bench_case, f"algorithm:batch_size:{stage}"
551+
),
552+
"num_batches": get_bench_case_value(
553+
bench_case, f"algorithm:num_batches:{stage}"
554+
),
555+
}.items()
556+
if val is not None
565557
}
566558
)
567559
if "n_classes" in data_description:

sklbench/report/implementation.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,7 @@
9797
"order",
9898
"n_classes",
9999
"n_clusters",
100+
"num_batches",
100101
"batch_size",
101102
]
102103

@@ -262,10 +263,7 @@ def get_summary_from_df(df: pd.DataFrame, df_name: str) -> pd.DataFrame:
262263
# only relative improvements are included in summary currently
263264
if len(column) > 1 and column[1] == f"{metric_name} relative improvement":
264265
metric_columns.append(column)
265-
if metric_columns:
266-
summary = df[metric_columns].aggregate(geomean_wrapper, axis=0).to_frame().T
267-
else:
268-
summary = pd.DataFrame()
266+
summary = df[metric_columns].aggregate(geomean_wrapper, axis=0).to_frame().T
269267
summary.index = pd.Index([df_name])
270268
return summary
271269

0 commit comments

Comments
 (0)