Skip to content

Commit c546e37

Browse files
Support incremental benchmarking of datasets larger than memory + final config/logic alignment (#180)
* Reduce config * Add covariance module to incremental config * Rename example config * Remove bs mentioning in config (need to be added later) * Fix num_batches and batch_size reading from config * Revert accidentally pushed changes * remove batch_size logic from incremental benchmarking for num_batches * Support incremental benchmarking of datasets larger than memory * black * fix logreg strong * align pca and knn bf16 configs * more knn alignment bf16 * minor followup --------- Co-authored-by: Kruglov, Oleg <oleg.kruglov@intel.com>
1 parent 2edb597 commit c546e37

File tree

6 files changed

+34
-60
lines changed

6 files changed

+34
-60
lines changed

configs/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,7 @@ Configs have the three highest parameter keys:
117117
|:---------------|:--------------|:--------|:------------|
118118
| `algorithm`:`estimator` | None | | Name of measured estimator. |
119119
| `algorithm`:`estimator_params` | Empty `dict` | | Parameters for estimator constructor. |
120+
| `algorithm`:`num_batches`:`training` | 5 | | Number of batches used to benchmark the `partial_fit` function; each batch contains the full number of samples specified (not the sample count divided by `num_batches`). For incremental estimators only. |
120121
| `algorithm`:`online_inference_mode` | False | | Enables online mode for inference methods of estimator (separate call for each sample). |
121122
| `algorithm`:`sklearn_context` | None | | Parameters for sklearn `config_context` used over estimator. |
122123
| `algorithm`:`sklearnex_context` | None | | Parameters for sklearnex `config_context` used over estimator. Updated by `sklearn_context` if set. |

configs/regular/bf16/knn.json

Lines changed: 3 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
"common knn parameters": {
55
"algorithm": {
66
"estimator_params": {
7-
"n_neighbors": [10, 100],
7+
"n_neighbors": 100,
88
"weights": "uniform"
99
}
1010
},
@@ -19,19 +19,10 @@
1919
"synthetic classification data": {
2020
"algorithm": {
2121
"estimator": "KNeighborsClassifier",
22-
"estimator_params": { "algorithm": "brute", "metric": "minkowski", "p": [1, 2] }
22+
"estimator_params": { "algorithm": "brute", "metric": "minkowski", "p": 2 }
2323
},
2424
"data": [
25-
{ "source": "make_classification", "split_kwargs": { "train_size": 5000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 5001000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }
26-
]
27-
},
28-
"synthetic regression data": {
29-
"algorithm": {
30-
"estimator": "KNeighborsRegressor",
31-
"estimator_params": { "algorithm": "brute", "metric": "minkowski", "p": [1, 2] }
32-
},
33-
"data": [
34-
{ "source": "make_regression", "split_kwargs": { "train_size": 5000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 5001000, "n_features": 100, "noise":1.5 } }
25+
{ "source": "make_classification", "split_kwargs": { "train_size": 50000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 51000, "n_features": 100, "n_classes": 2, "n_informative": "[SPECIAL_VALUE]0.5" } }
3526
]
3627
}
3728
},
@@ -43,14 +34,6 @@
4334
"sklearn knn parameters",
4435
"synthetic classification data"
4536
]
46-
},
47-
"sklearn brute knn reg": {
48-
"SETS": [
49-
"sklearn-ex[gpu] implementations",
50-
"common knn parameters",
51-
"sklearn knn parameters",
52-
"synthetic regression data"
53-
]
5437
}
5538
}
5639
}

configs/regular/bf16/pca.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
},
2121
"synthetic data": {
2222
"data": [
23-
{ "source": "make_blobs", "generation_kwargs": { "n_samples": 10000000, "n_features": 10, "centers": 1 } }
23+
{ "source": "make_blobs", "generation_kwargs": { "n_samples": 3000000, "n_features": 10, "centers": 1 } }
2424
]
2525
}
2626
},

configs/spmd/large_scale/logreg_strong.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
"logreg": {
2020
"SETS": [
2121
"sklearnex spmd implementation",
22-
"large scale strong 64 parameters",
22+
"large scale strong <=64 parameters",
2323
"spmd logreg parameters",
2424
"synthetic data",
2525
"spmd logreg2 parameters"

sklbench/benchmarks/sklearn_estimator.py

Lines changed: 26 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -324,41 +324,33 @@ def verify_patching(stream: io.StringIO, function_name) -> bool:
324324
return acceleration_lines > 0 and fallback_lines == 0
325325

326326

327-
def create_online_function(
328-
estimator_instance, method_instance, data_args, num_batches, batch_size
329-
):
327+
def create_online_function(estimator_instance, method_instance, data_args, num_batches):
330328

331329
if "y" in list(inspect.signature(method_instance).parameters):
332330

333331
def ndarray_function(x, y):
334332
for i in range(num_batches):
335-
method_instance(
336-
x[i * batch_size : (i + 1) * batch_size],
337-
y[i * batch_size : (i + 1) * batch_size],
338-
)
333+
method_instance(x, y)
339334
if hasattr(estimator_instance, "_onedal_finalize_fit"):
340335
estimator_instance._onedal_finalize_fit()
341336

342337
def dataframe_function(x, y):
343338
for i in range(num_batches):
344-
method_instance(
345-
x.iloc[i * batch_size : (i + 1) * batch_size],
346-
y.iloc[i * batch_size : (i + 1) * batch_size],
347-
)
339+
method_instance(x, y)
348340
if hasattr(estimator_instance, "_onedal_finalize_fit"):
349341
estimator_instance._onedal_finalize_fit()
350342

351343
else:
352344

353345
def ndarray_function(x):
354346
for i in range(num_batches):
355-
method_instance(x[i * batch_size : (i + 1) * batch_size])
347+
method_instance(x)
356348
if hasattr(estimator_instance, "_onedal_finalize_fit"):
357349
estimator_instance._onedal_finalize_fit()
358350

359351
def dataframe_function(x):
360352
for i in range(num_batches):
361-
method_instance(x.iloc[i * batch_size : (i + 1) * batch_size])
353+
method_instance(x)
362354
if hasattr(estimator_instance, "_onedal_finalize_fit"):
363355
estimator_instance._onedal_finalize_fit()
364356

@@ -413,28 +405,17 @@ def measure_sklearn_estimator(
413405
data_args = (x_train,)
414406
else:
415407
data_args = (x_test,)
408+
batch_size = get_bench_case_value(
409+
bench_case, f"algorithm:batch_size:{stage}"
410+
)
416411

417412
if method == "partial_fit":
418-
num_batches = get_bench_case_value(bench_case, "data:num_batches")
419-
batch_size = get_bench_case_value(bench_case, "data:batch_size")
420-
421-
if batch_size is None:
422-
if num_batches is None:
423-
num_batches = 5
424-
batch_size = (
425-
data_args[0].shape[0] + num_batches - 1
426-
) // num_batches
427-
if num_batches is None:
428-
num_batches = (
429-
data_args[0].shape[0] + batch_size - 1
430-
) // batch_size
413+
num_batches = get_bench_case_value(
414+
bench_case, f"algorithm:num_batches:{stage}", 5
415+
)
431416

432417
method_instance = create_online_function(
433-
estimator_instance,
434-
method_instance,
435-
data_args,
436-
num_batches,
437-
batch_size,
418+
estimator_instance, method_instance, data_args, num_batches
438419
)
439420
# daal4py model builders enabling branch
440421
if enable_modelbuilders and stage == "inference":
@@ -452,6 +433,10 @@ def measure_sklearn_estimator(
452433
metrics[method]["box filter mean[ms]"],
453434
metrics[method]["box filter std[ms]"],
454435
) = measure_case(bench_case, method_instance, *data_args)
436+
if batch_size is not None:
437+
metrics[method]["throughput[samples/ms]"] = (
438+
(data_args[0].shape[0] // batch_size) * batch_size
439+
) / metrics[method]["time[ms]"]
455440
if ensure_sklearnex_patching:
456441
full_method_name = f"{estimator_class.__name__}.{method}"
457442
sklearnex_logging_stream.seek(0)
@@ -559,9 +544,16 @@ def main(bench_case: BenchCase, filters: List[BenchCase]):
559544
for stage in estimator_methods.keys():
560545
data_descs[stage].update(
561546
{
562-
"batch_size": get_bench_case_value(
563-
bench_case, f"algorithm:batch_size:{stage}"
564-
)
547+
key: val
548+
for key, val in {
549+
"batch_size": get_bench_case_value(
550+
bench_case, f"algorithm:batch_size:{stage}"
551+
),
552+
"num_batches": get_bench_case_value(
553+
bench_case, f"algorithm:num_batches:{stage}"
554+
),
555+
}.items()
556+
if val is not None
565557
}
566558
)
567559
if "n_classes" in data_description:

sklbench/report/implementation.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,7 @@
9797
"order",
9898
"n_classes",
9999
"n_clusters",
100+
"num_batches",
100101
"batch_size",
101102
]
102103

@@ -262,10 +263,7 @@ def get_summary_from_df(df: pd.DataFrame, df_name: str) -> pd.DataFrame:
262263
# only relative improvements are included in summary currently
263264
if len(column) > 1 and column[1] == f"{metric_name} relative improvement":
264265
metric_columns.append(column)
265-
if metric_columns:
266-
summary = df[metric_columns].aggregate(geomean_wrapper, axis=0).to_frame().T
267-
else:
268-
summary = pd.DataFrame()
266+
summary = df[metric_columns].aggregate(geomean_wrapper, axis=0).to_frame().T
269267
summary.index = pd.Index([df_name])
270268
return summary
271269

0 commit comments

Comments
 (0)