From a2386c875e72508d4c7c43cedc34cfebfd0c62ba Mon Sep 17 00:00:00 2001
From: Alexander Andreev
Date: Thu, 31 Oct 2024 14:01:02 -0700
Subject: [PATCH] Initial support of user-provided datasets

---
 README.md                     |  2 +-
 sklbench/datasets/README.md   | 51 ++++++++++++++++++++++++-------
 sklbench/datasets/__init__.py | 15 ++++++++---
 sklbench/datasets/loaders.py  | 14 +++++++++-
 4 files changed, 67 insertions(+), 15 deletions(-)

diff --git a/README.md b/README.md
index 471dff2b..7a8c8078 100755
--- a/README.md
+++ b/README.md
@@ -100,6 +100,6 @@ flowchart TB
 - [Benchmarks Runner](sklbench/runner/README.md)
 - [Report Generator](sklbench/report/README.md)
 - [Benchmarks](sklbench/benchmarks/README.md)
-- [Data Processing](sklbench/datasets/README.md)
+- [Data Processing and Storage](sklbench/datasets/README.md)
 - [Emulators](sklbench/emulators/README.md)
 - [Developer Guide](docs/README.md)

diff --git a/sklbench/datasets/README.md b/sklbench/datasets/README.md
index 7f7cf9c2..8589a019 100644
--- a/sklbench/datasets/README.md
+++ b/sklbench/datasets/README.md
@@ -1,4 +1,4 @@
-# Data Handling in Benchmarks
+# Data Processing and Storage in Benchmarks
 
 Data handling steps:
 1. Load data:
@@ -7,6 +7,14 @@ Data handling steps:
 2. Split data into subsets if requested
 3. Convert to requested form (data type, format, order, etc.)
 
+Existing data sources:
+ - Synthetic data from sklearn
+ - OpenML datasets
+ - Custom loaders for named datasets
+ - User-provided datasets in a compatible format
+
+## Data Caching
+
 There are two levels of caching with corresponding directories: `raw cache` for files downloaded from external sources, and just `cache` for files suitable for fast loading in benchmarks.
 
 Each dataset has a few associated files in the usual `cache`: data component files (`x`, `y`, `weights`, etc.) and a JSON file with dataset properties (number of classes, clusters, default split arguments).
 
@@ -21,16 +29,39 @@ data_cache/
 ```
 
 Cached file formats:
-| Format | File extension | Associated Python types |
-| --- | --- | --- |
-| [Parquet](https://parquet.apache.org) | `.parq` | pandas.DataFrame |
-| Numpy uncompressed binary dense data | `.npz` | numpy.ndarray, pandas.Series |
-| Numpy uncompressed binary CSR data | `.csr.npz` | scipy.sparse.csr_matrix |
+| Format | File extension | Associated Python types | Comment |
+| --- | --- | --- | --- |
+| [Parquet](https://parquet.apache.org) | `.parq` | pandas.DataFrame | |
+| Numpy uncompressed binary dense data | `.npz` | numpy.ndarray, pandas.Series | Data is stored under the `arr_0` name |
+| Numpy uncompressed binary CSR data | `.csr.npz` | scipy.sparse.csr_matrix | Data is stored under the `data`, `indices` and `indptr` names |
 
-Existing data sources:
- - Synthetic data from sklearn
- - OpenML datasets
- - Custom loaders for named datasets
+## How to Modify a Dataset for Compatibility with Scikit-learn_bench
+
+To reuse an existing dataset in scikit-learn_bench, convert its file(s) into a format compatible with the dataset cache loader.
+
+A cached dataset consists of a few files:
+- a `{dataset name}.json` file, which stores required and optional dataset information
+- `{dataset name}_{data component name}.{data component extension}` files, which store dataset components (data, labels, etc.)
+
+Example of `{dataset name}.json`:
+```json
+{"n_classes": 2, "default_split": {"test_size": 0.2, "random_state": 11}}
+```
+
+The `n_classes` property in the dataset info file is *required* for classification datasets.
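+
+As a minimal sketch of producing these files (assuming a hypothetical binary classification dataset named `my_dataset` held in a pandas DataFrame `x` and a numpy array `y`; names and values here are illustrative only):
+```python
+import json
+import os
+
+import numpy as np
+import pandas as pd
+
+# Hypothetical in-memory dataset: 100 samples, 5 features, binary labels.
+x = pd.DataFrame(np.random.rand(100, 5), columns=[f"f{i}" for i in range(5)])
+y = np.random.randint(0, 2, size=100)
+
+os.makedirs("data_cache", exist_ok=True)
+# Component files follow the `{dataset name}_{data component name}` scheme.
+x.to_parquet("data_cache/my_dataset_x.parq")
+# A plain (uncompressed) .npz file stores the array under the default `arr_0` name.
+np.savez("data_cache/my_dataset_y.npz", y)
+
+# Dataset info file; `n_classes` is required for classification datasets.
+with open("data_cache/my_dataset.json", "w") as f:
+    json.dump({"n_classes": 2, "default_split": {"test_size": 0.2, "random_state": 11}}, f)
+```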
+
+Currently, `x` (data) and `y` (labels) are the only supported and *required* data components.
+
+A scikit-learn_bench-compatible dataset should be stored in the directory specified by `data:cache_directory` (`${PWD}/data_cache` or `{repository root}/data_cache` by default).
+
+You can then reference the dataset in config files by its name, the same way as datasets explicitly registered in scikit-learn_bench:
+```json
+{
+    "data": {
+        "dataset": "{dataset name}"
+    }
+}
+```
 
 ---
 [Documentation tree](../../README.md#-documentation)

diff --git a/sklbench/datasets/__init__.py b/sklbench/datasets/__init__.py
index be20420e..093875c4 100644
--- a/sklbench/datasets/__init__.py
+++ b/sklbench/datasets/__init__.py
@@ -22,6 +22,7 @@
 from ..utils.custom_types import BenchCase
 from .loaders import (
     dataset_loading_functions,
+    load_custom_data,
     load_openml_data,
     load_sklearn_synthetic_data,
 )
@@ -47,9 +48,17 @@ def load_data(bench_case: BenchCase) -> Tuple[Dict, Dict]:
     dataset = get_bench_case_value(bench_case, "data:dataset")
     if dataset is not None:
         dataset_params = get_bench_case_value(bench_case, "data:dataset_kwargs", dict())
-        return dataset_loading_functions[dataset](
-            **common_kwargs, preproc_kwargs=preproc_kwargs, dataset_params=dataset_params
-        )
+        if dataset in dataset_loading_functions:
+            # registered dataset loading branch
+            return dataset_loading_functions[dataset](
+                **common_kwargs,
+                preproc_kwargs=preproc_kwargs,
+                dataset_params=dataset_params,
+            )
+        else:
+            # user-provided dataset loading branch
+            return load_custom_data(**common_kwargs, preproc_kwargs=preproc_kwargs)
+
     # load by source
     source = get_bench_case_value(bench_case, "data:source")
     if source is not None:

diff --git a/sklbench/datasets/loaders.py b/sklbench/datasets/loaders.py
index 0cc915f0..20df75b2 100644
--- a/sklbench/datasets/loaders.py
+++ b/sklbench/datasets/loaders.py
@@ -29,7 +29,7 @@
     make_regression,
 )
 
-from .common import cache, preprocess
+from .common import cache, load_data_description, load_data_from_cache, preprocess
 from .downloaders import (
     download_and_read_csv,
     download_kaggle_files,
@@ -84,6 +84,18 @@ def load_sklearn_synthetic_data(
     return {"x": x, "y": y}, data_desc
 
 
+@preprocess
+def load_custom_data(
+    data_name: str,
+    data_cache: str,
+    raw_data_cache: str,
+):
+    """Load user-provided data stored in the scikit-learn_bench-compatible cache format."""
+    return load_data_from_cache(data_cache, data_name), load_data_description(
+        data_cache, data_name
+    )
+
+
 """
 Classification datasets
 """
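
For reference, a prepared cache can be sanity-checked outside the benchmark by reading the components back with the key names from the cached file formats table above. A minimal sketch, reusing the hypothetical `my_dataset` files from the earlier example (the `read_csr` helper is illustrative, not part of sklbench):
```python
import json

import numpy as np
import pandas as pd
import scipy.sparse as sp

# Dense components: Parquet for DataFrames, the `arr_0` key of a plain .npz file.
x = pd.read_parquet("data_cache/my_dataset_x.parq")
y = np.load("data_cache/my_dataset_y.npz")["arr_0"]

# Dataset properties come from the `{dataset name}.json` info file.
with open("data_cache/my_dataset.json") as f:
    desc = json.load(f)
assert desc["n_classes"] == len(np.unique(y))

# Sparse components (`.csr.npz`) store the `data`, `indices` and `indptr` arrays;
# the matrix shape is inferred here, which assumes no trailing all-zero columns.
def read_csr(path: str) -> sp.csr_matrix:
    with np.load(path) as npz:
        return sp.csr_matrix((npz["data"], npz["indices"], npz["indptr"]))
```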