From a2386c875e72508d4c7c43cedc34cfebfd0c62ba Mon Sep 17 00:00:00 2001
From: Alexander Andreev
Date: Thu, 31 Oct 2024 14:01:02 -0700
Subject: [PATCH] Initial support of user-provided datasets

---
 README.md                     |  2 +-
 sklbench/datasets/README.md   | 51 ++++++++++++++++++++++++-------
 sklbench/datasets/__init__.py | 15 ++++++++---
 sklbench/datasets/loaders.py  | 14 +++++++++-
 4 files changed, 67 insertions(+), 15 deletions(-)

diff --git a/README.md b/README.md
index 471dff2b..7a8c8078 100755
--- a/README.md
+++ b/README.md
@@ -100,6 +100,6 @@ flowchart TB
 - [Benchmarks Runner](sklbench/runner/README.md)
 - [Report Generator](sklbench/report/README.md)
 - [Benchmarks](sklbench/benchmarks/README.md)
-- [Data Processing](sklbench/datasets/README.md)
+- [Data Processing and Storage](sklbench/datasets/README.md)
 - [Emulators](sklbench/emulators/README.md)
 - [Developer Guide](docs/README.md)

diff --git a/sklbench/datasets/README.md b/sklbench/datasets/README.md
index 7f7cf9c2..8589a019 100644
--- a/sklbench/datasets/README.md
+++ b/sklbench/datasets/README.md
@@ -1,4 +1,4 @@
-# Data Handling in Benchmarks
+# Data Processing and Storage in Benchmarks
 
 Data handling steps:
 1. Load data:
@@ -7,6 +7,14 @@ Data handling steps:
 2. Split data into subsets if requested
 3. Convert to requested form (data type, format, order, etc.)
 
+Existing data sources:
+ - Synthetic data from sklearn
+ - OpenML datasets
+ - Custom loaders for named datasets
+ - User-provided datasets in a compatible format
+
+## Data Caching
+
 There are two levels of caching with corresponding directories: `raw cache` for files downloaded from external sources, and just `cache` for files suitable for fast loading in benchmarks.
 
 Each dataset has a few associated files in the usual `cache`: data component files (`x`, `y`, `weights`, etc.) and a JSON file with dataset properties (number of classes, clusters, default split arguments).
 
@@ -21,16 +29,39 @@ data_cache/
 ```
 
 Cached file formats:
-| Format | File extension | Associated Python types |
-| --- | --- | --- |
-| [Parquet](https://parquet.apache.org) | `.parq` | pandas.DataFrame |
-| Numpy uncompressed binary dense data | `.npz` | numpy.ndarray, pandas.Series |
-| Numpy uncompressed binary CSR data | `.csr.npz` | scipy.sparse.csr_matrix |
+| Format | File extension | Associated Python types | Comment |
+| --- | --- | --- | --- |
+| [Parquet](https://parquet.apache.org) | `.parq` | pandas.DataFrame | |
+| Numpy uncompressed binary dense data | `.npz` | numpy.ndarray, pandas.Series | Data is stored under the `arr_0` name |
+| Numpy uncompressed binary CSR data | `.csr.npz` | scipy.sparse.csr_matrix | Data is stored under the `data`, `indices` and `indptr` names |
 
-Existing data sources:
- - Synthetic data from sklearn
- - OpenML datasets
- - Custom loaders for named datasets
+## How to Modify a Dataset for Compatibility with Scikit-learn_bench
+
+To reuse an existing dataset in scikit-learn_bench, convert its file(s) into a format compatible with the dataset cache loader.
+
+A cached dataset consists of a few files:
+- a `{dataset name}.json` file, which stores required and optional dataset information
+- `{dataset name}_{data component name}.{data component extension}` files, which store dataset components (data, labels, etc.)
+
+Example of `{dataset name}.json`:
+```json
+{"n_classes": 2, "default_split": {"test_size": 0.2, "random_state": 11}}
+```
+
+The `n_classes` property in the dataset info file is *required* for classification datasets.
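+
+As a minimal sketch of producing these files (assuming a hypothetical binary classification dataset named `my_dataset` held in a pandas DataFrame `x` and a numpy array `y`; names and values here are illustrative only):
+```python
+import json
+import os
+
+import numpy as np
+import pandas as pd
+
+# Hypothetical in-memory dataset: 100 samples, 5 features, binary labels.
+x = pd.DataFrame(np.random.rand(100, 5), columns=[f"f{i}" for i in range(5)])
+y = np.random.randint(0, 2, size=100)
+
+os.makedirs("data_cache", exist_ok=True)
+# Component files follow the `{dataset name}_{data component name}` scheme.
+x.to_parquet("data_cache/my_dataset_x.parq")
+# A plain (uncompressed) .npz file stores the array under the default `arr_0` name.
+np.savez("data_cache/my_dataset_y.npz", y)
+
+# Dataset info file; `n_classes` is required for classification datasets.
+with open("data_cache/my_dataset.json", "w") as f:
+    json.dump({"n_classes": 2, "default_split": {"test_size": 0.2, "random_state": 11}}, f)
+```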
+
+Currently, `x` (data) and `y` (labels) are the only supported and *required* data components.
+
+A scikit-learn_bench-compatible dataset should be stored in the directory specified by `data:cache_directory` (`${PWD}/data_cache` or `{repository root}/data_cache` by default).
+
+You can then reference the dataset in config files by its name, the same way as datasets explicitly registered in scikit-learn_bench:
+```json
+{
+    "data": {
+        "dataset": "{dataset name}"
+    }
+}
+```
 
 ---
 [Documentation tree](../../README.md#-documentation)

diff --git a/sklbench/datasets/__init__.py b/sklbench/datasets/__init__.py
index be20420e..093875c4 100644
--- a/sklbench/datasets/__init__.py
+++ b/sklbench/datasets/__init__.py
@@ -22,6 +22,7 @@
 from ..utils.custom_types import BenchCase
 from .loaders import (
     dataset_loading_functions,
+    load_custom_data,
     load_openml_data,
     load_sklearn_synthetic_data,
 )
@@ -47,9 +48,17 @@ def load_data(bench_case: BenchCase) -> Tuple[Dict, Dict]:
     dataset = get_bench_case_value(bench_case, "data:dataset")
     if dataset is not None:
         dataset_params = get_bench_case_value(bench_case, "data:dataset_kwargs", dict())
-        return dataset_loading_functions[dataset](
-            **common_kwargs, preproc_kwargs=preproc_kwargs, dataset_params=dataset_params
-        )
+        if dataset in dataset_loading_functions:
+            # registered dataset loading branch
+            return dataset_loading_functions[dataset](
+                **common_kwargs,
+                preproc_kwargs=preproc_kwargs,
+                dataset_params=dataset_params,
+            )
+        else:
+            # user-provided dataset loading branch
+            return load_custom_data(**common_kwargs, preproc_kwargs=preproc_kwargs)
+
     # load by source
     source = get_bench_case_value(bench_case, "data:source")
     if source is not None:

diff --git a/sklbench/datasets/loaders.py b/sklbench/datasets/loaders.py
index 0cc915f0..20df75b2 100644
--- a/sklbench/datasets/loaders.py
+++ b/sklbench/datasets/loaders.py
@@ -29,7 +29,7 @@
     make_regression,
 )
 
-from .common import cache, preprocess
+from .common import cache, load_data_description, load_data_from_cache, preprocess
 from .downloaders import (
     download_and_read_csv,
     download_kaggle_files,
@@ -84,6 +84,18 @@ def load_sklearn_synthetic_data(
     return {"x": x, "y": y}, data_desc
 
 
+@preprocess
+def load_custom_data(
+    data_name: str,
+    data_cache: str,
+    raw_data_cache: str,
+):
+    """Load user-provided data stored in the scikit-learn_bench-compatible cache format."""
+    return load_data_from_cache(data_cache, data_name), load_data_description(
+        data_cache, data_name
+    )
+
+
 """
 Classification datasets
 """
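
For reference, a prepared cache can be sanity-checked outside the benchmark by reading the components back with the key names from the cached file formats table above. A minimal sketch, reusing the hypothetical `my_dataset` files from the earlier example (the `read_csr` helper is illustrative, not part of sklbench):
```python
import json

import numpy as np
import pandas as pd
import scipy.sparse as sp

# Dense components: Parquet for DataFrames, the `arr_0` key of a plain .npz file.
x = pd.read_parquet("data_cache/my_dataset_x.parq")
y = np.load("data_cache/my_dataset_y.npz")["arr_0"]

# Dataset properties come from the `{dataset name}.json` info file.
with open("data_cache/my_dataset.json") as f:
    desc = json.load(f)
assert desc["n_classes"] == len(np.unique(y))

# Sparse components (`.csr.npz`) store the `data`, `indices` and `indptr` arrays;
# the matrix shape is inferred here, which assumes no trailing all-zero columns.
def read_csr(path: str) -> sp.csr_matrix:
    with np.load(path) as npz:
        return sp.csr_matrix((npz["data"], npz["indices"], npz["indptr"]))
```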