diff --git a/README.md b/README.md
index 5eef0bc2e..e1b2e94d8 100755
--- a/README.md
+++ b/README.md
@@ -118,12 +118,35 @@ The configuration of benchmarks allows you to select the frameworks to run, sele
 |**[GradientBoostingClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html)**|gbt|:x:|:x:|:x:|:x:|:white_check_mark:|
 |**[GradientBoostingRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html)**|gbt|:x:|:x:|:x:|:x:|:white_check_mark:|
 
-### Scikit-learn benchmakrs
+### Scikit-learn benchmarks
 
 When you run scikit-learn benchmarks on CPU, [Intel(R) Extension for Scikit-learn](https://github.com/intel/scikit-learn-intelex) is used by default. Use the ``--no-intel-optimized`` option to run the benchmarks without the extension.
 
 For the algorithms with both CPU and GPU support, you may use the same [configuration file](https://github.com/IntelPython/scikit-learn_bench/blob/master/configs/skl_xpu_config.json) to run the scikit-learn benchmarks on CPU and GPU.
 
+
+## Downloading Data
+
+All of the datasets required by a benchmark can be downloaded separately, ahead of time.
+
+Doing so considerably speeds up the overall execution and avoids any networking
+issues that might occur during the experiments.
+
+To download selected public datasets included in the benchmark, please run:
+
+```bash
+DATASETSROOT=/path/to/local/data/folder python -m datasets.load_datasets -d dataset_name_1 dataset_name_2 ...
+```
+
+Alternatively, it is also possible to automatically download all datasets referenced
+in benchmark configuration files:
+
+```bash
+DATASETSROOT=/path/to/local/data/folder python -m datasets.load_datasets -c config_1.json config_2.json ...
+```
+
+Please refer to the [documentation](./datasets/README.md) for further instructions.
+
 ## Algorithm parameters
 
 You can launch benchmarks for each algorithm separately.
diff --git a/datasets/README.md b/datasets/README.md
new file mode 100644
index 000000000..92862212c
--- /dev/null
+++ b/datasets/README.md
@@ -0,0 +1,43 @@
+# Download Datasets for scikit-learn_bench
+
+To download selected public datasets included in the benchmark, please run the following command:
+
+```bash
+DATASETSROOT=/path/to/local/download/directory python -m datasets.load_datasets -d dataset_name_1 dataset_name_2 ...
+```
+
+The script relies on the `DATASETSROOT` environment variable to indicate the local path where
+datasets will be automatically downloaded.
+
+You can alternatively export this variable in your shell environment **before** running the script:
+
+```bash
+export DATASETSROOT=/path/to/download/directory
+```
+
+## Important Note
+
+Please **do not** run the `load_datasets` script from within the `datasets` folder, as this will not work
+due to issues with relative imports.
+
+Please execute the `load_datasets` script directly from the _main_ folder, using the [`-m`](https://docs.python.org/3/using/cmdline.html#cmdoption-m) option of the Python interpreter.
+
+## List of available datasets
+
+To list all the datasets included in the benchmark, please use the `--list` option:
+
+```bash
+python -m datasets.load_datasets --list
+```
+
+## Download datasets included in configuration files
+
+It is also possible to gather the list of dataset(s) to download directly from
+benchmark configuration files by using the `--configs` (`-c`) option:
+
+```bash
+DATASETSROOT=/path/to/download/dir python -m datasets.load_datasets -c config_1.json config_2.json ...
+```
+
+This method overrides the `-d` option, and it is highly recommended when
+running multiple benchmark experiments.
\ No newline at end of file
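For reference, the configuration files passed to `-c` are expected to contain a `cases` list whose entries reference datasets by name; this is the structure that `extract_dataset_names` in `datasets/load_datasets.py` below walks through. A minimal sketch of such a file (the dataset name and `source` values here are purely illustrative, not taken from a real config):

```python
# Minimal illustrative benchmark config for the `-c` option. Only the
# structure matters: entries that have a "name" and a non-synthetic
# "source" are collected for download; everything else is skipped.
import json

sample_config = {
    "cases": [
        {
            "dataset": [
                {"source": "npy", "name": "higgs1m"},  # collected for download
                {"source": "synthetic"},               # skipped: generated on the fly
            ]
        }
    ]
}

with open("config_1.json", "w") as f:
    json.dump(sample_config, f, indent=4)

# `python -m datasets.load_datasets -c config_1.json` would then request
# only the "higgs1m" dataset.
```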
diff --git a/datasets/load_datasets.py b/datasets/load_datasets.py
index a4e4f4f4c..39139a1b8 100644
--- a/datasets/load_datasets.py
+++ b/datasets/load_datasets.py
@@ -18,23 +18,71 @@
 import logging
 import os
 import sys
+import json
 from pathlib import Path
 from typing import Callable, Dict
 
-from .loader_classification import (a_nine_a, airline, airline_ohe, bosch,
-                                    census, cifar_binary, codrnanorm, covtype_binary, creditcard,
-                                    epsilon, epsilon_16K, epsilon_30K, epsilon_80K, epsilon_100K,
-                                    fraud, gisette, hepmass_150K,
-                                    higgs, higgs_one_m, higgs_150K, ijcnn, klaverjas,
-                                    santander, skin_segmentation, susy)
-from .loader_multiclass import (cifar_10, connect, covertype, covtype, letters, mlsr,
-                                mnist, msrank, plasticc, sensit)
-from .loader_regression import (abalone, california_housing, fried, higgs_10500K,
-                                medical_charges_nominal, mortgage_first_q,
-                                twodplanes, year_prediction_msd, yolanda, airline_regression)
-from .loader_clustering import (cifar_cluster, epsilon_50K_cluster, higgs_one_m_clustering,
-                                hepmass_1M_cluster, hepmass_10K_cluster, mnist_10K_cluster,
-                                road_network_20K_cluster, susy_cluster)
+from .loader_classification import (
+    a_nine_a,
+    airline,
+    airline_ohe,
+    bosch,
+    census,
+    cifar_binary,
+    codrnanorm,
+    covtype_binary,
+    creditcard,
+    epsilon,
+    epsilon_16K,
+    epsilon_30K,
+    epsilon_80K,
+    epsilon_100K,
+    fraud,
+    gisette,
+    hepmass_150K,
+    higgs,
+    higgs_one_m,
+    higgs_150K,
+    ijcnn,
+    klaverjas,
+    santander,
+    skin_segmentation,
+    susy,
+)
+from .loader_multiclass import (
+    cifar_10,
+    connect,
+    covertype,
+    covtype,
+    letters,
+    mlsr,
+    mnist,
+    msrank,
+    plasticc,
+    sensit,
+)
+from .loader_regression import (
+    abalone,
+    california_housing,
+    fried,
+    higgs_10500K,
+    medical_charges_nominal,
+    mortgage_first_q,
+    twodplanes,
+    year_prediction_msd,
+    yolanda,
+    airline_regression,
+)
+from .loader_clustering import (
+    cifar_cluster,
+    epsilon_50K_cluster,
+    higgs_one_m_clustering,
+    hepmass_1M_cluster,
+    hepmass_10K_cluster,
+    mnist_10K_cluster,
+    road_network_20K_cluster,
+    susy_cluster,
+)
 
 dataset_loaders: Dict[str, Callable[[Path], bool]] = {
     "a9a": a_nine_a,
@@ -101,19 +149,57 @@ def try_load_dataset(dataset_name: str, output_directory: Path) -> bool:
         logging.warning(f"Internal error loading dataset:\n{ex}")
         return False
     else:
-        logging.warning(f"There is no script to download the dataset: {dataset_name}. "
-                        "You need to add a dataset or script to load it.")
+        logging.warning(
+            f"There is no script to download the dataset: {dataset_name}. "
+            "You need to add a dataset or script to load it."
+        )
         return False
 
 
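As the warning above suggests, a dataset without a registered loader must have one added before it can be downloaded. A minimal sketch of what registering a custom loader might look like, based on the `Dict[str, Callable[[Path], bool]]` signature of `dataset_loaders` (the dataset name and fetch logic here are hypothetical; real loaders live in the `datasets/loader_*.py` modules imported above):

```python
# Hypothetical loader registration for a new dataset.
from pathlib import Path


def my_dataset(dataset_dir: Path) -> bool:
    """Fetch the (hypothetical) 'my_dataset' files into dataset_dir."""
    dataset_dir.mkdir(parents=True, exist_ok=True)
    # ... download and write the dataset files under dataset_dir here ...
    return True  # report success so the caller can print a confirmation


dataset_loaders["my_dataset"] = my_dataset
```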
-if __name__ == '__main__':
+def extract_dataset_names(config_file: str) -> set[str]:
+    with open(config_file) as json_config_file:
+        experiment = json.load(json_config_file)
+
+    if "cases" not in experiment:
+        return set()
+
+    datasets = list()
+    for case in experiment["cases"]:
+        if "dataset" not in case:
+            continue
+        for ds in case["dataset"]:
+            if ds["source"] == "synthetic" or "name" not in ds:
+                continue
+            datasets.append(ds["name"])
+    return set(datasets)  # remove duplicates
+
+
+if __name__ == "__main__":
     parser = argparse.ArgumentParser(
-        description='Use \'-d\' or \'--datasets\' option to enumerate '
-                    'dataset(s) that should be downloaded')
-    parser.add_argument('-l', '--list', action='store_const',
-                        const=True, help='The list of available datasets')
-    parser.add_argument('-d', '--datasets', type=str, nargs='*',
-                        help='The datasets that should be downloaded.')
+        description="Utility to download selected publicly available datasets "
+        "included in the benchmark."
+    )
+    parser.add_argument(
+        "-l",
+        "--list",
+        action="store_const",
+        const=True,
+        help="The list of available datasets",
+    )
+    parser.add_argument(
+        "-d",
+        "--datasets",
+        type=str,
+        nargs="*",
+        help="The datasets that should be downloaded.",
+    )
+    parser.add_argument(
+        "-c",
+        "--configs",
+        type=str,
+        nargs="*",
+        help="Benchmark configuration file(s) from which to gather the dataset name(s) to download.",
+    )
     args = parser.parse_args()
 
     if args.list:
@@ -121,11 +207,25 @@ def try_load_dataset(dataset_name: str, output_directory: Path) -> bool:
             print(key)
         sys.exit(0)
 
-    root_dir = Path(os.environ['DATASETSROOT'])
+    root_dir = Path(os.environ["DATASETSROOT"])
 
-    if args.datasets is not None:
-        for val in dataset_loaders.values():
-            val(root_dir)
+    if args.datasets is None and args.configs is None:
+        logging.warning("No datasets specified: use '-d' to enumerate dataset(s) or '-c' to pass configuration file(s)")
     else:
-        logging.warning(
-            'Warning: Enumerate dataset(s) that should be downloaded')
+        if args.configs:
+            print(f"Dataset name(s) to download will be gathered from: {args.configs}")
+            ds_names: set[str] = set()
+            for config_file in args.configs:
+                ds_names = ds_names.union(extract_dataset_names(config_file))
+        else:
+            ds_names = set(args.datasets)
+        print(
+            f"{len(ds_names)} dataset{'s' if len(ds_names) > 1 else ''} requested for download"
+        )
+        print(f"Download location: {root_dir}")
+
+        for i, name in enumerate(ds_names):
+            print(f'{i+1}. Dataset "{name}"')
+            downloaded = try_load_dataset(name, root_dir)
+            if downloaded:
+                print(f'Dataset "{name}" successfully downloaded.')
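Finally, as a usage note, the documented CLI can also be driven from another Python script; a minimal sketch, with placeholder paths, that mirrors the shell invocation from the README:

```python
# Programmatic equivalent of:
#   DATASETSROOT=/path/to/local/data/folder python -m datasets.load_datasets -c config_1.json
import os
import subprocess
import sys

env = dict(os.environ, DATASETSROOT="/path/to/local/data/folder")  # placeholder path
subprocess.run(
    [sys.executable, "-m", "datasets.load_datasets", "-c", "config_1.json"],
    check=True,  # fail loudly if the downloader exits with an error
    env=env,
)
```

Running it through `subprocess` from the repository root preserves the `-m` module invocation that the relative imports in `datasets/load_datasets.py` require.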