From 77ce6b5b128aa40ceca966f284878b3578b97ad6 Mon Sep 17 00:00:00 2001 From: Valerio Maggio Date: Thu, 16 Feb 2023 15:47:49 +0000 Subject: [PATCH 01/10] FIX to download selected datasets from CLI This fixes a bug in the script that was completely ignoring the values passed in via CL option. Moreover, extra checks and warning logs/messages are now included regarding what's going on with dataset download. This would be nice to have when downloading various datasets at once. --- datasets/load_datasets.py | 145 ++++++++++++++++++++++++++++++-------- 1 file changed, 117 insertions(+), 28 deletions(-) diff --git a/datasets/load_datasets.py b/datasets/load_datasets.py index a4e4f4f4c..aac54d99a 100644 --- a/datasets/load_datasets.py +++ b/datasets/load_datasets.py @@ -21,20 +21,67 @@ from pathlib import Path from typing import Callable, Dict -from .loader_classification import (a_nine_a, airline, airline_ohe, bosch, - census, cifar_binary, codrnanorm, covtype_binary, creditcard, - epsilon, epsilon_16K, epsilon_30K, epsilon_80K, epsilon_100K, - fraud, gisette, hepmass_150K, - higgs, higgs_one_m, higgs_150K, ijcnn, klaverjas, - santander, skin_segmentation, susy) -from .loader_multiclass import (cifar_10, connect, covertype, covtype, letters, mlsr, - mnist, msrank, plasticc, sensit) -from .loader_regression import (abalone, california_housing, fried, higgs_10500K, - medical_charges_nominal, mortgage_first_q, - twodplanes, year_prediction_msd, yolanda, airline_regression) -from .loader_clustering import (cifar_cluster, epsilon_50K_cluster, higgs_one_m_clustering, - hepmass_1M_cluster, hepmass_10K_cluster, mnist_10K_cluster, - road_network_20K_cluster, susy_cluster) +from .loader_classification import ( + a_nine_a, + airline, + airline_ohe, + bosch, + census, + cifar_binary, + codrnanorm, + covtype_binary, + creditcard, + epsilon, + epsilon_16K, + epsilon_30K, + epsilon_80K, + epsilon_100K, + fraud, + gisette, + hepmass_150K, + higgs, + higgs_one_m, + higgs_150K, + ijcnn, + klaverjas, + santander, + skin_segmentation, + susy, +) +from .loader_multiclass import ( + cifar_10, + connect, + covertype, + covtype, + letters, + mlsr, + mnist, + msrank, + plasticc, + sensit, +) +from .loader_regression import ( + abalone, + california_housing, + fried, + higgs_10500K, + medical_charges_nominal, + mortgage_first_q, + twodplanes, + year_prediction_msd, + yolanda, + airline_regression, +) +from .loader_clustering import ( + cifar_cluster, + epsilon_50K_cluster, + higgs_one_m_clustering, + hepmass_1M_cluster, + hepmass_10K_cluster, + mnist_10K_cluster, + road_network_20K_cluster, + susy_cluster, +) dataset_loaders: Dict[str, Callable[[Path], bool]] = { "a9a": a_nine_a, @@ -101,19 +148,32 @@ def try_load_dataset(dataset_name: str, output_directory: Path) -> bool: logging.warning(f"Internal error loading dataset:\n{ex}") return False else: - logging.warning(f"There is no script to download the dataset: {dataset_name}. " - "You need to add a dataset or script to load it.") + logging.warning( + f"There is no script to download the dataset: {dataset_name}. " + "You need to add a dataset or script to load it." 
+ ) return False -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser( - description='Use \'-d\' or \'--datasets\' option to enumerate ' - 'dataset(s) that should be downloaded') - parser.add_argument('-l', '--list', action='store_const', - const=True, help='The list of available datasets') - parser.add_argument('-d', '--datasets', type=str, nargs='*', - help='The datasets that should be downloaded.') + description="Use '-d' or '--datasets' option to enumerate " + "dataset(s) that should be downloaded" + ) + parser.add_argument( + "-l", + "--list", + action="store_const", + const=True, + help="The list of available datasets", + ) + parser.add_argument( + "-d", + "--datasets", + type=str, + nargs="*", + help="The datasets that should be downloaded.", + ) args = parser.parse_args() if args.list: @@ -121,11 +181,40 @@ def try_load_dataset(dataset_name: str, output_directory: Path) -> bool: print(key) sys.exit(0) - root_dir = Path(os.environ['DATASETSROOT']) + root_dir = Path(os.environ["DATASETSROOT"]) if args.datasets is not None: - for val in dataset_loaders.values(): - val(root_dir) + matched_datasets = set( + filter(lambda name: name in dataset_loaders, args.datasets) + ) + unmatched_datasets = set(args.datasets).difference(matched_datasets) + + print( + '%d dataset%s will be downloaded in "%s": ' + % ( + len(matched_datasets), + "s" if len(matched_datasets) > 1 else "", + root_dir, + ) + ) + for i, name in enumerate(matched_datasets): + try: + print('%d. downloading dataset "%s"' % (i + 1, name)) + dataset_loaders[name](root_dir) + except Exception as e: + logging.warning( + 'An error occurr while downloading dataset "%s". Please check.' + % name + ) + logging.warning(str(e)) + else: + logging.info('Dataset "%s" successfully downloaded.' % name) + + if len(unmatched_datasets): + logging.warning( + "Warning: The following dataset names have not been recognized: " + % unmatched_datasets + ) + else: - logging.warning( - 'Warning: Enumerate dataset(s) that should be downloaded') + logging.warning("Warning: Enumerate dataset(s) that should be downloaded") From 39ed884ac93b8a22a5ddc1f6b5575298ff383d4c Mon Sep 17 00:00:00 2001 From: Valerio Maggio Date: Thu, 16 Feb 2023 16:19:18 +0000 Subject: [PATCH 02/10] simplified log msgs and reused pre-exst function Simplified report messages, and re-used pre-existing load function (not reinventing the wheel). --- datasets/load_datasets.py | 37 +++++++++---------------------------- 1 file changed, 9 insertions(+), 28 deletions(-) diff --git a/datasets/load_datasets.py b/datasets/load_datasets.py index aac54d99a..0a81e40b8 100644 --- a/datasets/load_datasets.py +++ b/datasets/load_datasets.py @@ -184,37 +184,18 @@ def try_load_dataset(dataset_name: str, output_directory: Path) -> bool: root_dir = Path(os.environ["DATASETSROOT"]) if args.datasets is not None: - matched_datasets = set( - filter(lambda name: name in dataset_loaders, args.datasets) - ) - unmatched_datasets = set(args.datasets).difference(matched_datasets) - + ds_names = set(args.datasets) print( - '%d dataset%s will be downloaded in "%s": ' - % ( - len(matched_datasets), - "s" if len(matched_datasets) > 1 else "", - root_dir, - ) + "%d dataset%s have been requested for download: %s" + % (len(ds_names), "s" if len(ds_names) > 1 else "", ds_names) ) - for i, name in enumerate(matched_datasets): - try: - print('%d. 
downloading dataset "%s"' % (i + 1, name))
-                dataset_loaders[name](root_dir)
-            except Exception as e:
-                logging.warning(
-                    'An error occurr while downloading dataset "%s". Please check.'
-                    % name
-                )
-                logging.warning(str(e))
-            else:
-                logging.info('Dataset "%s" successfully downloaded.' % name)
+        print("Download location: %s" % root_dir)
 
-        if len(unmatched_datasets):
-            logging.warning(
-                "Warning: The following dataset names have not been recognized: "
-                % unmatched_datasets
-            )
+        for i, name in enumerate(ds_names):
+            print('%d. Dataset "%s"' % (i + 1, name))
+            downloaded = try_load_dataset(name, root_dir)
+            if downloaded:
+                print('Dataset "%s" successfully downloaded.' % name)
 
     else:
         logging.warning("Warning: Enumerate dataset(s) that should be downloaded")

From 41e6197e52454ff4967708c25f905b0bbe05be52 Mon Sep 17 00:00:00 2001
From: Valerio Maggio
Date: Thu, 16 Feb 2023 17:01:38 +0000
Subject: [PATCH 03/10] Switched to f-string and improved CLI description

Improved the description of the script shown when it is executed with the
`--help` option. Moreover, I realised that the reference Python version is
3.9 (also included in the Azure pipeline config) and f-strings are already
used elsewhere. Therefore, all string formatting now uses f-strings.

---
 datasets/load_datasets.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/datasets/load_datasets.py b/datasets/load_datasets.py
index 0a81e40b8..5c33c6ebd 100644
--- a/datasets/load_datasets.py
+++ b/datasets/load_datasets.py
@@ -157,8 +157,8 @@ def try_load_dataset(dataset_name: str, output_directory: Path) -> bool:
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
-        description="Use '-d' or '--datasets' option to enumerate "
-        "dataset(s) that should be downloaded"
+        description="Utility to download selected datasets included in the benchmark. "
+        "Use '-d' or '--datasets' option to enumerate dataset(s) that should be downloaded."
     )
     parser.add_argument(
         "-l",
@@ -186,16 +186,15 @@ def try_load_dataset(dataset_name: str, output_directory: Path) -> bool:
     if args.datasets is not None:
         ds_names = set(args.datasets)
         print(
-            "%d dataset%s have been requested for download: %s"
-            % (len(ds_names), "s" if len(ds_names) > 1 else "", ds_names)
+            f"{len(ds_names)} dataset{'s' if len(ds_names) > 1 else ''} requested for download: {ds_names}"
        )
-        print("Download location: %s" % root_dir)
+        print(f"Download location: {root_dir}")
 
         for i, name in enumerate(ds_names):
-            print('%d. Dataset "%s"' % (i + 1, name))
+            print(f'{i+1}. Dataset "{name}"')
             downloaded = try_load_dataset(name, root_dir)
             if downloaded:
-                print('Dataset "%s" successfully downloaded.' % name)
+                print(f'Dataset "{name}" successfully downloaded.')
 
     else:
         logging.warning("Warning: Enumerate dataset(s) that should be downloaded")

From 433077085e154e28f72733a53e3acbf31265c9c2 Mon Sep 17 00:00:00 2001
From: Valerio Maggio
Date: Thu, 16 Feb 2023 17:56:06 +0000
Subject: [PATCH 04/10] New utility to collect DS names from json config

New utility script that collects the names of the public datasets used in
input benchmark configuration files. This script can be used to gather all
the dataset names that need to be downloaded to run a specific set of
experiments.
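For illustration, a minimal sketch (not taken from the repository: the configuration layout is assumed from the parsing logic in the script below, and the dataset and source names are hypothetical) of the kind of input the utility expects and the names it would collect:

```python
# Minimal sketch: a benchmark configuration shaped the way the collector
# expects ("cases" -> "dataset" -> entries with "source"/"name").
# Dataset names and "source" values here are hypothetical examples.
sample_config = {
    "cases": [
        {"dataset": [{"source": "npy", "name": "higgs1m"}]},      # named public dataset: collected
        {"dataset": [{"source": "npy", "name": "airline-ohe"}]},  # named public dataset: collected
        {"algorithm": "kmeans"},                                   # no "dataset" key: skipped
    ]
}

# Simplified equivalent of what the script aims to do for a single file:
# keep only dataset entries that carry a public name (skipping synthetic
# data) and de-duplicate the result.
names = {
    ds["name"]
    for case in sample_config.get("cases", [])
    for ds in case.get("dataset", [])
    if ds.get("source") != "synthetic" and "name" in ds
}
print(sorted(names))  # ['airline-ohe', 'higgs1m']
```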
--- datasets/collect_dataset_names.py | 77 +++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 datasets/collect_dataset_names.py diff --git a/datasets/collect_dataset_names.py b/datasets/collect_dataset_names.py new file mode 100644 index 000000000..ffa942962 --- /dev/null +++ b/datasets/collect_dataset_names.py @@ -0,0 +1,77 @@ +# =============================================================================== +# Copyright 2020-2021 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# =============================================================================== + +import argparse +import logging +import json +import sys + +def extract_public_dataset_names(exp_filepath: str) -> set[str]: + with open(exp_filepath) as json_config_file: + experiment = json.load(json_config_file) + if not "cases" in experiment: + return [] + dataset_names = list() + for case in experiment["cases"]: + if "dataset" not in case: + continue + for ds in case["dataset"]: + if ds["source"] == "synthethic" or "name" not in ds: + continue + dataset_names.append(ds["name"]) + return set(dataset_names) # remove duplicates + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Utility to gather the list of public dataset names included in benchmark configuration file(s)." + ) + parser.add_argument( + "-f", + "--files", + type=str, + nargs="*", + help="Benchmark configuration file(s).", + ) + parser.add_argument( + "-o", + "--output", + type=str, + help="Writes collected dataset names into an output file. One name per line.", + default="" + ) + args = parser.parse_args() + + if not args.files: + print("Error: Missing input benchmark configuration file(s) to analyze.") + sys.exit(-1) + + names = set() + for config_file in args.files: + names = names.union(extract_public_dataset_names(config_file)) + + if len(names): + print(f"Found {len(names)} dataset(s)") + if args.output: + with open(args.output, "w") as output_file: + for name in names: + output_file.write(f"{name}\n") + print(f"Saved in {args.output}") + else: + for name in names: + print(f"{name}") + else: + logging.error("Warning: No public dataset found in input benchmark file(s).") \ No newline at end of file From 2cc2baf8ea933900de301841a2c9a2d8845b2716 Mon Sep 17 00:00:00 2001 From: Valerio Maggio Date: Thu, 16 Feb 2023 17:58:23 +0000 Subject: [PATCH 05/10] New documentation for datasets utilities New documentation readme with instructions on how to use utilities included in the `datasets` package. This doc file adopts a similar structure as in other README doc files used for other packages. 
---
 datasets/README.md | 65 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 65 insertions(+)
 create mode 100644 datasets/README.md

diff --git a/datasets/README.md b/datasets/README.md
new file mode 100644
index 000000000..b90bb11ca
--- /dev/null
+++ b/datasets/README.md
@@ -0,0 +1,65 @@
+# Public datasets included in scikit-learn_bench
+
+## Download datasets
+
+To download the selected public datasets included in the benchmark, please run the following command:
+
+```bash
+DATASETSROOT=/path/to/local/data/directory python -m datasets.load_datasets -d 
+```
+
+### Important Notes
+
+1. [Download location](#download-location)
+2. [Execution mode](#execution-mode)
+
+#### Download location
+
+The script relies on a `DATASETSROOT` environment variable to indicate the local path where
+datasets will be automatically downloaded.
+
+You can set this variable directly via command line when launching the script (see example above).
+
+Alternatively, you can set this variable in your environment **before** running the script:
+
+```shell
+export DATASETSROOT=/path/to/local/data/directory
+```
+
+#### Execution Mode
+
+Please run the `load_datasets` script from the **main** root directory of the `scikit-learn_bench` benchmark.
+
+Executing the script directly from the `datasets` folder will not work due to issues with relative imports.
+
+## List of available datasets
+
+To access the list of all the datasets included in the benchmark, please use the `--list` option:
+
+```bash
+python -m datasets.load_datasets --list
+```
+
+## Collect dataset names used in experiments
+
+It is possible to gather the list of public datasets used in benchmark
+experiments.
+This list can be later used as input to the `load_datasets` script to download
+all the data required to run selected benchmarks.
+
+To collect the names of the datasets included in benchmark configuration file(s), please run:
+
+```shell
+python collect_dataset_names.py -f config_1.json config_2.json ...
+```
+
+The list of dataset name(s) found will be printed on standard output.
+
+Alternatively, please use the `--output` (`-o`) option to specify the path of the output file
+where this list will be printed instead:
+
+```shell
+python collect_dataset_names.py -f config_1.json config_2.json ... -o dataset_names.txt
+```
+
+ 

From fd481b58c495f5b5644d5e3e8cdd6d4b601b355a Mon Sep 17 00:00:00 2001
From: Valerio Maggio
Date: Thu, 16 Feb 2023 17:59:34 +0000
Subject: [PATCH 06/10] Added section on Data download

---
 README.md | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 5eef0bc2e..fab4e0163 100755
--- a/README.md
+++ b/README.md
@@ -118,12 +118,38 @@ The configuration of benchmarks allows you to select the frameworks to run, sele
 |**[GradientBoostingClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html)**|gbt|:x:|:x:|:x:|:x:|:white_check_mark:|
 |**[GradientBoostingRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html)**|gbt|:x:|:x:|:x:|:x:|:white_check_mark:|
 
-### Scikit-learn benchmakrs
+### Scikit-learn benchmarks
 
 When you run scikit-learn benchmarks on CPU, [Intel(R) Extension for Scikit-learn](https://github.com/intel/scikit-learn-intelex) is used by default. Use the ``--no-intel-optimized`` option to run the benchmarks without the extension.
For the algorithms with both CPU and GPU support, you may use the same [configuration file](https://github.com/IntelPython/scikit-learn_bench/blob/master/configs/skl_xpu_config.json) to run the scikit-learn benchmarks on CPU and GPU.
+
+## Downloading Data
+
+It is possible to download all the datasets required by a benchmark separately, in advance.
+
+This can considerably speed up the overall execution, while also avoiding network
+issues that might occur during the experiments.
+
+To download the public datasets included in the benchmark, please run:
+
+```bash
+DATASETSROOT=/path/to/local/data/folder python -m datasets.load_datasets -d 
+```
+
+### Gather dataset information from benchmark files
+
+To extract the list of public dataset names used in benchmark configurations, please run:
+
+```shell
+
+python ./datasets/collect_dataset_names.py -f config_1.json config_2.json ...
+```
+
+Please refer to [documentation](./datasets/README.md) for further instructions.
+
+
 
 ## Algorithm parameters
 
 You can launch benchmarks for each algorithm separately.

From 11c1847b9a06441eb3ce35c1442d5e9af63115c9 Mon Sep 17 00:00:00 2001
From: Valerio Maggio
Date: Thu, 16 Feb 2023 19:01:55 +0000
Subject: [PATCH 07/10] Merged dataset extraction in download script

The load_datasets utility script has been extended with an extra option to
download the datasets automatically extracted from input configuration
files. This avoids having multiple scripts that must be executed in a
specific order, while still keeping the flexibility to download datasets
manually.

Documentation has been updated and improved to reflect this change.

---
 README.md                         | 10 ++--
 datasets/README.md                | 52 ++++++---------------
 datasets/collect_dataset_names.py | 77 ------------------------------
 datasets/load_datasets.py         | 47 +++++++++++++++----
 4 files changed, 58 insertions(+), 128 deletions(-)
 delete mode 100644 datasets/collect_dataset_names.py

diff --git a/README.md b/README.md
index fab4e0163..e1b2e94d8 100755
--- a/README.md
+++ b/README.md
@@ -138,17 +138,15 @@
 To download the public datasets included in the benchmark, please run:
 
 ```bash
 DATASETSROOT=/path/to/local/data/folder python -m datasets.load_datasets -d 
 ```
 
-### Gather dataset information from benchmark files
-
-To extract the list of public dataset names used in benchmark configurations, please run:
+Alternatively, it is also possible to automatically download all datasets included
+in benchmark configuration files:
 
 ```shell
 
-python ./datasets/collect_dataset_names.py -f config_1.json config_2.json ...
+DATASETSROOT=/path/to/local/data/folder python -m datasets.load_datasets -c config_1.json config_2.json ...
 ```
 
-Please refer to [documentation](./datasets/README.md) for further instructions.
-
+Please refer to the [documentation](./datasets/README.md) for further instructions.

diff --git a/datasets/README.md b/datasets/README.md
index b90bb11ca..92862212c 100644
--- a/datasets/README.md
+++ b/datasets/README.md
@@ -1,36 +1,26 @@
-# Public datasets included in scikit-learn_bench
-
-## Download datasets
+# Download Datasets for scikit-learn_bench
 
 To download the selected public datasets included in the benchmark, please run the following command:
 
 ```bash
-DATASETSROOT=/path/to/local/data/directory python -m datasets.load_datasets -d 
+DATASETSROOT=/path/to/local/download/directory python -m datasets.load_datasets -d 
 ```
 
-### Important Notes
-
-1. [Download location](#download-location)
-2. 
[Execution mode](#execution-mode)
-
-#### Download location
-
-The script relies on a `DATASETSROOT` environment variable to indicate the local path where
+The script relies on a `DATASETSROOT` environment variable, to indicate the local path where
 datasets will be automatically downloaded.
 
-You can set this variable directly via command line when launching the script (see example above).
-
-Alternatively, you can set this variable in your environment **before** running the script:
+You can alternatively export this variable in your SHELL environment **before** running the script:
 
 ```shell
-export DATASETSROOT=/path/to/local/data/directory
+export DATASETSROOT=/path/to/download/directory
 ```
+## Important Note
 
-#### Execution Mode
+Please **do not** run the `load_datasets` script from within the `datasets` folder. This will not work
+due to issues with relative imports.
 
-Please run the `load_datasets` script from the **main** root directory of the `scikit-learn_bench` benchmark.
+Please execute the `load_datasets` script directly from the _main_ folder, using the [`-m`](https://docs.python.org/3/using/cmdline.html#cmdoption-m) option with the Python interpreter.
 
-Executing the script directly from the `datasets` folder will not work due to issues with relative imports.
 
 ## List of available datasets
 
@@ -40,26 +30,14 @@ To access the list of all the datasets included in the benchmark, please use the
 python -m datasets.load_datasets --list
 ```
 
-## Collect dataset names used in experiments
-
-It is possible to gather the list of public datasets used in benchmark
-experiments.
-This list can be later used as input to the `load_datasets` script to download
-all the data required to run selected benchmarks.
+## Download datasets included in configuration files
 
-To collect the names of the datasets included in benchmark configuration file(s), please run:
+It is also possible to gather the list of dataset(s) to download directly from
+benchmark configuration files by using the `--configs` (`-c`) option:
 
 ```shell
-python collect_dataset_names.py -f config_1.json config_2.json ...
+DATASETSROOT=/path/to/download/dir python -m datasets.load_datasets -c config_1.json config_2.json ...
 ```
 
-The list of dataset name(s) found will be printed on standard output.
-
-Alternatively, please use the `--output` (`-o`) option to specify the path of the output file
-where this list will be printed instead:
-
-```shell
-python collect_dataset_names.py -f config_1.json config_2.json ... -o dataset_names.txt
-```
-
- 
+This method will override the `-d` option, and it is highly recommended when
+running multiple benchmark experiments.
\ No newline at end of file
diff --git a/datasets/collect_dataset_names.py b/datasets/collect_dataset_names.py
deleted file mode 100644
index ffa942962..000000000
--- a/datasets/collect_dataset_names.py
+++ /dev/null
@@ -1,77 +0,0 @@
-# ===============================================================================
-# Copyright 2020-2021 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and -# limitations under the License. -# =============================================================================== - -import argparse -import logging -import json -import sys - -def extract_public_dataset_names(exp_filepath: str) -> set[str]: - with open(exp_filepath) as json_config_file: - experiment = json.load(json_config_file) - if not "cases" in experiment: - return [] - dataset_names = list() - for case in experiment["cases"]: - if "dataset" not in case: - continue - for ds in case["dataset"]: - if ds["source"] == "synthethic" or "name" not in ds: - continue - dataset_names.append(ds["name"]) - return set(dataset_names) # remove duplicates - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Utility to gather the list of public dataset names included in benchmark configuration file(s)." - ) - parser.add_argument( - "-f", - "--files", - type=str, - nargs="*", - help="Benchmark configuration file(s).", - ) - parser.add_argument( - "-o", - "--output", - type=str, - help="Writes collected dataset names into an output file. One name per line.", - default="" - ) - args = parser.parse_args() - - if not args.files: - print("Error: Missing input benchmark configuration file(s) to analyze.") - sys.exit(-1) - - names = set() - for config_file in args.files: - names = names.union(extract_public_dataset_names(config_file)) - - if len(names): - print(f"Found {len(names)} dataset(s)") - if args.output: - with open(args.output, "w") as output_file: - for name in names: - output_file.write(f"{name}\n") - print(f"Saved in {args.output}") - else: - for name in names: - print(f"{name}") - else: - logging.error("Warning: No public dataset found in input benchmark file(s).") \ No newline at end of file diff --git a/datasets/load_datasets.py b/datasets/load_datasets.py index 5c33c6ebd..8ec7adc90 100644 --- a/datasets/load_datasets.py +++ b/datasets/load_datasets.py @@ -18,8 +18,10 @@ import logging import os import sys +import json from pathlib import Path from typing import Callable, Dict +from typing import MutableSet as Set from .loader_classification import ( a_nine_a, @@ -155,10 +157,27 @@ def try_load_dataset(dataset_name: str, output_directory: Path) -> bool: return False +def extract_dataset_names(config_file: str) -> Set[str]: + with open(config_file) as json_config_file: + experiment = json.load(json_config_file) + + if not "cases" in experiment: + return set() + + datasets = list() + for case in experiment["cases"]: + if "dataset" not in case: + continue + for ds in case["dataset"]: + if ds["source"] == "synthethic" or "name" not in ds: + continue + datasets.append(ds["name"]) + return set(datasets) # remove duplicates + + if __name__ == "__main__": parser = argparse.ArgumentParser( - description="Utility to download selected datasets included in the benchmark. " - "Use '-d' or '--datasets' option to enumerate dataset(s) that should be downloaded." + description="Utility to download selected publicly available datasets included in the benchmark." 
) parser.add_argument( "-l", @@ -174,6 +193,13 @@ def try_load_dataset(dataset_name: str, output_directory: Path) -> bool: nargs="*", help="The datasets that should be downloaded.", ) + parser.add_argument( + "-c", + "--configs", + type=str, + nargs="*", + help="The benchmark configuration file(s) to gather dataset name(s) to download.", + ) args = parser.parse_args() if args.list: @@ -183,10 +209,18 @@ def try_load_dataset(dataset_name: str, output_directory: Path) -> bool: root_dir = Path(os.environ["DATASETSROOT"]) - if args.datasets is not None: - ds_names = set(args.datasets) + if args.datasets is None and args.configs is None: + logging.warning("Warning: Enumerate dataset(s) that should be downloaded") + else: + if args.configs: + print(f"Dataset name(s) to download will be gathered from : {args.configs}") + ds_names = set() + for config_file in args.configs: + ds_names = ds_names.union(extract_dataset_names(config_file)) + else: + ds_names = set(args.datasets) print( - f"{len(ds_names)} dataset{'s' if len(ds_names) > 1 else ''} requested for download: {ds_names}" + f"{len(ds_names)} dataset{'s' if len(ds_names) > 1 else ''} requested for download" ) print(f"Download location: {root_dir}") @@ -195,6 +229,3 @@ def try_load_dataset(dataset_name: str, output_directory: Path) -> bool: downloaded = try_load_dataset(name, root_dir) if downloaded: print(f'Dataset "{name}" successfully downloaded.') - - else: - logging.warning("Warning: Enumerate dataset(s) that should be downloaded") From 7ae38889ccce6d5c6214aac9cb6b569c4e84c535 Mon Sep 17 00:00:00 2001 From: Valerio Maggio Date: Thu, 16 Feb 2023 19:47:47 +0000 Subject: [PATCH 08/10] MyPy & PEP8 fix --- datasets/load_datasets.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/datasets/load_datasets.py b/datasets/load_datasets.py index 8ec7adc90..51f55d576 100644 --- a/datasets/load_datasets.py +++ b/datasets/load_datasets.py @@ -161,7 +161,7 @@ def extract_dataset_names(config_file: str) -> Set[str]: with open(config_file) as json_config_file: experiment = json.load(json_config_file) - if not "cases" in experiment: + if "cases" not in experiment: return set() datasets = list() @@ -177,7 +177,8 @@ def extract_dataset_names(config_file: str) -> Set[str]: if __name__ == "__main__": parser = argparse.ArgumentParser( - description="Utility to download selected publicly available datasets included in the benchmark." + description="Utility to download selected publicly available datasets " + "included in the benchmark." 
) parser.add_argument( "-l", @@ -214,7 +215,7 @@ def extract_dataset_names(config_file: str) -> Set[str]: else: if args.configs: print(f"Dataset name(s) to download will be gathered from : {args.configs}") - ds_names = set() + ds_names: Set[str] = set() for config_file in args.configs: ds_names = ds_names.union(extract_dataset_names(config_file)) else: From c10996f23e794c07840ed0d627519ccd7f66c381 Mon Sep 17 00:00:00 2001 From: Valerio Maggio Date: Thu, 16 Feb 2023 19:51:46 +0000 Subject: [PATCH 09/10] MyPy FIX --- datasets/load_datasets.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/datasets/load_datasets.py b/datasets/load_datasets.py index 51f55d576..820ef3384 100644 --- a/datasets/load_datasets.py +++ b/datasets/load_datasets.py @@ -21,7 +21,6 @@ import json from pathlib import Path from typing import Callable, Dict -from typing import MutableSet as Set from .loader_classification import ( a_nine_a, @@ -157,7 +156,7 @@ def try_load_dataset(dataset_name: str, output_directory: Path) -> bool: return False -def extract_dataset_names(config_file: str) -> Set[str]: +def extract_dataset_names(config_file: str) -> set[str]: with open(config_file) as json_config_file: experiment = json.load(json_config_file) @@ -215,7 +214,7 @@ def extract_dataset_names(config_file: str) -> Set[str]: else: if args.configs: print(f"Dataset name(s) to download will be gathered from : {args.configs}") - ds_names: Set[str] = set() + ds_names = set() for config_file in args.configs: ds_names = ds_names.union(extract_dataset_names(config_file)) else: From 0e81f453025f473c0ff9e66e21043c104d2a9979 Mon Sep 17 00:00:00 2001 From: Valerio Maggio Date: Thu, 16 Feb 2023 19:53:42 +0000 Subject: [PATCH 10/10] MyPy FIX --- datasets/load_datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasets/load_datasets.py b/datasets/load_datasets.py index 820ef3384..39139a1b8 100644 --- a/datasets/load_datasets.py +++ b/datasets/load_datasets.py @@ -214,7 +214,7 @@ def extract_dataset_names(config_file: str) -> set[str]: else: if args.configs: print(f"Dataset name(s) to download will be gathered from : {args.configs}") - ds_names = set() + ds_names: set[str] = set() for config_file in args.configs: ds_names = ds_names.union(extract_dataset_names(config_file)) else: