diff --git a/datasets/make_datasets.py b/datasets/make_datasets.py
index 4b3629dd1..0add738cb 100644
--- a/datasets/make_datasets.py
+++ b/datasets/make_datasets.py
@@ -15,26 +15,46 @@
 # ===============================================================================
 
 import argparse
+import logging
+import os
 import numpy as np
 from sklearn.datasets import make_classification, make_regression, make_blobs
 from sklearn.utils import check_random_state
 import sys
 
 
-def gen_blobs(args):
+def try_gen_dataset(args, folder):
+    try:
+        if args.type == 'regression':
+            gen_regression(args, folder)
+        elif args.type == 'classification':
+            gen_classification(args, folder)
+        elif args.type == 'blobs':
+            gen_blobs(args, folder)
+        else:
+            raise ValueError(f'Unknown dataset type: {args.type}')
+        return True
+    except Exception as ex:
+        logging.warning(f"Internal error generating dataset:\n{ex}")
+        return False
+
+
+def gen_blobs(args, folder):
+    os.makedirs(os.path.join(folder, "data"), exist_ok=True)
     X, y = make_blobs(n_samples=args.samples + args.test_samples,
                       n_features=args.features,
                       centers=args.clusters,
                       center_box=(-32, 32),
                       shuffle=True,
                       random_state=args.seed)
-    np.save(args.filex, X[:args.samples])
+    np.save(os.path.join(folder, args.filex), X[:args.samples])
     if args.test_samples != 0:
-        np.save(args.filextest, X[args.samples:])
+        np.save(os.path.join(folder, args.filextest), X[args.samples:])
     return 0
 
 
-def gen_regression(args):
+def gen_regression(args, folder):
+    os.makedirs(os.path.join(folder, "data"), exist_ok=True)
     rs = check_random_state(args.seed)
     X, y = make_regression(n_targets=1,
                            n_samples=args.samples + args.test_samples,
@@ -42,15 +62,16 @@ def gen_regression(args):
                            n_informative=args.features,
                            bias=rs.normal(0, 3),
                            random_state=rs)
-    np.save(args.filex, X[:args.samples])
-    np.save(args.filey, y[:args.samples])
+    np.save(os.path.join(folder, args.filex), X[:args.samples])
+    np.save(os.path.join(folder, args.filey), y[:args.samples])
     if args.test_samples != 0:
-        np.save(args.filextest, X[args.samples:])
-        np.save(args.fileytest, y[args.samples:])
+        np.save(os.path.join(folder, args.filextest), X[args.samples:])
+        np.save(os.path.join(folder, args.fileytest), y[args.samples:])
     return 0
 
 
-def gen_classification(args):
+def gen_classification(args, folder):
+    os.makedirs(os.path.join(folder, "data"), exist_ok=True)
     X, y = make_classification(n_samples=args.samples + args.test_samples,
                                n_features=args.features,
                                n_informative=args.features,
@@ -58,11 +79,11 @@ def gen_classification(args):
                                n_redundant=0,
                                n_classes=args.classes,
                                random_state=args.seed)
-    np.save(args.filex, X[:args.samples])
-    np.save(args.filey, y[:args.samples])
+    np.save(os.path.join(folder, args.filex), X[:args.samples])
+    np.save(os.path.join(folder, args.filey), y[:args.samples])
     if args.test_samples != 0:
-        np.save(args.filextest, X[args.samples:])
-        np.save(args.fileytest, y[args.samples:])
+        np.save(os.path.join(folder, args.filextest), X[args.samples:])
+        np.save(os.path.join(folder, args.fileytest), y[args.samples:])
     return 0
 
 
diff --git a/runner.py b/runner.py
index c7b460385..9a9d527c5 100755
--- a/runner.py
+++ b/runner.py
@@ -22,7 +22,6 @@
 import sys
 from typing import Any, Dict, List, Union
 
-import datasets.make_datasets as make_datasets
 import utils
 from pathlib import Path
 
@@ -84,8 +83,16 @@ def get_configs(path: Path) -> List[str]:
         stream=sys.stdout,
         format='%(levelname)s: %(message)s', level=args.verbose)
     hostname = socket.gethostname()
-    # make directory for data if it doesn't exist
-    os.makedirs('data', exist_ok=True)
+    env = os.environ.copy()
+    if 'DATASETSROOT' in env:
+        datasets_root = env['DATASETSROOT']
+        logging.info(f'Datasets folder at {datasets_root}')
+    elif 'DAAL_DATASETS' in env:
+        datasets_root = env['DAAL_DATASETS']
+        logging.info(f'Datasets folder at {datasets_root}')
+    else:
+        datasets_root = ''
+        logging.info('Datasets folder is not set, using local folder')
 
     json_result: Dict[str, Union[Dict[str, Any], List[Any]]] = {
         'hardware': utils.get_hw_parameters(),
@@ -155,23 +162,41 @@ def get_configs(path: Path) -> List[str]:
             for dataset in params_set['dataset']:
                 if dataset['source'] in ['csv', 'npy']:
                     dataset_name = dataset['name'] if 'name' in dataset else 'unknown'
-                    if 'training' not in dataset or \
-                            'x' not in dataset['training'] or \
-                            not utils.find_the_dataset(dataset_name,
-                                                       dataset['training']['x']):
+                    if 'training' not in dataset or 'x' not in dataset['training']:
                         logging.warning(
                             f'Dataset {dataset_name} could not be loaded. \n'
-                            'Check the correct name or expand the download in '
-                            'the folder dataset.')
+                            'Training data for the algorithm is not specified'
+                        )
                         continue
-                    paths = '--file-X-train ' + dataset['training']["x"]
+
+                    files = {}
+
+                    files['file-X-train'] = dataset['training']["x"]
                     if 'y' in dataset['training']:
-                        paths += ' --file-y-train ' + dataset['training']["y"]
+                        files['file-y-train'] = dataset['training']["y"]
                     if 'testing' in dataset:
-                        paths += ' --file-X-test ' + dataset["testing"]["x"]
+                        files['file-X-test'] = dataset["testing"]["x"]
                         if 'y' in dataset['testing']:
-                            paths += ' --file-y-test ' + \
-                                dataset["testing"]["y"]
+                            files['file-y-test'] = dataset["testing"]["y"]
+
+                    dataset_path = utils.find_the_dataset(dataset_name, datasets_root,
+                                                          files.values())
+                    if dataset_path is None:
+                        logging.warning(
+                            f'Dataset {dataset_name} could not be loaded. \n'
+                            'Check the dataset name or download the dataset into '
+                            'the datasets folder.'
+                        )
+                        continue
+                    elif not dataset_path and datasets_root:
+                        logging.info(
+                            f'{dataset_name} is taken from the local folder'
+                        )
+
+                    paths = ''
+                    for data_path, data_file in files.items():
+                        paths += f'--{data_path} {os.path.join(dataset_path, data_file)} '
+
                 elif dataset['source'] == 'synthetic':
                     class GenerationArgs:
                         classes: int
@@ -186,7 +211,6 @@ class GenerationArgs:
                         test_samples: int
                         type: str
                     gen_args = GenerationArgs()
-                    paths = ''
 
                     if 'seed' in params_set:
                         gen_args.seed = params_set['seed']
@@ -210,38 +234,42 @@ class GenerationArgs:
                     file_prefix = f'data/synthetic-{gen_args.type}{cls_num_for_file}-'
                     file_postfix = f'-{gen_args.samples}x{gen_args.features}.npy'
 
-                    isfiles = True
+                    files = {}
                     gen_args.filex = f'{file_prefix}X-train{file_postfix}'
-                    paths += f' --file-X-train {gen_args.filex}'
-                    isfiles = isfiles and os.path.isfile(gen_args.filex)
+                    files['file-X-train'] = gen_args.filex
                     if gen_args.type not in ['blobs']:
                         gen_args.filey = f'{file_prefix}y-train{file_postfix}'
-                        paths += f' --file-y-train {gen_args.filey}'
-                        isfiles = isfiles and os.path.isfile(gen_args.filey)
+                        files['file-y-train'] = gen_args.filey
                     if 'testing' in dataset:
                         gen_args.test_samples = dataset['testing']['n_samples']
                         gen_args.filextest = f'{file_prefix}X-test{file_postfix}'
-                        paths += f' --file-X-test {gen_args.filextest}'
-                        isfiles = isfiles and os.path.isfile(gen_args.filextest)
+                        files['file-X-test'] = gen_args.filextest
                         if gen_args.type not in ['blobs']:
                             gen_args.fileytest = f'{file_prefix}y-test{file_postfix}'
-                            paths += f' --file-y-test {gen_args.fileytest}'
-                            isfiles = isfiles and os.path.isfile(gen_args.fileytest)
+                            files['file-y-test'] = gen_args.fileytest
                     else:
                         gen_args.test_samples = 0
                         gen_args.filextest = gen_args.filex
+                        files['file-X-test'] = gen_args.filextest
                         if gen_args.type not in ['blobs']:
                             gen_args.fileytest = gen_args.filey
+                            files['file-y-test'] = gen_args.fileytest
 
-                    if not args.dummy_run and not isfiles:
-                        if gen_args.type == 'regression':
-                            make_datasets.gen_regression(gen_args)
-                        elif gen_args.type == 'classification':
-                            make_datasets.gen_classification(gen_args)
-                        elif gen_args.type == 'blobs':
-                            make_datasets.gen_blobs(gen_args)
                     dataset_name = f'synthetic_{gen_args.type}'
+                    dataset_path = ''
+                    if not args.dummy_run:
+                        dataset_path = utils.find_or_gen_dataset(gen_args,
+                                                                 datasets_root, files.values())
+                        if dataset_path is None:
+                            logging.warning(
+                                f'Dataset {dataset_name} could not be generated.'
+                            )
+                            continue
+
+                    paths = ''
+                    for data_path, data_file in files.items():
+                        paths += f'--{data_path} {os.path.join(dataset_path, data_file)} '
 
                 else:
                     logging.warning('Unknown dataset source. Only synthetics datasets '
                                     'and csv/npy files are supported now')
diff --git a/utils.py b/utils.py
index 6e025b804..192a5f421 100755
--- a/utils.py
+++ b/utils.py
@@ -16,12 +16,13 @@
 
 import json
 import os
-import pathlib
 import platform
 import subprocess
 import sys
-from typing import Any, Dict, List, Tuple, Union, cast
+from pathlib import Path
+from typing import Any, Dict, Iterable, List, Tuple, Union, cast
 
+from datasets.make_datasets import try_gen_dataset
 from datasets.load_datasets import try_load_dataset
 
 
@@ -51,9 +52,36 @@ def filter_stdout(text: str) -> Tuple[str, str]:
     return filtered, extra
 
 
-def find_the_dataset(name: str, fullpath: str) -> bool:
-    return os.path.isfile(fullpath) or try_load_dataset(
-        dataset_name=name, output_directory=pathlib.Path(fullpath).parent)
+def files_in_folder(folder: str, files: Iterable[str]) -> bool:
+    for file in files:
+        if not os.path.isfile(os.path.join(folder, file)):
+            return False
+    return True
+
+
+def find_or_gen_dataset(args: Any, folder: str, files: Iterable[str]):
+    if files_in_folder("", files):
+        return ""
+    if folder:
+        if files_in_folder(folder, files) or \
+                try_gen_dataset(args, folder):
+            return folder
+    if try_gen_dataset(args, ""):
+        return ""
+    return None
+
+
+def find_the_dataset(name: str, folder: str, files: Iterable[str]):
+    if files_in_folder("", files):
+        return ""
+    if folder:
+        if files_in_folder(folder, files) or \
+                try_load_dataset(dataset_name=name,
+                                 output_directory=Path(os.path.join(folder, "data"))):
+            return folder
+    if try_load_dataset(dataset_name=name, output_directory=Path("data")):
+        return ""
+    return None
 
 
 def read_output_from_command(command: str,
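
Reviewer note, not part of the patch: the helpers added in utils.py resolve dataset files in a fixed order. The local folder wins if the files are already present (returned as ''), then the configured datasets root is checked and, failing that, used as the download or generation target (returned as the root path), and finally download or generation into the local 'data' folder is attempted ('' again); None signals failure. Below is a minimal sketch of calling the new API, assuming utils.py from this patch is importable; the dataset name and file list are hypothetical.

    import os

    import utils

    # Hypothetical files; real paths come from the benchmark configs.
    files = ['data/example_X_train.npy', 'data/example_y_train.npy']
    # Same environment lookup that runner.py performs.
    root = os.environ.get('DATASETSROOT') or os.environ.get('DAAL_DATASETS') or ''

    path = utils.find_the_dataset('example', root, files)
    if path is None:
        print('dataset is missing and could not be downloaded')
    elif path == '':
        print('using files from the local folder')
    else:
        print(f'using files from {path}')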