loading datasets from DATASETSROOT directory #110

Merged
45 changes: 33 additions & 12 deletions datasets/make_datasets.py
@@ -15,54 +15,75 @@
# ===============================================================================

import argparse
import logging
import os
import numpy as np
from sklearn.datasets import make_classification, make_regression, make_blobs
from sklearn.utils import check_random_state
import sys


def try_gen_dataset(args, folder):
try:
if args.type == 'regression':
gen_regression(args, folder)
elif args.type == 'classification':
gen_classification(args, folder)
elif args.type == 'blobs':
gen_blobs(args, folder)
else:
raise ValueError(f'{args.type} is unknown dataset type')
return True
except BaseException as ex:
logging.warning(f"Internal error generating dataset:\n{ex}")
return False


def gen_blobs(args, folder):
os.makedirs(os.path.join(folder, "data"), exist_ok=True)
X, y = make_blobs(n_samples=args.samples + args.test_samples,
n_features=args.features,
centers=args.clusters,
center_box=(-32, 32),
shuffle=True,
random_state=args.seed)
np.save(os.path.join(folder, args.filex), X[:args.samples])
if args.test_samples != 0:
np.save(os.path.join(folder, args.filextest), X[args.samples:])
return 0


def gen_regression(args, folder):
os.makedirs(os.path.join(folder, "data"), exist_ok=True)
rs = check_random_state(args.seed)
X, y = make_regression(n_targets=1,
n_samples=args.samples + args.test_samples,
n_features=args.features,
n_informative=args.features,
bias=rs.normal(0, 3),
random_state=rs)
np.save(os.path.join(folder, args.filex), X[:args.samples])
np.save(os.path.join(folder, args.filey), y[:args.samples])
if args.test_samples != 0:
np.save(os.path.join(folder, args.filextest), X[args.samples:])
np.save(os.path.join(folder, args.fileytest), y[args.samples:])
return 0


def gen_classification(args, folder):
os.makedirs(os.path.join(folder, "data"), exist_ok=True)
X, y = make_classification(n_samples=args.samples + args.test_samples,
n_features=args.features,
n_informative=args.features,
n_repeated=0,
n_redundant=0,
n_classes=args.classes,
random_state=args.seed)
np.save(os.path.join(folder, args.filex), X[:args.samples])
    np.save(os.path.join(folder, args.filey), y[:args.samples])
if args.test_samples != 0:
np.save(args.filextest, X[args.samples:])
np.save(args.fileytest, y[args.samples:])
np.save(os.path.join(folder, args.filextest), X[args.samples:])
np.save(os.path.join(folder, args.fileytest), y[args.samples:])
return 0
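
For context, here is a minimal sketch of how the new try_gen_dataset entry point could be exercised on its own. The attribute names mirror the GenerationArgs container that runner.py builds; the target folder and file names below are made-up examples, not values from this PR.

import argparse

from datasets.make_datasets import try_gen_dataset

# Build the attribute container expected by gen_blobs/gen_regression/
# gen_classification; argparse.Namespace is used only as a plain holder.
gen_args = argparse.Namespace(
    type='classification',
    seed=777,
    classes=2,
    features=10,
    samples=1000,
    test_samples=200,
    filex='data/synthetic-classification2-X-train-1000x10.npy',
    filey='data/synthetic-classification2-y-train-1000x10.npy',
    filextest='data/synthetic-classification2-X-test-1000x10.npy',
    fileytest='data/synthetic-classification2-y-test-1000x10.npy',
)

# With these file names the arrays land under <folder>/data; the call returns
# True on success, or False (with a logged warning) if generation fails.
if try_gen_dataset(gen_args, folder='/tmp/datasets-root'):
    print('dataset generated')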


90 changes: 59 additions & 31 deletions runner.py
@@ -22,7 +22,6 @@
import sys
from typing import Any, Dict, List, Union

import utils
from pathlib import Path

@@ -84,8 +83,16 @@ def get_configs(path: Path) -> List[str]:
stream=sys.stdout, format='%(levelname)s: %(message)s', level=args.verbose)
hostname = socket.gethostname()

env = os.environ.copy()
if 'DATASETSROOT' in env:
datasets_root = env['DATASETSROOT']
logging.info(f'Datasets folder at {datasets_root}')
elif 'DAAL_DATASETS' in env:
datasets_root = env['DAAL_DATASETS']
logging.info(f'Datasets folder at {datasets_root}')
else:
datasets_root = ''
logging.info('Datasets folder is not set, using local folder')

json_result: Dict[str, Union[Dict[str, Any], List[Any]]] = {
'hardware': utils.get_hw_parameters(),
@@ -155,23 +162,41 @@ def get_configs(path: Path) -> List[str]:
for dataset in params_set['dataset']:
if dataset['source'] in ['csv', 'npy']:
dataset_name = dataset['name'] if 'name' in dataset else 'unknown'
if 'training' not in dataset or 'x' not in dataset['training']:
logging.warning(
f'Dataset {dataset_name} could not be loaded. \n'
'Training data for algorithm is not specified'
)
continue

files = {}

files['file-X-train'] = dataset['training']["x"]
if 'y' in dataset['training']:
files['file-y-train'] = dataset['training']["y"]
if 'testing' in dataset:
files['file-X-test'] = dataset["testing"]["x"]
if 'y' in dataset['testing']:
files['file-y-test'] = dataset["testing"]["y"]

dataset_path = utils.find_the_dataset(dataset_name, datasets_root,
files.values())
if dataset_path is None:
logging.warning(
f'Dataset {dataset_name} could not be loaded. \n'
'Check the correct name or expand the download in '
'the folder dataset.'
)
continue
elif not dataset_path and datasets_root:
logging.info(
f'{dataset_name} is taken from local folder'
)

paths = ''
for data_path, data_file in files.items():
paths += f'--{data_path} {os.path.join(dataset_path, data_file)} '

elif dataset['source'] == 'synthetic':
class GenerationArgs:
classes: int
@@ -186,7 +211,6 @@ class GenerationArgs:
test_samples: int
type: str
gen_args = GenerationArgs()

if 'seed' in params_set:
gen_args.seed = params_set['seed']
@@ -210,38 +234,42 @@ class GenerationArgs:
file_prefix = f'data/synthetic-{gen_args.type}{cls_num_for_file}-'
file_postfix = f'-{gen_args.samples}x{gen_args.features}.npy'

files = {}
gen_args.filex = f'{file_prefix}X-train{file_postfix}'
files['file-X-train'] = gen_args.filex
if gen_args.type not in ['blobs']:
gen_args.filey = f'{file_prefix}y-train{file_postfix}'
files['file-y-train'] = gen_args.filey

if 'testing' in dataset:
gen_args.test_samples = dataset['testing']['n_samples']
gen_args.filextest = f'{file_prefix}X-test{file_postfix}'
files['file-X-test'] = gen_args.filextest
if gen_args.type not in ['blobs']:
gen_args.fileytest = f'{file_prefix}y-test{file_postfix}'
files['file-y-test'] = gen_args.fileytest
else:
gen_args.test_samples = 0
gen_args.filextest = gen_args.filex
files['file-X-test'] = gen_args.filextest
if gen_args.type not in ['blobs']:
gen_args.fileytest = gen_args.filey
files['file-y-test'] = gen_args.filey

dataset_name = f'synthetic_{gen_args.type}'

if not args.dummy_run:
dataset_path = utils.find_or_gen_dataset(gen_args,
datasets_root, files.values())
if dataset_path is None:
logging.warning(
f'Dataset {dataset_name} could not be generated. \n'
)
continue

paths = ''
for data_path, data_file in files.items():
paths += f'--{data_path} {os.path.join(dataset_path, data_file)} '
else:
logging.warning('Unknown dataset source. Only synthetics datasets '
'and csv/npy files are supported now')
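
To illustrate the new flow for csv/npy sources, the snippet below walks a made-up dataset entry through the same files dict and paths string construction as above; the file names and the resolved dataset_path are hypothetical stand-ins for what utils.find_the_dataset would return.

import os

# A made-up config entry of the kind iterated over in runner.py.
dataset = {
    'source': 'npy',
    'name': 'example_dataset',
    'training': {'x': 'data/example_x_train.npy', 'y': 'data/example_y_train.npy'},
    'testing': {'x': 'data/example_x_test.npy', 'y': 'data/example_y_test.npy'},
}

# Collect the file names exactly as runner.py does.
files = {'file-X-train': dataset['training']['x']}
if 'y' in dataset['training']:
    files['file-y-train'] = dataset['training']['y']
if 'testing' in dataset:
    files['file-X-test'] = dataset['testing']['x']
    if 'y' in dataset['testing']:
        files['file-y-test'] = dataset['testing']['y']

# find_the_dataset returns '' for the local folder, the datasets root when the
# files are found (or downloaded) there, or None when nothing could be located.
dataset_path = '/data/benchmark-datasets'  # pretend it resolved to DATASETSROOT

paths = ''
for data_path, data_file in files.items():
    paths += f'--{data_path} {os.path.join(dataset_path, data_file)} '

print(paths)
# --file-X-train /data/benchmark-datasets/data/example_x_train.npy ...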
38 changes: 33 additions & 5 deletions utils.py
@@ -16,12 +16,13 @@

import json
import os
import platform
import subprocess
import sys
from pathlib import Path
from typing import Any, Dict, Iterable, List, Tuple, Union, cast

from datasets.make_datasets import try_gen_dataset
from datasets.load_datasets import try_load_dataset


@@ -51,9 +52,36 @@ def filter_stdout(text: str) -> Tuple[str, str]:
return filtered, extra


def files_in_folder(folder: str, files: Iterable[str]) -> bool:
for file in files:
if not os.path.isfile(os.path.join(folder, file)):
return False
return True


def find_or_gen_dataset(args: Any, folder: str, files: Iterable[str]):
if files_in_folder("", files):
return ""
if folder:
if files_in_folder(folder, files) or \
try_gen_dataset(args, folder):
return folder
if try_gen_dataset(args, ""):
return ""
return None


def find_the_dataset(name: str, folder: str, files: Iterable[str]):
if files_in_folder("", files):
return ""
if folder:
if files_in_folder(folder, files) or \
try_load_dataset(dataset_name=name,
output_directory=Path(os.path.join(folder, "data"))):
return folder
if try_load_dataset(dataset_name=name, output_directory=Path("data")):
return ""
return None


def read_output_from_command(command: str,
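
Finally, a short sketch of the lookup order that find_the_dataset above implements, as a caller such as runner.py would use it; the dataset name and file list are hypothetical.

import os

from utils import find_the_dataset

datasets_root = os.environ.get('DATASETSROOT', '')
files = ['data/example_x_train.npy', 'data/example_y_train.npy']

# Lookup order:
#   1. '' if the files already exist in the local folder;
#   2. datasets_root if the files exist there, or the dataset can be
#      downloaded into <datasets_root>/data;
#   3. '' again if the dataset can be downloaded into the local data/ folder;
#   4. None when every attempt fails.
dataset_path = find_the_dataset('example_dataset', datasets_root, files)

if dataset_path is None:
    print('dataset could not be found or downloaded')
else:
    print(f"dataset resolved in: {dataset_path or 'local folder'}")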