From 3b67cc41cec46bda091f8e4f086675eee4949f62 Mon Sep 17 00:00:00 2001 From: LyndonFan Date: Fri, 1 Oct 2021 12:07:24 +0100 Subject: [PATCH 01/41] refactoring --- bench.py | 67 ++++++++++++++++++++++++++++---------------------------- utils.py | 8 +++---- 2 files changed, 37 insertions(+), 38 deletions(-) diff --git a/bench.py b/bench.py index f3f9ce5c3..560d96d50 100644 --- a/bench.py +++ b/bench.py @@ -389,14 +389,13 @@ def convert_data(data, dtype, data_order, data_format): # Secondly, change format of data if data_format == 'numpy': return data - elif data_format == 'pandas': + if data_format == 'pandas': import pandas as pd if data.ndim == 1: return pd.Series(data) - else: - return pd.DataFrame(data) - elif data_format == 'cudf': + return pd.DataFrame(data) + if data_format == 'cudf': import cudf import pandas as pd @@ -516,36 +515,36 @@ def gen_basic_dict(library, algorithm, stage, params, data, alg_instance=None, def print_output(library, algorithm, stages, params, functions, times, metric_type, metrics, data, alg_instance=None, alg_params=None): - if params.output_format == 'json': - output = [] - for i, stage in enumerate(stages): - result = gen_basic_dict(library, algorithm, stage, params, - data[i], alg_instance, alg_params) - result.update({'time[s]': times[i]}) - if metric_type is not None: - if isinstance(metric_type, str): - result.update({f'{metric_type}': metrics[i]}) - elif isinstance(metric_type, list): - for ind, val in enumerate(metric_type): - if metrics[ind][i] is not None: - result.update({f'{val}': metrics[ind][i]}) - if hasattr(params, 'n_classes'): - result['input_data'].update({'classes': params.n_classes}) - if hasattr(params, 'n_clusters'): - if algorithm == 'kmeans': - result['input_data'].update( - {'n_clusters': params.n_clusters}) - elif algorithm == 'dbscan': - result.update({'n_clusters': params.n_clusters}) - # replace non-string init with string for kmeans benchmarks - if alg_instance is not None: - if 'init' in result['algorithm_parameters'].keys(): - if not isinstance(result['algorithm_parameters']['init'], str): - result['algorithm_parameters']['init'] = 'random' - if 'handle' in result['algorithm_parameters'].keys(): - del result['algorithm_parameters']['handle'] - output.append(result) - print(json.dumps(output, indent=4)) + if params.output_format != 'json': return + output = [] + for i, stage in enumerate(stages): + result = gen_basic_dict(library, algorithm, stage, params, + data[i], alg_instance, alg_params) + result.update({'time[s]': times[i]}) + if metric_type is not None: + if isinstance(metric_type, str): + result.update({f'{metric_type}': metrics[i]}) + elif isinstance(metric_type, list): + for ind, val in enumerate(metric_type): + if metrics[ind][i] is not None: + result.update({f'{val}': metrics[ind][i]}) + if hasattr(params, 'n_classes'): + result['input_data'].update({'classes': params.n_classes}) + if hasattr(params, 'n_clusters'): + if algorithm == 'kmeans': + result['input_data'].update( + {'n_clusters': params.n_clusters}) + elif algorithm == 'dbscan': + result.update({'n_clusters': params.n_clusters}) + # replace non-string init with string for kmeans benchmarks + if alg_instance is not None: + if 'init' in result['algorithm_parameters'].keys() and \ + not isinstance(result['algorithm_parameters']['init'], str): + result['algorithm_parameters']['init'] = 'random' + if 'handle' in result['algorithm_parameters'].keys(): + del result['algorithm_parameters']['handle'] + output.append(result) + print(json.dumps(output, indent=4)) def run_with_context(params, function): diff --git a/utils.py b/utils.py index 8c1720dcb..0696eb6a7 100755 --- a/utils.py +++ b/utils.py @@ -175,11 +175,11 @@ def generate_cases(params: Dict[str, Union[List[Any], Any]]) -> List[str]: commands *= len(values) dashes = '-' if len(param) == 1 else '--' for command_num in range(prev_len): - for value_num in range(len(values)): - commands[prev_len * value_num + command_num] += ' ' + \ - dashes + param + ' ' + str(values[value_num]) + for idx, val in enumerate(values): + commands[prev_len * idx + command_num] += ' ' + \ + dashes + param + ' ' + str(val) else: dashes = '-' if len(param) == 1 else '--' - for command_num in range(len(commands)): + for command_num,_ in enumerate(commands): commands[command_num] += ' ' + dashes + param + ' ' + str(values) return commands From 29074714a579712704523bf4c57b07511308c3dc Mon Sep 17 00:00:00 2001 From: LyndonFan Date: Fri, 1 Oct 2021 12:17:33 +0100 Subject: [PATCH 02/41] remove global --- datasets/loader_utils.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/datasets/loader_utils.py b/datasets/loader_utils.py index 29366eccb..b65148441 100755 --- a/datasets/loader_utils.py +++ b/datasets/loader_utils.py @@ -20,13 +20,9 @@ import numpy as np import tqdm -pbar: tqdm.tqdm = None - def _show_progress(block_num: int, block_size: int, total_size: int) -> None: - global pbar - if pbar is None: - pbar = tqdm.tqdm(total=total_size / 1024, unit='kB') + pbar: tqdm.tqdm = tqdm.tqdm(total=total_size / 1024, unit='kB') downloaded = block_num * block_size if downloaded < total_size: From 7c6db21518a9a4b8b6ec531f89c87a4a836a4e9e Mon Sep 17 00:00:00 2001 From: LyndonFan Date: Fri, 1 Oct 2021 12:18:58 +0100 Subject: [PATCH 03/41] refactor --- bench.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/bench.py b/bench.py index 560d96d50..f6374ae68 100644 --- a/bench.py +++ b/bench.py @@ -515,7 +515,8 @@ def gen_basic_dict(library, algorithm, stage, params, data, alg_instance=None, def print_output(library, algorithm, stages, params, functions, times, metric_type, metrics, data, alg_instance=None, alg_params=None): - if params.output_format != 'json': return + if params.output_format != 'json': + return output = [] for i, stage in enumerate(stages): result = gen_basic_dict(library, algorithm, stage, params, @@ -538,8 +539,8 @@ def print_output(library, algorithm, stages, params, functions, result.update({'n_clusters': params.n_clusters}) # replace non-string init with string for kmeans benchmarks if alg_instance is not None: - if 'init' in result['algorithm_parameters'].keys() and \ - not isinstance(result['algorithm_parameters']['init'], str): + if 'init' in result['algorithm_parameters'].keys(): + if isinstance(result['algorithm_parameters']['init'], str): result['algorithm_parameters']['init'] = 'random' if 'handle' in result['algorithm_parameters'].keys(): del result['algorithm_parameters']['handle'] From 3cbc5a6889afe7cda07a45bed56c6854628e4799 Mon Sep 17 00:00:00 2001 From: LyndonFan Date: Fri, 1 Oct 2021 12:23:34 +0100 Subject: [PATCH 04/41] fix typo --- bench.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/bench.py b/bench.py index f6374ae68..1f7e89c02 100644 --- a/bench.py +++ b/bench.py @@ -539,9 +539,10 @@ def print_output(library, algorithm, stages, params, functions, result.update({'n_clusters': params.n_clusters}) # replace non-string init with string for kmeans benchmarks if alg_instance is not None: - if 'init' in result['algorithm_parameters'].keys(): - if isinstance(result['algorithm_parameters']['init'], str): - result['algorithm_parameters']['init'] = 'random' + condition = 'init' in result['algorithm_parameters'].keys() and\ + not isinstance(result['algorithm_parameters']['init'], str) + if condition: + result['algorithm_parameters']['init'] = 'random' if 'handle' in result['algorithm_parameters'].keys(): del result['algorithm_parameters']['handle'] output.append(result) From 0b44688f4160b13d31324650631696e9a66c36be Mon Sep 17 00:00:00 2001 From: LyndonFan Date: Fri, 1 Oct 2021 12:26:11 +0100 Subject: [PATCH 05/41] refactor? --- bench.py | 73 +++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 49 insertions(+), 24 deletions(-) diff --git a/bench.py b/bench.py index 1f7e89c02..1053be7a2 100644 --- a/bench.py +++ b/bench.py @@ -512,6 +512,31 @@ def gen_basic_dict(library, algorithm, stage, params, data, alg_instance=None, return result +def update_result_dict(result) -> None: + result.update({'time[s]': times[i]}) + if metric_type is not None: + if isinstance(metric_type, str): + result.update({f'{metric_type}': metrics[i]}) + elif isinstance(metric_type, list): + for ind, val in enumerate(metric_type): + if metrics[ind][i] is not None: + result.update({f'{val}': metrics[ind][i]}) + if hasattr(params, 'n_classes'): + result['input_data'].update({'classes': params.n_classes}) + if hasattr(params, 'n_clusters'): + if algorithm == 'kmeans': + result['input_data'].update( + {'n_clusters': params.n_clusters}) + elif algorithm == 'dbscan': + result.update({'n_clusters': params.n_clusters}) + # replace non-string init with string for kmeans benchmarks + if alg_instance is not None: + if 'init' in result['algorithm_parameters'].keys(): + if not isinstance(result['algorithm_parameters']['init'], str): + result['algorithm_parameters']['init'] = 'random' + if 'handle' in result['algorithm_parameters'].keys(): + del result['algorithm_parameters']['handle'] + def print_output(library, algorithm, stages, params, functions, times, metric_type, metrics, data, alg_instance=None, alg_params=None): @@ -521,30 +546,30 @@ def print_output(library, algorithm, stages, params, functions, for i, stage in enumerate(stages): result = gen_basic_dict(library, algorithm, stage, params, data[i], alg_instance, alg_params) - result.update({'time[s]': times[i]}) - if metric_type is not None: - if isinstance(metric_type, str): - result.update({f'{metric_type}': metrics[i]}) - elif isinstance(metric_type, list): - for ind, val in enumerate(metric_type): - if metrics[ind][i] is not None: - result.update({f'{val}': metrics[ind][i]}) - if hasattr(params, 'n_classes'): - result['input_data'].update({'classes': params.n_classes}) - if hasattr(params, 'n_clusters'): - if algorithm == 'kmeans': - result['input_data'].update( - {'n_clusters': params.n_clusters}) - elif algorithm == 'dbscan': - result.update({'n_clusters': params.n_clusters}) - # replace non-string init with string for kmeans benchmarks - if alg_instance is not None: - condition = 'init' in result['algorithm_parameters'].keys() and\ - not isinstance(result['algorithm_parameters']['init'], str) - if condition: - result['algorithm_parameters']['init'] = 'random' - if 'handle' in result['algorithm_parameters'].keys(): - del result['algorithm_parameters']['handle'] + update_result_dict(result) + # result.update({'time[s]': times[i]}) + # if metric_type is not None: + # if isinstance(metric_type, str): + # result.update({f'{metric_type}': metrics[i]}) + # elif isinstance(metric_type, list): + # for ind, val in enumerate(metric_type): + # if metrics[ind][i] is not None: + # result.update({f'{val}': metrics[ind][i]}) + # if hasattr(params, 'n_classes'): + # result['input_data'].update({'classes': params.n_classes}) + # if hasattr(params, 'n_clusters'): + # if algorithm == 'kmeans': + # result['input_data'].update( + # {'n_clusters': params.n_clusters}) + # elif algorithm == 'dbscan': + # result.update({'n_clusters': params.n_clusters}) + # # replace non-string init with string for kmeans benchmarks + # if alg_instance is not None: + # if 'init' in result['algorithm_parameters'].keys(): + # if not isinstance(result['algorithm_parameters']['init'], str): + # result['algorithm_parameters']['init'] = 'random' + # if 'handle' in result['algorithm_parameters'].keys(): + # del result['algorithm_parameters']['handle'] output.append(result) print(json.dumps(output, indent=4)) From 786acb43e018010b413ebd623abc3c8283984c1b Mon Sep 17 00:00:00 2001 From: LyndonFan Date: Fri, 1 Oct 2021 12:31:50 +0100 Subject: [PATCH 06/41] refactor --- bench.py | 70 +++++++++++++++++--------------------------------------- 1 file changed, 21 insertions(+), 49 deletions(-) diff --git a/bench.py b/bench.py index 1053be7a2..c438b7ec9 100644 --- a/bench.py +++ b/bench.py @@ -512,31 +512,6 @@ def gen_basic_dict(library, algorithm, stage, params, data, alg_instance=None, return result -def update_result_dict(result) -> None: - result.update({'time[s]': times[i]}) - if metric_type is not None: - if isinstance(metric_type, str): - result.update({f'{metric_type}': metrics[i]}) - elif isinstance(metric_type, list): - for ind, val in enumerate(metric_type): - if metrics[ind][i] is not None: - result.update({f'{val}': metrics[ind][i]}) - if hasattr(params, 'n_classes'): - result['input_data'].update({'classes': params.n_classes}) - if hasattr(params, 'n_clusters'): - if algorithm == 'kmeans': - result['input_data'].update( - {'n_clusters': params.n_clusters}) - elif algorithm == 'dbscan': - result.update({'n_clusters': params.n_clusters}) - # replace non-string init with string for kmeans benchmarks - if alg_instance is not None: - if 'init' in result['algorithm_parameters'].keys(): - if not isinstance(result['algorithm_parameters']['init'], str): - result['algorithm_parameters']['init'] = 'random' - if 'handle' in result['algorithm_parameters'].keys(): - del result['algorithm_parameters']['handle'] - def print_output(library, algorithm, stages, params, functions, times, metric_type, metrics, data, alg_instance=None, alg_params=None): @@ -546,30 +521,27 @@ def print_output(library, algorithm, stages, params, functions, for i, stage in enumerate(stages): result = gen_basic_dict(library, algorithm, stage, params, data[i], alg_instance, alg_params) - update_result_dict(result) - # result.update({'time[s]': times[i]}) - # if metric_type is not None: - # if isinstance(metric_type, str): - # result.update({f'{metric_type}': metrics[i]}) - # elif isinstance(metric_type, list): - # for ind, val in enumerate(metric_type): - # if metrics[ind][i] is not None: - # result.update({f'{val}': metrics[ind][i]}) - # if hasattr(params, 'n_classes'): - # result['input_data'].update({'classes': params.n_classes}) - # if hasattr(params, 'n_clusters'): - # if algorithm == 'kmeans': - # result['input_data'].update( - # {'n_clusters': params.n_clusters}) - # elif algorithm == 'dbscan': - # result.update({'n_clusters': params.n_clusters}) - # # replace non-string init with string for kmeans benchmarks - # if alg_instance is not None: - # if 'init' in result['algorithm_parameters'].keys(): - # if not isinstance(result['algorithm_parameters']['init'], str): - # result['algorithm_parameters']['init'] = 'random' - # if 'handle' in result['algorithm_parameters'].keys(): - # del result['algorithm_parameters']['handle'] + result.update({'time[s]': times[i]}) + if isinstance(metric_type, str): + result.update({f'{metric_type}': metrics[i]}) + elif isinstance(metric_type, list): + for ind, val in enumerate(metric_type): + if metrics[ind][i] is not None: + result.update({f'{val}': metrics[ind][i]}) + if hasattr(params, 'n_classes'): + result['input_data'].update({'classes': params.n_classes}) + if hasattr(params, 'n_clusters'): + if algorithm == 'kmeans': + result['input_data'].update( + {'n_clusters': params.n_clusters}) + elif algorithm == 'dbscan': + result.update({'n_clusters': params.n_clusters}) + # replace non-string init with string for kmeans benchmarks + if alg_instance is not None: + if 'init' in result['algorithm_parameters'].keys(): + if not isinstance(result['algorithm_parameters']['init'], str): + result['algorithm_parameters']['init'] = 'random' + result['algorithm_parameters'].pop('handle',None) output.append(result) print(json.dumps(output, indent=4)) From 6fc17ef090360dabb5675ecff39187f8e785ae1b Mon Sep 17 00:00:00 2001 From: LyndonFan Date: Fri, 1 Oct 2021 12:38:05 +0100 Subject: [PATCH 07/41] refactor --- bench.py | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/bench.py b/bench.py index c438b7ec9..23251c6ee 100644 --- a/bench.py +++ b/bench.py @@ -448,25 +448,25 @@ def load_data(params, generated_data=[], add_dtype=False, label_2d=False, int_dtype if 'y' in element and int_label else params.dtype, params.data_order, params.data_format ) - # generate and convert data if it's marked and path isn't specified - if full_data[element] is None and element in generated_data: - full_data[element] = convert_data( - np.random.rand(*params.shape), - int_dtype if 'y' in element and int_label else params.dtype, - params.data_order, params.data_format) - # convert existing labels from 1- to 2-dimensional - # if it's forced and possible - if full_data[element] is not None and 'y' in element \ - and label_2d and hasattr(full_data[element], 'reshape'): - full_data[element] = full_data[element].reshape( - (full_data[element].shape[0], 1)) - # add dtype property to data if it's needed and doesn't exist - if full_data[element] is not None and add_dtype and \ - not hasattr(full_data[element], 'dtype'): - if hasattr(full_data[element], 'values'): - full_data[element].dtype = full_data[element].values.dtype - elif hasattr(full_data[element], 'dtypes'): - full_data[element].dtype = full_data[element].dtypes[0].type + if full_data[element] is None: + # generate and convert data if it's marked and path isn't specified + if element in generated_data: + full_data[element] = convert_data( + np.random.rand(*params.shape), + int_dtype if 'y' in element and int_label else params.dtype, + params.data_order, params.data_format) + else: + # convert existing labels from 1- to 2-dimensional + # if it's forced and possible + if 'y' in element and label_2d and hasattr(full_data[element], 'reshape'): + full_data[element] = full_data[element].reshape( + (full_data[element].shape[0], 1)) + # add dtype property to data if it's needed and doesn't exist + if add_dtype and not hasattr(full_data[element], 'dtype'): + if hasattr(full_data[element], 'values'): + full_data[element].dtype = full_data[element].values.dtype + elif hasattr(full_data[element], 'dtypes'): + full_data[element].dtype = full_data[element].dtypes[0].type params.dtype = get_dtype(full_data['X_train']) # add size to parameters which is need for some cases From 3509b1de53e440601a9fbdb1acc12861fd8e5342 Mon Sep 17 00:00:00 2001 From: LyndonFan Date: Fri, 1 Oct 2021 14:50:21 +0100 Subject: [PATCH 08/41] refactor? --- bench.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bench.py b/bench.py index 23251c6ee..3b27c0615 100644 --- a/bench.py +++ b/bench.py @@ -458,7 +458,9 @@ def load_data(params, generated_data=[], add_dtype=False, label_2d=False, else: # convert existing labels from 1- to 2-dimensional # if it's forced and possible - if 'y' in element and label_2d and hasattr(full_data[element], 'reshape'): + condition1: bool = 'y' in element and label_2d + condition1 = condition1 and hasattr(full_data[element], 'reshape') + if condition1: full_data[element] = full_data[element].reshape( (full_data[element].shape[0], 1)) # add dtype property to data if it's needed and doesn't exist From 469336298ab21d4d705f1012ec68bf8a6fffd56e Mon Sep 17 00:00:00 2001 From: LyndonFan Date: Fri, 1 Oct 2021 15:03:28 +0100 Subject: [PATCH 09/41] refactor? --- bench.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/bench.py b/bench.py index 3b27c0615..dfb980903 100644 --- a/bench.py +++ b/bench.py @@ -458,9 +458,7 @@ def load_data(params, generated_data=[], add_dtype=False, label_2d=False, else: # convert existing labels from 1- to 2-dimensional # if it's forced and possible - condition1: bool = 'y' in element and label_2d - condition1 = condition1 and hasattr(full_data[element], 'reshape') - if condition1: + if 'y' in element and label_2d and hasattr(full_data[element], 'reshape'): full_data[element] = full_data[element].reshape( (full_data[element].shape[0], 1)) # add dtype property to data if it's needed and doesn't exist @@ -482,8 +480,7 @@ def load_data(params, generated_data=[], add_dtype=False, label_2d=False, return tuple(full_data.values()) -def gen_basic_dict(library, algorithm, stage, params, data, alg_instance=None, - alg_params=None): +def gen_basic_dict(library, algorithm, stage, params, data): result = { 'library': library, 'algorithm': algorithm, @@ -498,6 +495,9 @@ def gen_basic_dict(library, algorithm, stage, params, data, alg_instance=None, 'columns': data.shape[1] } } + return result + +def update_algorithm_parameters(result, alg_instance=None, alg_params=None): result['algorithm_parameters'] = {} if alg_instance is not None: if 'Booster' in str(type(alg_instance)): @@ -509,8 +509,15 @@ def gen_basic_dict(library, algorithm, stage, params, data, alg_instance=None, alg_instance_params['dtype'] = str( alg_instance_params['dtype']) result['algorithm_parameters'].update(alg_instance_params) + if 'init' in result['algorithm_parameters']: + if not isinstance(result['algorithm_parameters']['init'], str): + result['algorithm_parameters']['init'] = 'random' if alg_params is not None: result['algorithm_parameters'].update(alg_params) + if 'init' in result['algorithm_parameters'].keys(): + if not isinstance(result['algorithm_parameters']['init'], str): + result['algorithm_parameters']['init'] = 'random' + result['algorithm_parameters'].pop('handle',None) return result @@ -521,8 +528,7 @@ def print_output(library, algorithm, stages, params, functions, return output = [] for i, stage in enumerate(stages): - result = gen_basic_dict(library, algorithm, stage, params, - data[i], alg_instance, alg_params) + result = gen_basic_dict(library, algorithm, stage, params, data[i]) result.update({'time[s]': times[i]}) if isinstance(metric_type, str): result.update({f'{metric_type}': metrics[i]}) @@ -539,11 +545,7 @@ def print_output(library, algorithm, stages, params, functions, elif algorithm == 'dbscan': result.update({'n_clusters': params.n_clusters}) # replace non-string init with string for kmeans benchmarks - if alg_instance is not None: - if 'init' in result['algorithm_parameters'].keys(): - if not isinstance(result['algorithm_parameters']['init'], str): - result['algorithm_parameters']['init'] = 'random' - result['algorithm_parameters'].pop('handle',None) + result = update_algorithm_parameters(result, alg_instance, alg_params) output.append(result) print(json.dumps(output, indent=4)) From 0da84144944cebae2a259e3d70d930f5a0a3332d Mon Sep 17 00:00:00 2001 From: LyndonFan Date: Fri, 1 Oct 2021 15:09:36 +0100 Subject: [PATCH 10/41] Revert "refactor?" 3509b1de53e440601a9fbdb1acc12861fd8e5342 From 5194413dd3047baac0d2e8fbe78b5947070ba81e Mon Sep 17 00:00:00 2001 From: LyndonFan Date: Fri, 1 Oct 2021 15:13:25 +0100 Subject: [PATCH 11/41] undo unecessary change --- bench.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/bench.py b/bench.py index dfb980903..315f89a43 100644 --- a/bench.py +++ b/bench.py @@ -480,7 +480,8 @@ def load_data(params, generated_data=[], add_dtype=False, label_2d=False, return tuple(full_data.values()) -def gen_basic_dict(library, algorithm, stage, params, data): +def gen_basic_dict(library, algorithm, stage, params, data, alg_instance=None, + alg_params=None): result = { 'library': library, 'algorithm': algorithm, @@ -495,9 +496,6 @@ def gen_basic_dict(library, algorithm, stage, params, data): 'columns': data.shape[1] } } - return result - -def update_algorithm_parameters(result, alg_instance=None, alg_params=None): result['algorithm_parameters'] = {} if alg_instance is not None: if 'Booster' in str(type(alg_instance)): @@ -509,15 +507,8 @@ def update_algorithm_parameters(result, alg_instance=None, alg_params=None): alg_instance_params['dtype'] = str( alg_instance_params['dtype']) result['algorithm_parameters'].update(alg_instance_params) - if 'init' in result['algorithm_parameters']: - if not isinstance(result['algorithm_parameters']['init'], str): - result['algorithm_parameters']['init'] = 'random' if alg_params is not None: result['algorithm_parameters'].update(alg_params) - if 'init' in result['algorithm_parameters'].keys(): - if not isinstance(result['algorithm_parameters']['init'], str): - result['algorithm_parameters']['init'] = 'random' - result['algorithm_parameters'].pop('handle',None) return result @@ -545,7 +536,11 @@ def print_output(library, algorithm, stages, params, functions, elif algorithm == 'dbscan': result.update({'n_clusters': params.n_clusters}) # replace non-string init with string for kmeans benchmarks - result = update_algorithm_parameters(result, alg_instance, alg_params) + if alg_instance is not None: + if 'init' in result['algorithm_parameters'].keys(): + if not isinstance(result['algorithm_parameters']['init'], str): + result['algorithm_parameters']['init'] = 'random' + result['algorithm_parameters'].pop('handle',None) output.append(result) print(json.dumps(output, indent=4)) From 778f3c086ec91b6ae58fb2f30c52923340e2df0a Mon Sep 17 00:00:00 2001 From: LyndonFan Date: Fri, 1 Oct 2021 15:21:06 +0100 Subject: [PATCH 12/41] refactor load_data? --- bench.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bench.py b/bench.py index 315f89a43..fdc30ffd6 100644 --- a/bench.py +++ b/bench.py @@ -438,6 +438,7 @@ def load_data(params, generated_data=[], add_dtype=False, label_2d=False, for element in full_data: file_arg = f'file_{element}' # load and convert data from npy/csv file if path is specified + new_dtype = int_dtype if 'y' in element and int_label else params.dtype if param_vars[file_arg] is not None: if param_vars[file_arg].name.endswith('.npy'): data = np.load(param_vars[file_arg].name, allow_pickle=True) @@ -445,7 +446,7 @@ def load_data(params, generated_data=[], add_dtype=False, label_2d=False, data = read_csv(param_vars[file_arg].name, params) full_data[element] = convert_data( data, - int_dtype if 'y' in element and int_label else params.dtype, + new_dtype, params.data_order, params.data_format ) if full_data[element] is None: @@ -453,7 +454,7 @@ def load_data(params, generated_data=[], add_dtype=False, label_2d=False, if element in generated_data: full_data[element] = convert_data( np.random.rand(*params.shape), - int_dtype if 'y' in element and int_label else params.dtype, + new_dtype, params.data_order, params.data_format) else: # convert existing labels from 1- to 2-dimensional From a88cac07fde1bc3b4d6d62817922a86919001f12 Mon Sep 17 00:00:00 2001 From: LyndonFan Date: Fri, 1 Oct 2021 15:39:50 +0100 Subject: [PATCH 13/41] refactor load_data? --- bench.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/bench.py b/bench.py index fdc30ffd6..a4152d7fb 100644 --- a/bench.py +++ b/bench.py @@ -459,11 +459,15 @@ def load_data(params, generated_data=[], add_dtype=False, label_2d=False, else: # convert existing labels from 1- to 2-dimensional # if it's forced and possible - if 'y' in element and label_2d and hasattr(full_data[element], 'reshape'): + condition = 'y' in element + condition = condition and label_2d + condition = condition and hasattr(full_data[element], 'reshape') + if condition: full_data[element] = full_data[element].reshape( (full_data[element].shape[0], 1)) + add_dtype = add_dtype and not hasattr(full_data[element], 'dtype') # add dtype property to data if it's needed and doesn't exist - if add_dtype and not hasattr(full_data[element], 'dtype'): + if add_dtype: if hasattr(full_data[element], 'values'): full_data[element].dtype = full_data[element].values.dtype elif hasattr(full_data[element], 'dtypes'): From 982c0f6d73e3c54f6597f9f3006f23039c7daca4 Mon Sep 17 00:00:00 2001 From: LyndonFan Date: Fri, 1 Oct 2021 15:44:03 +0100 Subject: [PATCH 14/41] undo mistake --- bench.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/bench.py b/bench.py index a4152d7fb..cc18decb4 100644 --- a/bench.py +++ b/bench.py @@ -465,9 +465,8 @@ def load_data(params, generated_data=[], add_dtype=False, label_2d=False, if condition: full_data[element] = full_data[element].reshape( (full_data[element].shape[0], 1)) - add_dtype = add_dtype and not hasattr(full_data[element], 'dtype') # add dtype property to data if it's needed and doesn't exist - if add_dtype: + if add_dtype and not hasattr(full_data[element], 'dtype'): if hasattr(full_data[element], 'values'): full_data[element].dtype = full_data[element].values.dtype elif hasattr(full_data[element], 'dtypes'): From 53011f3e706438e0feeee98545d72a1aa0e54360 Mon Sep 17 00:00:00 2001 From: LyndonFan Date: Fri, 1 Oct 2021 16:08:59 +0100 Subject: [PATCH 15/41] undo pbar --- datasets/loader_utils.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/datasets/loader_utils.py b/datasets/loader_utils.py index b65148441..4a378e6ad 100755 --- a/datasets/loader_utils.py +++ b/datasets/loader_utils.py @@ -15,14 +15,17 @@ # =============================================================================== import re -from urllib.request import urlretrieve - +from urllib.request import urlretrieve, Request +import os import numpy as np import tqdm +pbar: tqdm.tqdm = None def _show_progress(block_num: int, block_size: int, total_size: int) -> None: - pbar: tqdm.tqdm = tqdm.tqdm(total=total_size / 1024, unit='kB') + global pbar + if pbar is None: + pbar = tqdm.tqdm(total=total_size / 1024, unit='kB') downloaded = block_num * block_size if downloaded < total_size: @@ -33,7 +36,11 @@ def _show_progress(block_num: int, block_size: int, total_size: int) -> None: def retrieve(url: str, filename: str) -> None: - urlretrieve(url, filename, reporthook=_show_progress) + if url.lower().startswith('http'): + req = Request(url) + elif not os.path.isfile(url): + raise ValueError, None + urlretrieve(url, filename, reporthook=_show_progress) #nosec def read_libsvm_msrank(file_obj, n_samples, n_features, dtype): From 9b041394a96c2fe27a8bb63e8a795a8749e3ecc6 Mon Sep 17 00:00:00 2001 From: LyndonFan Date: Sat, 2 Oct 2021 15:45:49 +0100 Subject: [PATCH 16/41] rewrite urlretrieve w/o urllib --- datasets/loader_utils.py | 44 +++++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/datasets/loader_utils.py b/datasets/loader_utils.py index 4a378e6ad..77f93a839 100755 --- a/datasets/loader_utils.py +++ b/datasets/loader_utils.py @@ -15,32 +15,34 @@ # =============================================================================== import re -from urllib.request import urlretrieve, Request +import requests import os +from urllib.request import urlretrieve +from shutil import copyfile import numpy as np -import tqdm - -pbar: tqdm.tqdm = None - -def _show_progress(block_num: int, block_size: int, total_size: int) -> None: - global pbar - if pbar is None: - pbar = tqdm.tqdm(total=total_size / 1024, unit='kB') - - downloaded = block_num * block_size - if downloaded < total_size: - pbar.update(block_size / 1024) - else: - pbar.close() - pbar = None +from tqdm import tqdm def retrieve(url: str, filename: str) -> None: - if url.lower().startswith('http'): - req = Request(url) - elif not os.path.isfile(url): - raise ValueError, None - urlretrieve(url, filename, reporthook=_show_progress) #nosec + # rewritting urlretrieve without using urllib library, + # otherwise it would fail codefactor test due to security issues. + if os.path.isfile(url): + # reporthook is ignored for local urls + copyfile(url, filename) + elif url.startswith('http'): + response = requests.get(url,stream=True) + if response.status_code != 200: + raise AssertionError(f"Failed to download from {url},\nResponse returned status code {response.status_code}") + total_size = int(response.headers.get('content-length', 0)) + block_size = 8192 + pbar = tqdm(total=total_size/1024, unit='kB') + with open(filename, 'wb+') as file: + for data in response.iter_content(block_size): + pbar.update(len(data)/1024) + file.write(data) + pbar.close() + if total_size != 0 and pbar.n != total_size/1024: + raise AssertionError("Some content was present but not downloaded/written") def read_libsvm_msrank(file_obj, n_samples, n_features, dtype): From 3d4f9455df51d55a6f65e2ee1e98e7e6e96f4f2b Mon Sep 17 00:00:00 2001 From: LyndonFan Date: Sat, 2 Oct 2021 17:22:52 +0100 Subject: [PATCH 17/41] Revert "rewrite urlretrieve w/o urllib" 9b041394a96c2fe27a8bb63e8a795a8749e3ecc6 --- datasets/loader_utils.py | 44 +++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 23 deletions(-) diff --git a/datasets/loader_utils.py b/datasets/loader_utils.py index 77f93a839..4a378e6ad 100755 --- a/datasets/loader_utils.py +++ b/datasets/loader_utils.py @@ -15,34 +15,32 @@ # =============================================================================== import re -import requests +from urllib.request import urlretrieve, Request import os -from urllib.request import urlretrieve -from shutil import copyfile import numpy as np -from tqdm import tqdm +import tqdm +pbar: tqdm.tqdm = None -def retrieve(url: str, filename: str) -> None: - # rewritting urlretrieve without using urllib library, - # otherwise it would fail codefactor test due to security issues. - if os.path.isfile(url): - # reporthook is ignored for local urls - copyfile(url, filename) - elif url.startswith('http'): - response = requests.get(url,stream=True) - if response.status_code != 200: - raise AssertionError(f"Failed to download from {url},\nResponse returned status code {response.status_code}") - total_size = int(response.headers.get('content-length', 0)) - block_size = 8192 - pbar = tqdm(total=total_size/1024, unit='kB') - with open(filename, 'wb+') as file: - for data in response.iter_content(block_size): - pbar.update(len(data)/1024) - file.write(data) +def _show_progress(block_num: int, block_size: int, total_size: int) -> None: + global pbar + if pbar is None: + pbar = tqdm.tqdm(total=total_size / 1024, unit='kB') + + downloaded = block_num * block_size + if downloaded < total_size: + pbar.update(block_size / 1024) + else: pbar.close() - if total_size != 0 and pbar.n != total_size/1024: - raise AssertionError("Some content was present but not downloaded/written") + pbar = None + + +def retrieve(url: str, filename: str) -> None: + if url.lower().startswith('http'): + req = Request(url) + elif not os.path.isfile(url): + raise ValueError, None + urlretrieve(url, filename, reporthook=_show_progress) #nosec def read_libsvm_msrank(file_obj, n_samples, n_features, dtype): From f07e97e93b5db75b5b484b5795e5c2faea53d616 Mon Sep 17 00:00:00 2001 From: LyndonFan Date: Sat, 2 Oct 2021 17:25:58 +0100 Subject: [PATCH 18/41] Reapply "rewrite urlretrieve w/o urllib" 9b041394a96c2fe27a8bb63e8a795a8749e3ecc6 From 58e93a0c7a4d0ed49a378eb8ab051372a424ec46 Mon Sep 17 00:00:00 2001 From: LyndonFan Date: Sat, 2 Oct 2021 17:30:50 +0100 Subject: [PATCH 19/41] fix bug --- bench.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bench.py b/bench.py index cc18decb4..7922f1e5a 100644 --- a/bench.py +++ b/bench.py @@ -456,7 +456,7 @@ def load_data(params, generated_data=[], add_dtype=False, label_2d=False, np.random.rand(*params.shape), new_dtype, params.data_order, params.data_format) - else: + if full_data[element] is not None: # convert existing labels from 1- to 2-dimensional # if it's forced and possible condition = 'y' in element From d258536ca328d2594b7700a9933d1cd48ac1e0d8 Mon Sep 17 00:00:00 2001 From: LyndonFan Date: Sat, 2 Oct 2021 17:22:52 +0100 Subject: [PATCH 20/41] Revert "rewrite urlretrieve w/o urllib" 9b041394a96c2fe27a8bb63e8a795a8749e3ecc6 From f10e26140d9a9f077a4c1ee996438cd6e3759a79 Mon Sep 17 00:00:00 2001 From: LyndonFan Date: Sat, 2 Oct 2021 17:30:50 +0100 Subject: [PATCH 21/41] fix bug --- bench.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bench.py b/bench.py index cc18decb4..7922f1e5a 100644 --- a/bench.py +++ b/bench.py @@ -456,7 +456,7 @@ def load_data(params, generated_data=[], add_dtype=False, label_2d=False, np.random.rand(*params.shape), new_dtype, params.data_order, params.data_format) - else: + if full_data[element] is not None: # convert existing labels from 1- to 2-dimensional # if it's forced and possible condition = 'y' in element From 95a34c1e323eb50e70301a6cfa07bf4eef04a232 Mon Sep 17 00:00:00 2001 From: LyndonFan Date: Sat, 2 Oct 2021 17:25:58 +0100 Subject: [PATCH 22/41] Reapply "rewrite urlretrieve w/o urllib" 9b041394a96c2fe27a8bb63e8a795a8749e3ecc6 From 52c39cf664945af25734a16cee62357ae2ebbf10 Mon Sep 17 00:00:00 2001 From: LyndonFan Date: Sat, 2 Oct 2021 15:45:49 +0100 Subject: [PATCH 23/41] rewrite urlretrieve w/o urllib --- datasets/loader_utils.py | 44 +++++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/datasets/loader_utils.py b/datasets/loader_utils.py index 4a378e6ad..77f93a839 100755 --- a/datasets/loader_utils.py +++ b/datasets/loader_utils.py @@ -15,32 +15,34 @@ # =============================================================================== import re -from urllib.request import urlretrieve, Request +import requests import os +from urllib.request import urlretrieve +from shutil import copyfile import numpy as np -import tqdm - -pbar: tqdm.tqdm = None - -def _show_progress(block_num: int, block_size: int, total_size: int) -> None: - global pbar - if pbar is None: - pbar = tqdm.tqdm(total=total_size / 1024, unit='kB') - - downloaded = block_num * block_size - if downloaded < total_size: - pbar.update(block_size / 1024) - else: - pbar.close() - pbar = None +from tqdm import tqdm def retrieve(url: str, filename: str) -> None: - if url.lower().startswith('http'): - req = Request(url) - elif not os.path.isfile(url): - raise ValueError, None - urlretrieve(url, filename, reporthook=_show_progress) #nosec + # rewritting urlretrieve without using urllib library, + # otherwise it would fail codefactor test due to security issues. + if os.path.isfile(url): + # reporthook is ignored for local urls + copyfile(url, filename) + elif url.startswith('http'): + response = requests.get(url,stream=True) + if response.status_code != 200: + raise AssertionError(f"Failed to download from {url},\nResponse returned status code {response.status_code}") + total_size = int(response.headers.get('content-length', 0)) + block_size = 8192 + pbar = tqdm(total=total_size/1024, unit='kB') + with open(filename, 'wb+') as file: + for data in response.iter_content(block_size): + pbar.update(len(data)/1024) + file.write(data) + pbar.close() + if total_size != 0 and pbar.n != total_size/1024: + raise AssertionError("Some content was present but not downloaded/written") def read_libsvm_msrank(file_obj, n_samples, n_features, dtype): From 5f8f8af505df10c464c0cb2bf93faffa9b9b2d53 Mon Sep 17 00:00:00 2001 From: LyndonFan Date: Sat, 2 Oct 2021 17:53:24 +0100 Subject: [PATCH 24/41] undo refactoring --- bench.py | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/bench.py b/bench.py index 7922f1e5a..c080b05eb 100644 --- a/bench.py +++ b/bench.py @@ -456,21 +456,25 @@ def load_data(params, generated_data=[], add_dtype=False, label_2d=False, np.random.rand(*params.shape), new_dtype, params.data_order, params.data_format) - if full_data[element] is not None: - # convert existing labels from 1- to 2-dimensional - # if it's forced and possible - condition = 'y' in element - condition = condition and label_2d - condition = condition and hasattr(full_data[element], 'reshape') - if condition: - full_data[element] = full_data[element].reshape( - (full_data[element].shape[0], 1)) - # add dtype property to data if it's needed and doesn't exist - if add_dtype and not hasattr(full_data[element], 'dtype'): - if hasattr(full_data[element], 'values'): - full_data[element].dtype = full_data[element].values.dtype - elif hasattr(full_data[element], 'dtypes'): - full_data[element].dtype = full_data[element].dtypes[0].type + # generate and convert data if it's marked and path isn't specified + if full_data[element] is None and element in generated_data: + full_data[element] = convert_data( + np.random.rand(*params.shape), + int_dtype if 'y' in element and int_label else params.dtype, + params.data_order, params.data_format) + # convert existing labels from 1- to 2-dimensional + # if it's forced and possible + if full_data[element] is not None and 'y' in element \ + and label_2d and hasattr(full_data[element], 'reshape'): + full_data[element] = full_data[element].reshape( + (full_data[element].shape[0], 1)) + # add dtype property to data if it's needed and doesn't exist + if full_data[element] is not None and add_dtype and \ + not hasattr(full_data[element], 'dtype'): + if hasattr(full_data[element], 'values'): + full_data[element].dtype = full_data[element].values.dtype + elif hasattr(full_data[element], 'dtypes'): + full_data[element].dtype = full_data[element].dtypes[0].type params.dtype = get_dtype(full_data['X_train']) # add size to parameters which is need for some cases From 4f33ef727f03a0d722da065548601edaceeceb0c Mon Sep 17 00:00:00 2001 From: LyndonFan Date: Sat, 2 Oct 2021 17:57:17 +0100 Subject: [PATCH 25/41] add requests to requirements --- sklearn_bench/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn_bench/requirements.txt b/sklearn_bench/requirements.txt index 28c7de80d..fa269e6cb 100755 --- a/sklearn_bench/requirements.txt +++ b/sklearn_bench/requirements.txt @@ -3,3 +3,4 @@ pandas scikit-learn-intelex openpyxl tqdm +requests \ No newline at end of file From 4f3db1c8d06a92b0fff7b5d0b966e9f61b7caade Mon Sep 17 00:00:00 2001 From: LyndonFan Date: Sat, 2 Oct 2021 18:02:16 +0100 Subject: [PATCH 26/41] add requests as requirement --- azure-pipelines.yml | 152 ++++++++++++++++----------------- daal4py_bench/requirements.txt | 1 + xgboost_bench/requirements.txt | 1 + 3 files changed, 78 insertions(+), 76 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 4e9cc4b5f..300a50f33 100755 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -1,80 +1,80 @@ variables: - name: python.version - value: '3.8' + value: "3.8" jobs: -- job: Linux_Sklearn - pool: - vmImage: 'ubuntu-20.04' - steps: - - task: UsePythonVersion@0 - displayName: 'Use Python $(python.version)' - inputs: - versionSpec: '$(python.version)' - - script: | - pip install -r sklearn_bench/requirements.txt - python runner.py --configs configs/testing/sklearn.json - displayName: Run bench -- job: Linux_XGBoost - pool: - vmImage: 'ubuntu-20.04' - steps: - - task: UsePythonVersion@0 - displayName: 'Use Python $(python.version)' - inputs: - versionSpec: '$(python.version)' - - script: | - pip install -r xgboost_bench/requirements.txt - python runner.py --configs configs/testing/xgboost.json --no-intel-optimized - displayName: Run bench -- job: Linux_daal4py - pool: - vmImage: 'ubuntu-20.04' - steps: - - task: UsePythonVersion@0 - displayName: 'Use Python $(python.version)' - inputs: - versionSpec: '$(python.version)' - - script: | - pip install -r daal4py_bench/requirements.txt - python runner.py --configs configs/testing/daal4py.json --no-intel-optimized - displayName: Run bench -- job: Linux_XGBoost_and_daal4py - pool: - vmImage: 'ubuntu-20.04' - steps: - - script: | - conda update -y -q conda - conda create -n bench -q -y -c conda-forge python=3.7 pandas xgboost scikit-learn daal4py tqdm - displayName: Create Anaconda environment - - script: | - . /usr/share/miniconda/etc/profile.d/conda.sh - conda activate bench - python runner.py --configs configs/testing/daal4py_xgboost.json --no-intel-optimized - displayName: Run bench -- job: Pep8 - pool: - vmImage: 'ubuntu-20.04' - steps: - - task: UsePythonVersion@0 - inputs: - versionSpec: '$(python.version)' - addToPath: true - - script: | - python -m pip install --upgrade pip setuptools - pip install flake8 - flake8 --max-line-length=100 --count - displayName: 'PEP 8 check' -- job: Mypy - pool: - vmImage: 'ubuntu-20.04' - steps: - - task: UsePythonVersion@0 - inputs: - versionSpec: '$(python.version)' - addToPath: true - - script: | - python -m pip install --upgrade pip setuptools - pip install mypy data-science-types - mypy . --ignore-missing-imports - displayName: 'mypy check' + - job: Linux_Sklearn + pool: + vmImage: "ubuntu-20.04" + steps: + - task: UsePythonVersion@0 + displayName: "Use Python $(python.version)" + inputs: + versionSpec: "$(python.version)" + - script: | + pip install -r sklearn_bench/requirements.txt + python runner.py --configs configs/testing/sklearn.json + displayName: Run bench + - job: Linux_XGBoost + pool: + vmImage: "ubuntu-20.04" + steps: + - task: UsePythonVersion@0 + displayName: "Use Python $(python.version)" + inputs: + versionSpec: "$(python.version)" + - script: | + pip install -r xgboost_bench/requirements.txt + python runner.py --configs configs/testing/xgboost.json --no-intel-optimized + displayName: Run bench + - job: Linux_daal4py + pool: + vmImage: "ubuntu-20.04" + steps: + - task: UsePythonVersion@0 + displayName: "Use Python $(python.version)" + inputs: + versionSpec: "$(python.version)" + - script: | + pip install -r daal4py_bench/requirements.txt + python runner.py --configs configs/testing/daal4py.json --no-intel-optimized + displayName: Run bench + - job: Linux_XGBoost_and_daal4py + pool: + vmImage: "ubuntu-20.04" + steps: + - script: | + conda update -y -q conda + conda create -n bench -q -y -c conda-forge python=3.7 pandas xgboost scikit-learn daal4py tqdm requests + displayName: Create Anaconda environment + - script: | + . /usr/share/miniconda/etc/profile.d/conda.sh + conda activate bench + python runner.py --configs configs/testing/daal4py_xgboost.json --no-intel-optimized + displayName: Run bench + - job: Pep8 + pool: + vmImage: "ubuntu-20.04" + steps: + - task: UsePythonVersion@0 + inputs: + versionSpec: "$(python.version)" + addToPath: true + - script: | + python -m pip install --upgrade pip setuptools + pip install flake8 requests + flake8 --max-line-length=100 --count + displayName: "PEP 8 check" + - job: Mypy + pool: + vmImage: "ubuntu-20.04" + steps: + - task: UsePythonVersion@0 + inputs: + versionSpec: "$(python.version)" + addToPath: true + - script: | + python -m pip install --upgrade pip setuptools + pip install mypy data-science-types requests + mypy . --ignore-missing-imports + displayName: "mypy check" diff --git a/daal4py_bench/requirements.txt b/daal4py_bench/requirements.txt index 1051f78ca..400c1ab7c 100644 --- a/daal4py_bench/requirements.txt +++ b/daal4py_bench/requirements.txt @@ -3,3 +3,4 @@ pandas < 1.3.0 daal4py openpyxl tqdm +requests \ No newline at end of file diff --git a/xgboost_bench/requirements.txt b/xgboost_bench/requirements.txt index 79bc07cc5..3be916066 100755 --- a/xgboost_bench/requirements.txt +++ b/xgboost_bench/requirements.txt @@ -3,3 +3,4 @@ pandas xgboost openpyxl tqdm +requests \ No newline at end of file From e6846439c8c4e23c270c6ac75705ac29a372006e Mon Sep 17 00:00:00 2001 From: LyndonFan Date: Sun, 3 Oct 2021 08:52:36 +0100 Subject: [PATCH 27/41] fix line too long --- datasets/loader_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/datasets/loader_utils.py b/datasets/loader_utils.py index 77f93a839..c7c922358 100755 --- a/datasets/loader_utils.py +++ b/datasets/loader_utils.py @@ -32,7 +32,8 @@ def retrieve(url: str, filename: str) -> None: elif url.startswith('http'): response = requests.get(url,stream=True) if response.status_code != 200: - raise AssertionError(f"Failed to download from {url},\nResponse returned status code {response.status_code}") + raise AssertionError(f"Failed to download from {url},\n"+\ + "Response returned status code {response.status_code}") total_size = int(response.headers.get('content-length', 0)) block_size = 8192 pbar = tqdm(total=total_size/1024, unit='kB') From bad15adc46920c49fc051d5c2d6c7d0bee5bd544 Mon Sep 17 00:00:00 2001 From: LyndonFan Date: Sun, 3 Oct 2021 09:03:21 +0100 Subject: [PATCH 28/41] attempt to fix mypy error --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 300a50f33..037052a9a 100755 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -75,6 +75,6 @@ jobs: addToPath: true - script: | python -m pip install --upgrade pip setuptools - pip install mypy data-science-types requests + pip install mypy data-science-types requests types-requests mypy . --ignore-missing-imports displayName: "mypy check" From c3d70e0e9e15cc8d62322cade6de363b4a9cc706 Mon Sep 17 00:00:00 2001 From: LyndonFan Date: Sun, 3 Oct 2021 09:09:26 +0100 Subject: [PATCH 29/41] add mising params --- bench.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bench.py b/bench.py index c080b05eb..a43ed1fa6 100644 --- a/bench.py +++ b/bench.py @@ -527,7 +527,8 @@ def print_output(library, algorithm, stages, params, functions, return output = [] for i, stage in enumerate(stages): - result = gen_basic_dict(library, algorithm, stage, params, data[i]) + result = gen_basic_dict(library, algorithm, stage, params, + data[i], alg_instance, alg_params) result.update({'time[s]': times[i]}) if isinstance(metric_type, str): result.update({f'{metric_type}': metrics[i]}) @@ -548,7 +549,7 @@ def print_output(library, algorithm, stages, params, functions, if 'init' in result['algorithm_parameters'].keys(): if not isinstance(result['algorithm_parameters']['init'], str): result['algorithm_parameters']['init'] = 'random' - result['algorithm_parameters'].pop('handle',None) + result['algorithm_parameters'].pop('handle', None) output.append(result) print(json.dumps(output, indent=4)) From f081a0c7a3364b15f412c649106e9b96aa4676a5 Mon Sep 17 00:00:00 2001 From: LyndonFan Date: Sun, 3 Oct 2021 09:15:26 +0100 Subject: [PATCH 30/41] autopep8 fix --- datasets/loader_utils.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/datasets/loader_utils.py b/datasets/loader_utils.py index c7c922358..de172b7e8 100755 --- a/datasets/loader_utils.py +++ b/datasets/loader_utils.py @@ -30,10 +30,10 @@ def retrieve(url: str, filename: str) -> None: # reporthook is ignored for local urls copyfile(url, filename) elif url.startswith('http'): - response = requests.get(url,stream=True) + response = requests.get(url, stream=True) if response.status_code != 200: - raise AssertionError(f"Failed to download from {url},\n"+\ - "Response returned status code {response.status_code}") + raise AssertionError(f"Failed to download from {url},\n" + + "Response returned status code {response.status_code}") total_size = int(response.headers.get('content-length', 0)) block_size = 8192 pbar = tqdm(total=total_size/1024, unit='kB') @@ -43,7 +43,8 @@ def retrieve(url: str, filename: str) -> None: file.write(data) pbar.close() if total_size != 0 and pbar.n != total_size/1024: - raise AssertionError("Some content was present but not downloaded/written") + raise AssertionError( + "Some content was present but not downloaded/written") def read_libsvm_msrank(file_obj, n_samples, n_features, dtype): From f0f7dac73df264816711b95870cc11ab8ddbe1f8 Mon Sep 17 00:00:00 2001 From: LyndonFan Date: Sun, 3 Oct 2021 09:15:36 +0100 Subject: [PATCH 31/41] fix wrong indentation lvl --- bench.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bench.py b/bench.py index a43ed1fa6..3b4700a32 100644 --- a/bench.py +++ b/bench.py @@ -549,7 +549,7 @@ def print_output(library, algorithm, stages, params, functions, if 'init' in result['algorithm_parameters'].keys(): if not isinstance(result['algorithm_parameters']['init'], str): result['algorithm_parameters']['init'] = 'random' - result['algorithm_parameters'].pop('handle', None) + result['algorithm_parameters'].pop('handle', None) output.append(result) print(json.dumps(output, indent=4)) From 51ed719e96fe2f1b2a7a9b63fe7faab989cea6d1 Mon Sep 17 00:00:00 2001 From: LyndonFan Date: Sun, 3 Oct 2021 09:22:37 +0100 Subject: [PATCH 32/41] pep8 fixes? --- bench.py | 2 +- datasets/loader_utils.py | 1 - utils.py | 5 +++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/bench.py b/bench.py index 3b4700a32..18501a807 100644 --- a/bench.py +++ b/bench.py @@ -456,7 +456,7 @@ def load_data(params, generated_data=[], add_dtype=False, label_2d=False, np.random.rand(*params.shape), new_dtype, params.data_order, params.data_format) - # generate and convert data if it's marked and path isn't specified + # generate and convert data if it's marked and path isn't specified if full_data[element] is None and element in generated_data: full_data[element] = convert_data( np.random.rand(*params.shape), diff --git a/datasets/loader_utils.py b/datasets/loader_utils.py index de172b7e8..4385e3dda 100755 --- a/datasets/loader_utils.py +++ b/datasets/loader_utils.py @@ -17,7 +17,6 @@ import re import requests import os -from urllib.request import urlretrieve from shutil import copyfile import numpy as np from tqdm import tqdm diff --git a/utils.py b/utils.py index 0696eb6a7..6e025b804 100755 --- a/utils.py +++ b/utils.py @@ -180,6 +180,7 @@ def generate_cases(params: Dict[str, Union[List[Any], Any]]) -> List[str]: dashes + param + ' ' + str(val) else: dashes = '-' if len(param) == 1 else '--' - for command_num,_ in enumerate(commands): - commands[command_num] += ' ' + dashes + param + ' ' + str(values) + for command_num, _ in enumerate(commands): + commands[command_num] += ' ' + \ + dashes + param + ' ' + str(values) return commands From 38a5355b04aac5b6b84c9578415500f95cc6b303 Mon Sep 17 00:00:00 2001 From: LyndonFan Date: Sun, 3 Oct 2021 09:35:17 +0100 Subject: [PATCH 33/41] undo if return None change --- bench.py | 57 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 28 insertions(+), 29 deletions(-) diff --git a/bench.py b/bench.py index 18501a807..c68e909a5 100644 --- a/bench.py +++ b/bench.py @@ -523,35 +523,34 @@ def gen_basic_dict(library, algorithm, stage, params, data, alg_instance=None, def print_output(library, algorithm, stages, params, functions, times, metric_type, metrics, data, alg_instance=None, alg_params=None): - if params.output_format != 'json': - return - output = [] - for i, stage in enumerate(stages): - result = gen_basic_dict(library, algorithm, stage, params, - data[i], alg_instance, alg_params) - result.update({'time[s]': times[i]}) - if isinstance(metric_type, str): - result.update({f'{metric_type}': metrics[i]}) - elif isinstance(metric_type, list): - for ind, val in enumerate(metric_type): - if metrics[ind][i] is not None: - result.update({f'{val}': metrics[ind][i]}) - if hasattr(params, 'n_classes'): - result['input_data'].update({'classes': params.n_classes}) - if hasattr(params, 'n_clusters'): - if algorithm == 'kmeans': - result['input_data'].update( - {'n_clusters': params.n_clusters}) - elif algorithm == 'dbscan': - result.update({'n_clusters': params.n_clusters}) - # replace non-string init with string for kmeans benchmarks - if alg_instance is not None: - if 'init' in result['algorithm_parameters'].keys(): - if not isinstance(result['algorithm_parameters']['init'], str): - result['algorithm_parameters']['init'] = 'random' - result['algorithm_parameters'].pop('handle', None) - output.append(result) - print(json.dumps(output, indent=4)) + if params.output_format == 'json': + output = [] + for i, stage in enumerate(stages): + result = gen_basic_dict(library, algorithm, stage, params, + data[i], alg_instance, alg_params) + result.update({'time[s]': times[i]}) + if isinstance(metric_type, str): + result.update({f'{metric_type}': metrics[i]}) + elif isinstance(metric_type, list): + for ind, val in enumerate(metric_type): + if metrics[ind][i] is not None: + result.update({f'{val}': metrics[ind][i]}) + if hasattr(params, 'n_classes'): + result['input_data'].update({'classes': params.n_classes}) + if hasattr(params, 'n_clusters'): + if algorithm == 'kmeans': + result['input_data'].update( + {'n_clusters': params.n_clusters}) + elif algorithm == 'dbscan': + result.update({'n_clusters': params.n_clusters}) + # replace non-string init with string for kmeans benchmarks + if alg_instance is not None: + if 'init' in result['algorithm_parameters'].keys(): + if not isinstance(result['algorithm_parameters']['init'], str): + result['algorithm_parameters']['init'] = 'random' + result['algorithm_parameters'].pop('handle', None) + output.append(result) + print(json.dumps(output, indent=4)) def run_with_context(params, function): From 3e875d1d8857f6a718ab40e9b6cc20d92d920044 Mon Sep 17 00:00:00 2001 From: LyndonFan Date: Sun, 3 Oct 2021 10:14:03 +0100 Subject: [PATCH 34/41] not use getattr for daal4py --- daal4py_bench/distances.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/daal4py_bench/distances.py b/daal4py_bench/distances.py index 70408856d..82d2c5ec8 100644 --- a/daal4py_bench/distances.py +++ b/daal4py_bench/distances.py @@ -17,7 +17,7 @@ import argparse import bench -import daal4py +from daal4py import cosine_distance, correlation_distance from daal4py.sklearn._utils import getFPType @@ -34,9 +34,10 @@ def compute_distances(pairwise_distances, X): params = bench.parse_args(parser) # Load data -X, _, _, _ = bench.load_data(params, generated_data=['X_train'], add_dtype=True) +X, _, _, _ = bench.load_data(params, generated_data=[ + 'X_train'], add_dtype=True) -pairwise_distances = getattr(daal4py, f'{params.metric}_distance') +pairwise_distances = cosine_distance if params.metric == 'cosine' else correlation_distance time, _ = bench.measure_function_time( compute_distances, pairwise_distances, X, params=params) From 0d744d4bf33f6d9c2d403c8006e4c53d8f7f87b0 Mon Sep 17 00:00:00 2001 From: LyndonFan Date: Sun, 3 Oct 2021 10:14:09 +0100 Subject: [PATCH 35/41] debugging for tsne --- sklearn_bench/tsne.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sklearn_bench/tsne.py b/sklearn_bench/tsne.py index 0083972a3..eb3fa8b91 100644 --- a/sklearn_bench/tsne.py +++ b/sklearn_bench/tsne.py @@ -29,9 +29,13 @@ def main(): learning_rate=params.learning_rate, angle=params.angle, min_grad_norm=params.min_grad_norm, random_state=params.random_state) + print("Created TSNE model") + fit_time, _ = bench.measure_function_time(tsne.fit, X, params=params) divergence = tsne.kl_divergence_ + print("Ready to print output") + bench.print_output( library='sklearn', algorithm='TSNE', From c2139cbad50b62a1f7fc78698f57a04270050de9 Mon Sep 17 00:00:00 2001 From: LyndonFan Date: Sun, 3 Oct 2021 10:24:31 +0100 Subject: [PATCH 36/41] undo logging for tsne --- sklearn_bench/tsne.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/sklearn_bench/tsne.py b/sklearn_bench/tsne.py index eb3fa8b91..0083972a3 100644 --- a/sklearn_bench/tsne.py +++ b/sklearn_bench/tsne.py @@ -29,13 +29,9 @@ def main(): learning_rate=params.learning_rate, angle=params.angle, min_grad_norm=params.min_grad_norm, random_state=params.random_state) - print("Created TSNE model") - fit_time, _ = bench.measure_function_time(tsne.fit, X, params=params) divergence = tsne.kl_divergence_ - print("Ready to print output") - bench.print_output( library='sklearn', algorithm='TSNE', From b8a162cfc63a06bdd254952159cd60f6a82af548 Mon Sep 17 00:00:00 2001 From: LyndonFan Date: Sun, 3 Oct 2021 10:25:41 +0100 Subject: [PATCH 37/41] ignore daal4py warning --- runner.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/runner.py b/runner.py index 99d992704..685d5fd62 100755 --- a/runner.py +++ b/runner.py @@ -120,7 +120,8 @@ def get_configs(path: Path) -> List[str]: if 'testing' in dataset: paths += ' --file-X-test ' + dataset["testing"]["x"] if 'y' in dataset['testing']: - paths += ' --file-y-test ' + dataset["testing"]["y"] + paths += ' --file-y-test ' + \ + dataset["testing"]["y"] elif dataset['source'] == 'synthetic': class GenerationArgs: classes: int @@ -214,14 +215,18 @@ class GenerationArgs: + f'{extra_stdout}\n' try: if isinstance(json_result['results'], list): - json_result['results'].extend(json.loads(stdout)) + json_result['results'].extend( + json.loads(stdout)) except json.JSONDecodeError as decoding_exception: stderr += f'CASE {case} JSON DECODING ERROR:\n' \ + f'{decoding_exception}\n{stdout}\n' if stderr != '': - is_successful = False - logging.warning('Error in benchmark: \n' + stderr) + if stderr != 'root:Device support is limited in daal4py patching. ' + 'Use Intel(R) Extension for Scikit-learn * for full experience.': + is_successful = False + logging.warning( + 'Error in benchmark: \n' + stderr) json.dump(json_result, args.output_file, indent=4) name_result_file = args.output_file.name From f265af323e85953cacbb4f102b2c29dbf878d326 Mon Sep 17 00:00:00 2001 From: LyndonFan Date: Sun, 3 Oct 2021 10:27:20 +0100 Subject: [PATCH 38/41] fix typo --- runner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/runner.py b/runner.py index 685d5fd62..76316bd38 100755 --- a/runner.py +++ b/runner.py @@ -222,8 +222,8 @@ class GenerationArgs: + f'{decoding_exception}\n{stdout}\n' if stderr != '': - if stderr != 'root:Device support is limited in daal4py patching. ' - 'Use Intel(R) Extension for Scikit-learn * for full experience.': + if stderr != 'root:Device support is limited in daal4py patching. ' \ + + 'Use Intel(R) Extension for Scikit-learn * for full experience.': is_successful = False logging.warning( 'Error in benchmark: \n' + stderr) From 182b256440c705a34aedd51a1aac9cdf562b3a87 Mon Sep 17 00:00:00 2001 From: LyndonFan Date: Sun, 3 Oct 2021 10:45:51 +0100 Subject: [PATCH 39/41] suppress FutureWarning --- sklearn_bench/tsne.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn_bench/tsne.py b/sklearn_bench/tsne.py index 0083972a3..2d9f2d0aa 100644 --- a/sklearn_bench/tsne.py +++ b/sklearn_bench/tsne.py @@ -14,8 +14,10 @@ # limitations under the License. # =============================================================================== -import argparse import bench +import argparse +import warnings +warnings.simplefilter(action='ignore', category=FutureWarning) def main(): From 6fb61f2e6f23ac86a6060a46c6f8ae0ec875211a Mon Sep 17 00:00:00 2001 From: LyndonFan Date: Sun, 3 Oct 2021 10:46:05 +0100 Subject: [PATCH 40/41] ignore daal4py warning --- runner.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/runner.py b/runner.py index 76316bd38..ece6bf07c 100755 --- a/runner.py +++ b/runner.py @@ -222,8 +222,7 @@ class GenerationArgs: + f'{decoding_exception}\n{stdout}\n' if stderr != '': - if stderr != 'root:Device support is limited in daal4py patching. ' \ - + 'Use Intel(R) Extension for Scikit-learn * for full experience.': + if not 'daal4py' in stderr: is_successful = False logging.warning( 'Error in benchmark: \n' + stderr) From f442be85491c69e8240e9d847321bd7cf1f0f4bc Mon Sep 17 00:00:00 2001 From: LyndonFan Date: Sun, 3 Oct 2021 10:49:54 +0100 Subject: [PATCH 41/41] pep8 fix --- runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runner.py b/runner.py index ece6bf07c..a8ab77e5d 100755 --- a/runner.py +++ b/runner.py @@ -222,7 +222,7 @@ class GenerationArgs: + f'{decoding_exception}\n{stdout}\n' if stderr != '': - if not 'daal4py' in stderr: + if 'daal4py' not in stderr: is_successful = False logging.warning( 'Error in benchmark: \n' + stderr)