From d6b0039f52236604e2606fc84dcf3f683a6aec6c Mon Sep 17 00:00:00 2001 From: OnlyDeniko Date: Wed, 4 Aug 2021 13:38:52 +0300 Subject: [PATCH 01/17] initial --- report_generator/README.md | 8 +- .../default_report_gen_config.json | 25 +- report_generator/report_generator.py | 557 +++++++++++++----- 3 files changed, 420 insertions(+), 170 deletions(-) diff --git a/report_generator/README.md b/report_generator/README.md index 3bb2dba4c..3a6faed68 100755 --- a/report_generator/README.md +++ b/report_generator/README.md @@ -2,10 +2,14 @@ Report generator produces Excel table file from json benchmark log files. -Run `python report_generator.py --result-files bench_log_1.json,bench_log_2.json [--report-file new_report.xlsx --generation-config gen_config.json --merging none]` to launch report generation. +Run `python report_generator.py --result-files bench_log_1.json,bench_log_2.json [--report-file new_report.xlsx --generation-config default_report_gen_config.json]` to launch report generation. runner options: * ``result-files`` : comma-separated benchmark json result file paths * ``report-file`` : report file path * ``generation-config`` : generation configuration file path -* ``merging``: *full*, *none*, *sw_only*, *hw_only*. How to merge same cases in benchmark logs + +config parameters: +* ``header`` : Column names in the table header. These parameters are also used to compare reports. If the name is compound, then it should be separated by the '':'' symbol +* ``comparison_method`` : Comparison of two results. The options are: ``1 operation 2`` or ``2 operation 1`` +* ``aggregation_metrics`` : Metric applied to columns with comparisons of two reports. For each of these metrics, a separate sheet with a summary is compiled. It is important that the function is in Excel. 
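The compound ``header`` names are resolved against nested keys in each benchmark result entry. A minimal sketch of that lookup, mirroring the ``get_property`` helper introduced in this patch (the sample entry below is illustrative, not taken from a real benchmark log):

```python
from typing import Any, Dict, Optional


def get_property(entry: Dict[str, Any], prop: str) -> Optional[Any]:
    # Follow the ':'-separated path through nested dicts, e.g. "input_data:rows"
    value = entry
    for key in prop.split(':'):
        if key not in value:
            return None  # this key is not present in the result entry
        value = value[key]
    return value


# Illustrative result entry
entry = {"algorithm": "kmeans", "stage": "training",
         "input_data": {"rows": 10000, "columns": 50}}
print(get_property(entry, "input_data:rows"))     # 10000
print(get_property(entry, "input_data:classes"))  # None
```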
\ No newline at end of file diff --git a/report_generator/default_report_gen_config.json b/report_generator/default_report_gen_config.json index 677d12f11..04eef0ba8 100755 --- a/report_generator/default_report_gen_config.json +++ b/report_generator/default_report_gen_config.json @@ -1,5 +1,5 @@ { - "align": [ + "header": [ "algorithm", "stage", "input_data:data_order", @@ -10,9 +10,22 @@ "input_data:classes", "input_data:n_clusters" ], - "diff": [ - "software_hash", - "hardware_hash", - "measurement_time" + "comparison_method": { + "time[s]": "2 / 1", + "davies_bouldin_score": "2 / 1", + "inertia": "2 / 1", + "iter": "2 / 1", + "noise_variance": "2 / 1", + "accuracy": "2 / 1", + "log_loss": "2 / 1", + "roc_auc": "2 / 1", + "rmse": "2 / 1", + "r2_score": "2 / 1", + "n_sv": "2 / 1", + "n_clusters": "2 / 1" + }, + "aggregation_metrics": [ + "geomean", + "average" ] -} +} \ No newline at end of file diff --git a/report_generator/report_generator.py b/report_generator/report_generator.py index 8f1c0d601..d9edadde8 100755 --- a/report_generator/report_generator.py +++ b/report_generator/report_generator.py @@ -18,82 +18,180 @@ import datetime import hashlib import json -from string import ascii_uppercase -from typing import Any, List +from typing import Any, List, Dict +from openpyxl.formatting.rule import ColorScaleRule +from openpyxl.styles.numbers import FORMAT_NUMBER_00 +from openpyxl.styles import Font +from openpyxl.utils import get_column_letter import openpyxl -def get_property(entry, prop): +def get_property(entry: Dict[str, Any], prop: str): keys = prop.split(':') value = entry for key in keys: + if key not in value: + return None value = value[key] return value -def result_entries_have_same_values(first_entry, second_entry, props, - error_on_missing=True): - res = True - for prop in props: - try: - res = res and \ - (get_property(first_entry, prop) == get_property(second_entry, prop)) - except KeyError: - if error_on_missing: - raise KeyError() - return res - - -def result_entries_are_equal(first_entry, second_entry, config): - props = config['align'] + config['diff'] - return result_entries_have_same_values(first_entry, second_entry, props, True) - - -def result_entries_are_comparable(first_entry, second_entry, config): - props = config['align'] - return result_entries_have_same_values(first_entry, second_entry, props, False) +def xy_to_excel_cell(x: int, y: int) -> str: + return '{}{}'.format(get_column_letter(x + 1), y + 1) -def result_entries_have_same_diff(first_entry, second_entry, config): - props = config['diff'] - return result_entries_have_same_values(first_entry, second_entry, props, False) +def get_excel_cell(work_sheet, x: int, y: int): + return work_sheet[xy_to_excel_cell(x, y)] -def results_are_mergeable(first_res, second_res, merging): - hw_hash_equality = first_res['hardware_hash'] == second_res['hardware_hash'] - sw_hash_equality = first_res['software_hash'] == second_res['software_hash'] - if merging == 'hw_only': - return hw_hash_equality - elif merging == 'sw_only': - return sw_hash_equality - else: - return sw_hash_equality and hw_hash_equality - - -excel_header_columns = list(ascii_uppercase) -for sym1 in ascii_uppercase: - for sym2 in ascii_uppercase: - excel_header_columns.append(sym1 + sym2) +def write_cell(work_sheet, x: int, y: int, value: str, bold=False) -> None: + work_sheet[xy_to_excel_cell(x, y)] = value + work_sheet[xy_to_excel_cell(x, y)].number_format = FORMAT_NUMBER_00 + if bold: + work_sheet[xy_to_excel_cell(x, y)].font = Font(bold=True) -def 
xy_to_excel_cell(x, y): - return '{}{}'.format(excel_header_columns[x], y + 1) +def is_equal_dict(a: Dict[str, Any], b: Dict[str, Any], props: List[str]) -> bool: + for prop in props: + if get_property(a, prop) != get_property(b, prop): + return False + return True -def write_cell(work_sheet, x, y, value): - work_sheet[xy_to_excel_cell(x, y)] = value +def get_metrics(report: Dict[str, Any]) -> List[str]: + metrics = list() + was = False + for i in report: + if i == "time[s]": + was = True + continue + if was: + metrics.append(i) + return metrics -def create_list(res_entry, props_list): - line = [] - for prop in props_list: - try: - val = get_property(res_entry, prop) - except BaseException: - val = '' - line.append(val) - return line +def make_unique(a: List[Any]) -> List[Any]: + result = list() + d = dict() + for i in a: + if i in d: + continue + d[i] = 1 + result.append(i) + return result + + +def get_range( + start_x: int, + finish_x: int, + start_y: int, + finish_y: int, +) -> str: + return xy_to_excel_cell(start_x, start_y) + ':' + \ + xy_to_excel_cell(finish_x, finish_y) + + +def can_convert_to_float(string: str) -> bool: + try: + float(string) + except ValueError: + return False + return True + + +def write_aggregation_metric( + ws, + write_x: int, + write_y: int, + metric_range: str, + metric_name: str, +) -> None: + metric_string = '=' + metric_name + '(' + metric_range + ')' + write_cell( + ws, + write_x, + write_y, + metric_string, + ) + + +def write_header_of_sheet( + work_sheet, + header_columns: List[str], + y_offset: int, + metrics: List[str], + agg_offset: int, + agg_metrics: List[str], + json_results: List[Dict[str, Any]], + LEFT_OFFSET: int, +) -> None: + # write header + for ind, val in enumerate(header_columns): + write_cell(work_sheet, ind, y_offset, val, bold=True) + # write aggregation metrics + if len(json_results) >= 2: + for ind, val in enumerate(agg_metrics): + write_cell( + work_sheet, + LEFT_OFFSET + len(json_results) - 1, + agg_offset + ind, + val, + bold=True + ) + # write names of metrics and jsons + metric_offset = 0 + for metric in metrics: + write_cell( + work_sheet, + LEFT_OFFSET + metric_offset, + y_offset - 1, + metric, + bold=True, + ) + for json_res in json_results: + write_cell( + work_sheet, + LEFT_OFFSET + metric_offset, + y_offset, + json_res["file_name"], + bold=True, + ) + metric_offset += 1 + for i in range(len(json_results)): + for j in range(i + 1, len(json_results)): + write_cell( + work_sheet, + LEFT_OFFSET + metric_offset, + y_offset, + json_results[i]['file_name'] + ' vs ' + json_results[j]['file_name'], + bold=True, + ) + metric_offset += 1 + + +def get_color_rule(metric: str) -> Any: + if metric in ['geomean', 'time[s]']: + return ColorScaleRule( + start_type='num', start_value=0.5, start_color='FF0000', + mid_type='num', mid_value=1, mid_color='FFFF00', + end_type='num', end_value=5, end_color='00FF00') + if metric == 'average': + return ColorScaleRule( + start_type='num', start_value=-3, start_color='FF0000', + mid_type='num', mid_value=0, mid_color='FFFF00', + end_type='num', end_value=3, end_color='00FF00') + return ColorScaleRule( + start_type='percentile', start_value=10, start_color='FF0000', + mid_type='percentile', mid_value=50, mid_color='FFFF00', + end_type='percentile', end_value=90, end_color='00FF00') + + +def get_ratio_string(a: str, b: str, comparison_method: str) -> str: + splited_comparison_method = comparison_method.split(' ') + if splited_comparison_method[0] == "2": + a, b = b, a + return '=' + a + 
splited_comparison_method[1] + b parser = argparse.ArgumentParser() @@ -103,20 +201,20 @@ def create_list(res_entry, props_list): default=f'report_{str(datetime.date.today())}.xlsx') parser.add_argument('--generation-config', type=str, default='default_report_gen_config.json') -parser.add_argument('--merging', type=str, default='none', - choices=('full', 'none', 'sw_only', 'hw_only')) args = parser.parse_args() -json_results = [] +# Read input json(s) +json_results: List[Dict[str, Any]] = list() for file_name in args.result_files.split(','): with open(file_name, 'r') as file: - json_results.append(json.load(file)) + res = json.load(file) + res['file_name'] = file_name + json_results.append(res) +# Read config with open(args.generation_config, 'r') as file: gen_config = json.load(file) -wb = openpyxl.Workbook() - # compute hash for software and hardware configurations HASH_LIMIT = 8 for i, json_res in enumerate(json_results): @@ -125,108 +223,242 @@ def create_list(res_entry, props_list): h.update(bytes(str(json_res[ware]), encoding='utf-8')) json_res[f'{ware}_hash'] = h.hexdigest()[:HASH_LIMIT] -# create list of all result entry from all json logs -all_res_entries = [] -for i, json_res in enumerate(json_results): - extra_entry_info = json_res.copy() - del extra_entry_info['results'] - for res_entry in json_res['results']: - new_res_entry = res_entry.copy() - new_res_entry.update(extra_entry_info) - all_res_entries.append(new_res_entry) - -if args.merging != 'none': - for i, resi_entry in enumerate(all_res_entries): - already_exist = False - for j, resj_entry in enumerate(all_res_entries): - if i == j or resi_entry == {} or resj_entry == {}: - continue - if result_entries_are_equal(resi_entry, resj_entry, gen_config): - if resi_entry['measurement_time'] < resj_entry['measurement_time']: - resi_entry = resj_entry - resj_entry = {} - -while {} in all_res_entries: - all_res_entries.remove({}) - -diff_combinations: List[Any] = [] -for i, res_entry in enumerate(all_res_entries): - already_exist = False - for diff_comb in diff_combinations: - if result_entries_have_same_diff(res_entry, diff_comb, gen_config): - already_exist = True - break - if not already_exist: - diff_comb = res_entry.copy() - diff_combinations.append(diff_comb) - -align_combinations: List[Any] = [] -for i, res_entry in enumerate(all_res_entries): - already_exist = False - for align_comb in align_combinations: - if result_entries_are_comparable(res_entry, align_comb, gen_config): - already_exist = True - break - if not already_exist: - align_comb = res_entry.copy() - align_combinations.append(align_comb) - -HEAD_OFFSET = len(gen_config['diff']) -LEFT_OFFSET = len(gen_config['align']) - -stages_splitter = { - 'training': ['training_preparation', 'training', 'computation'], - 'inference': ['prediction_preparation', 'prediction', 'alternative_prediction', - 'transformation', 'search', 'predict_proba'] -} -possible_metrics = {'accuracy', 'accuracy[%]', 'rmse', - 'davies_bouldin_score', 'inertia', 'log_loss'} - -for stage_key in stages_splitter.keys(): - ws = wb.create_sheet(title=f'Results ({stage_key})') - - for i, col in enumerate(gen_config['align'] + ['time, s', 'metric type', 'metric']): - write_cell(ws, i, HEAD_OFFSET, col) - - for i, row in enumerate(gen_config['diff']): - write_cell(ws, LEFT_OFFSET - 1, i, row) - - stage_align_combinations = align_combinations.copy() - for align_comb in align_combinations: - if align_comb['stage'] not in stages_splitter[stage_key]: - stage_align_combinations.remove(align_comb) - - for i, 
align_comb in enumerate(stage_align_combinations): - arr = create_list(align_comb, gen_config['align']) - for j, el in enumerate(arr): - write_cell(ws, j, HEAD_OFFSET + 1 + i, el) - - for i, diff_comb in enumerate(diff_combinations): - arr = create_list(diff_comb, gen_config['diff']) - for j, el in enumerate(arr): - write_cell(ws, LEFT_OFFSET + i, j, el) - - for i, res_entry in enumerate(all_res_entries): - if res_entry['stage'] not in stages_splitter[stage_key]: +# getting metrics for each algorithm +available_algos_and_metrics: Dict[str, List[str]] = dict() +for json_res in json_results: + for report in json_res['results']: + metrics: List[str] = get_metrics(report) + if report['algorithm'] in available_algos_and_metrics: + available_algos_and_metrics[report['algorithm']] += metrics + else: + available_algos_and_metrics[report['algorithm']] = metrics + +for ind, val in enumerate(available_algos_and_metrics): + available_algos_and_metrics[val] = ['time[s]'] + make_unique(available_algos_and_metrics[val]) + + +HEAD_OFFSET = 3 +LEFT_OFFSET = len(gen_config['header']) +JSON_RESULTS_LEN = len(json_results) + +stages: List[str] = [ + 'training_preparation', + 'training', + 'computation', + 'prediction_preparation', + 'prediction', + 'alternative_prediction', + 'transformation', + 'search', + 'predict_proba', +] + +summary: Dict[str, Dict[str, Dict[str, Dict[str, str]]]] = dict() +wb = openpyxl.Workbook() + +for algo in available_algos_and_metrics: + # algo[:31] because excel warning about length of sheet name no more than 31 symbols + ws = wb.create_sheet(title=f'{algo[:31]}') + # writing table header + for offset, val in enumerate(['file_name', 'software_hash', 'hardware_hash']): + write_cell(ws, 0, offset, val) + for i, json_res in enumerate(json_results): + write_cell(ws, i + 1, offset, json_res[val]) + + y_offset = 0 + for stage_key in stages: + # list of already used results + used = [ + [False for j in range(len(json_results[i]['results']))] + for i in range(len(json_results)) + ] + begin_y_offset = y_offset + for json_res_ind, json_res in enumerate(json_results): + for report_ind, report in enumerate(json_res['results']): + if report['stage'] != stage_key or \ + report['algorithm'] != algo or \ + used[json_res_ind][report_ind] is True: + continue + # write parameters + for offset, config in enumerate(gen_config['header']): + write_cell(ws, offset, HEAD_OFFSET + 1 + y_offset, get_property(report, config)) + # write all metrics in report + metric_offset = 0 + for metric in available_algos_and_metrics[algo]: + write_cell( + ws, + LEFT_OFFSET + metric_offset + json_res_ind, HEAD_OFFSET + 1 + y_offset, + get_property(report, metric), + ) + metric_offset += JSON_RESULTS_LEN * (JSON_RESULTS_LEN + 1) // 2 + used[json_res_ind][report_ind] = True + + # try to find in other configs report with same parameters + for json_res_comp_ind, json_res_comp in enumerate(json_results[json_res_ind + 1:]): + original_index = json_res_ind + 1 + json_res_comp_ind + for report_comp_ind, report_comp in enumerate(json_res_comp['results']): + if report_comp['stage'] != stage_key or \ + report_comp['algorithm'] != algo or \ + used[original_index][report_comp_ind] is True or \ + not is_equal_dict(report, report_comp, gen_config['header']): + continue + metric_offset = 0 + for metric in available_algos_and_metrics[algo]: + write_cell( + ws, + LEFT_OFFSET + original_index + metric_offset, + HEAD_OFFSET + y_offset + 1, + get_property(report_comp, metric) + ) + metric_offset += JSON_RESULTS_LEN * (JSON_RESULTS_LEN + 1) // 
2 + used[original_index][report_comp_ind] = True + y_offset += 1 + + if y_offset == begin_y_offset: + # nothing was written, so do not have to write header & do comparison continue - x: int - y: int - for j, align_comb in enumerate(stage_align_combinations): - if result_entries_are_comparable(res_entry, align_comb, gen_config): - y = j - break - for j, diff_comb in enumerate(diff_combinations): - if result_entries_have_same_diff(res_entry, diff_comb, gen_config): - x = j - break - write_cell(ws, LEFT_OFFSET + x, HEAD_OFFSET + 1 + y, res_entry['time[s]']) - for metric_type in possible_metrics: - if metric_type in res_entry: - write_cell(ws, LEFT_OFFSET + x + 1, HEAD_OFFSET + 1 + y, metric_type) - write_cell(ws, LEFT_OFFSET + x + 2, HEAD_OFFSET + 1 + y, res_entry[metric_type]) - break - -# write configs + write_header_of_sheet( + ws, + gen_config['header'], + HEAD_OFFSET + begin_y_offset, + available_algos_and_metrics[algo], + HEAD_OFFSET + y_offset + 1, + gen_config['aggregation_metrics'], + json_results, + LEFT_OFFSET, + ) + # write aggregation metric & save info for summary + metric_offset = JSON_RESULTS_LEN + for metric in available_algos_and_metrics[algo]: + comparison_offset = 0 + for i in range(JSON_RESULTS_LEN): + for j in range(i + 1, JSON_RESULTS_LEN): + # comprasion + for y in range(HEAD_OFFSET + begin_y_offset + 1, HEAD_OFFSET + y_offset + 1): + first_offset = LEFT_OFFSET + i + metric_offset - JSON_RESULTS_LEN + second_offset = LEFT_OFFSET + j + metric_offset - JSON_RESULTS_LEN + first_cell = get_excel_cell(ws, first_offset, y) + second_cell = get_excel_cell(ws, second_offset, y) + + if first_cell.value is None or\ + second_cell.value is None or \ + not can_convert_to_float(str(first_cell.value)) or \ + not can_convert_to_float(str(second_cell.value)): + continue + if metric not in gen_config['comparison_method']: + raise ValueError( + f'Please add comparison_method ' + f'for {metric} in configuration file') + write_cell( + ws, + LEFT_OFFSET + metric_offset + comparison_offset, + y, + get_ratio_string( + xy_to_excel_cell(first_offset, y), + xy_to_excel_cell(second_offset, y), + gen_config['comparison_method'][metric], + ) + ) + # fill comparison range by color rule + ws.conditional_formatting.add( + get_range( + LEFT_OFFSET + metric_offset + comparison_offset, + LEFT_OFFSET + metric_offset + comparison_offset, + HEAD_OFFSET + 1 + begin_y_offset, + HEAD_OFFSET + y_offset, + ), + get_color_rule(metric), + ) + # write aggregation metric + for agg_offset, agg_metric in enumerate(gen_config['aggregation_metrics']): + write_aggregation_metric( + ws, + LEFT_OFFSET + metric_offset + comparison_offset, + HEAD_OFFSET + 1 + y_offset + agg_offset, + get_range( + LEFT_OFFSET + metric_offset + comparison_offset, + LEFT_OFFSET + metric_offset + comparison_offset, + HEAD_OFFSET + 1 + begin_y_offset, + HEAD_OFFSET + y_offset, + ), + agg_metric, + ) + + column_name = \ + json_results[i]['file_name'] + \ + ' vs ' + \ + json_results[j]['file_name'] + \ + ' (' + stage_key + ')' + + cell_name_to_summary = \ + '=' + algo[:31] + '!' 
+ \ + xy_to_excel_cell(LEFT_OFFSET + metric_offset + comparison_offset, + HEAD_OFFSET + 1 + y_offset + agg_offset) + if agg_metric not in summary: + summary[agg_metric] = dict() + if column_name not in summary[agg_metric]: + summary[agg_metric][column_name] = dict() + if algo not in summary[agg_metric][column_name]: + summary[agg_metric][column_name][algo] = dict() + summary[agg_metric][column_name][algo].update( + {f'{metric}': cell_name_to_summary}) + comparison_offset += 1 + metric_offset += JSON_RESULTS_LEN * (JSON_RESULTS_LEN + 1) // 2 + # for comfortable view + y_offset += len(gen_config['aggregation_metrics']) + 3 + +# write summary for each aggregation metric +for agg_metric in gen_config['aggregation_metrics']: + if JSON_RESULTS_LEN == 1: + continue + y_offset = 0 + # write summary + ws = wb.create_sheet('Summary' + f' ({agg_metric})', 0) + for name_ind, name in enumerate(summary[agg_metric]): + # write table name + write_cell(ws, 0, y_offset, name, bold=True) + # getting unique list of metrics on current comparison + metrics_in_current_summary = list() + for algo in summary[agg_metric][name]: + for metric in summary[agg_metric][name][algo]: + metrics_in_current_summary.append(metric) + metrics_in_current_summary = make_unique(metrics_in_current_summary) + + # fill table + for metric_ind, metric in enumerate(metrics_in_current_summary): + # write metric name + write_cell(ws, metric_ind + 1, y_offset + 1, metric) + for algo_ind, algo in enumerate(summary[agg_metric][name]): + if metric not in summary[agg_metric][name][algo]: + continue + # write algorithm name + write_cell( + ws, + 0, + y_offset + algo_ind + 2, + algo + ) + # write geomean + write_cell( + ws, + metric_ind + 1, + y_offset + algo_ind + 2, + summary[agg_metric][name][algo][metric] + ) + + # color some range by color rule + ws.conditional_formatting.add( + get_range( + 1, + len(metrics_in_current_summary), + y_offset + 2, + y_offset + len(summary[agg_metric][name]) + 1, + ), + get_color_rule(agg_metric), + ) + y_offset += len(summary[agg_metric][name]) + 3 + +# write hardware & software configs for i, json_res in enumerate(json_results): ws = wb.create_sheet(title=f"SW config n{i}_{json_res['software_hash']}") ws[xy_to_excel_cell(0, 0)] = \ @@ -242,4 +474,5 @@ def create_list(res_entry, props_list): for j in range(len(hw_conf)): ws[xy_to_excel_cell(0, 1 + j)] = hw_conf[j] +wb.remove(wb['Sheet']) wb.save(args.report_file) From 36f15bb0518c726543d80e14f28ad8333d25593f Mon Sep 17 00:00:00 2001 From: OnlyDeniko Date: Wed, 4 Aug 2021 13:55:27 +0300 Subject: [PATCH 02/17] codefactor & EOF --- report_generator/default_report_gen_config.json | 2 +- report_generator/report_generator.py | 14 ++++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/report_generator/default_report_gen_config.json b/report_generator/default_report_gen_config.json index 04eef0ba8..d39b56d27 100755 --- a/report_generator/default_report_gen_config.json +++ b/report_generator/default_report_gen_config.json @@ -28,4 +28,4 @@ "geomean", "average" ] -} \ No newline at end of file +} diff --git a/report_generator/report_generator.py b/report_generator/report_generator.py index d9edadde8..1af2992a3 100755 --- a/report_generator/report_generator.py +++ b/report_generator/report_generator.py @@ -141,6 +141,7 @@ def write_header_of_sheet( ) # write names of metrics and jsons metric_offset = 0 + json_results_len = len(json_results) for metric in metrics: write_cell( work_sheet, @@ -158,8 +159,9 @@ def write_header_of_sheet( bold=True, ) 
metric_offset += 1 - for i in range(len(json_results)): - for j in range(i + 1, len(json_results)): + + for i in range(json_results_len): + for j in range(i + 1, json_results_len): write_cell( work_sheet, LEFT_OFFSET + metric_offset, @@ -464,15 +466,15 @@ def get_ratio_string(a: str, b: str, comparison_method: str) -> str: ws[xy_to_excel_cell(0, 0)] = \ f"Software configuration {i} (hash: {json_res['software_hash']})" sw_conf = json.dumps(json_res['software'], indent=4).split('\n') - for j in range(len(sw_conf)): - ws[xy_to_excel_cell(0, 1 + j)] = sw_conf[j] + for j, val in enumerate(sw_conf): + ws[xy_to_excel_cell(0, 1 + j)] = val ws = wb.create_sheet(title=f"HW config n{i}_{json_res['hardware_hash']}") ws[xy_to_excel_cell(0, 0)] = \ f"Hardware configuration {i} (hash: {json_res['hardware_hash']})" hw_conf = json.dumps(json_res['hardware'], indent=4).split('\n') - for j in range(len(hw_conf)): - ws[xy_to_excel_cell(0, 1 + j)] = hw_conf[j] + for j, val in enumerate(hw_conf): + ws[xy_to_excel_cell(0, 1 + j)] = val wb.remove(wb['Sheet']) wb.save(args.report_file) From 00ecd5ef3f28616bed458d568d55fb1ce360ff1f Mon Sep 17 00:00:00 2001 From: OnlyDeniko Date: Wed, 4 Aug 2021 13:59:23 +0300 Subject: [PATCH 03/17] pep8 --- report_generator/report_generator.py | 1 - 1 file changed, 1 deletion(-) diff --git a/report_generator/report_generator.py b/report_generator/report_generator.py index 1af2992a3..ac13950ca 100755 --- a/report_generator/report_generator.py +++ b/report_generator/report_generator.py @@ -159,7 +159,6 @@ def write_header_of_sheet( bold=True, ) metric_offset += 1 - for i in range(json_results_len): for j in range(i + 1, json_results_len): write_cell( From 80749e6bf236b75a9ea98ffc96b40d6c04000e8f Mon Sep 17 00:00:00 2001 From: Kulandin Denis Date: Thu, 5 Aug 2021 10:48:16 +0300 Subject: [PATCH 04/17] Update report_generator.py --- report_generator/report_generator.py | 39 +++++++++++++++------------- 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/report_generator/report_generator.py b/report_generator/report_generator.py index ac13950ca..34cff73aa 100755 --- a/report_generator/report_generator.py +++ b/report_generator/report_generator.py @@ -20,7 +20,6 @@ import json from typing import Any, List, Dict from openpyxl.formatting.rule import ColorScaleRule -from openpyxl.styles.numbers import FORMAT_NUMBER_00 from openpyxl.styles import Font from openpyxl.utils import get_column_letter @@ -45,9 +44,9 @@ def get_excel_cell(work_sheet, x: int, y: int): return work_sheet[xy_to_excel_cell(x, y)] -def write_cell(work_sheet, x: int, y: int, value: str, bold=False) -> None: +def write_cell(work_sheet, x: int, y: int, value: str, *, bold=False, number_format='General') -> None: work_sheet[xy_to_excel_cell(x, y)] = value - work_sheet[xy_to_excel_cell(x, y)].number_format = FORMAT_NUMBER_00 + work_sheet[xy_to_excel_cell(x, y)].number_format = number_format if bold: work_sheet[xy_to_excel_cell(x, y)].font = Font(bold=True) @@ -113,6 +112,7 @@ def write_aggregation_metric( write_x, write_y, metric_string, + number_format='0.00', ) @@ -141,7 +141,6 @@ def write_header_of_sheet( ) # write names of metrics and jsons metric_offset = 0 - json_results_len = len(json_results) for metric in metrics: write_cell( work_sheet, @@ -159,8 +158,8 @@ def write_header_of_sheet( bold=True, ) metric_offset += 1 - for i in range(json_results_len): - for j in range(i + 1, json_results_len): + for i in range(len(json_results)): + for j in range(i + 1, len(json_results)): write_cell( work_sheet, 
LEFT_OFFSET + metric_offset, @@ -238,8 +237,8 @@ def get_ratio_string(a: str, b: str, comparison_method: str) -> str: available_algos_and_metrics[val] = ['time[s]'] + make_unique(available_algos_and_metrics[val]) -HEAD_OFFSET = 3 -LEFT_OFFSET = len(gen_config['header']) +HEAD_OFFSET = 4 +LEFT_OFFSET = len(gen_config['align']) JSON_RESULTS_LEN = len(json_results) stages: List[str] = [ @@ -281,7 +280,7 @@ def get_ratio_string(a: str, b: str, comparison_method: str) -> str: used[json_res_ind][report_ind] is True: continue # write parameters - for offset, config in enumerate(gen_config['header']): + for offset, config in enumerate(gen_config['align']): write_cell(ws, offset, HEAD_OFFSET + 1 + y_offset, get_property(report, config)) # write all metrics in report metric_offset = 0 @@ -290,6 +289,7 @@ def get_ratio_string(a: str, b: str, comparison_method: str) -> str: ws, LEFT_OFFSET + metric_offset + json_res_ind, HEAD_OFFSET + 1 + y_offset, get_property(report, metric), + number_format='0.00', ) metric_offset += JSON_RESULTS_LEN * (JSON_RESULTS_LEN + 1) // 2 used[json_res_ind][report_ind] = True @@ -301,7 +301,7 @@ def get_ratio_string(a: str, b: str, comparison_method: str) -> str: if report_comp['stage'] != stage_key or \ report_comp['algorithm'] != algo or \ used[original_index][report_comp_ind] is True or \ - not is_equal_dict(report, report_comp, gen_config['header']): + not is_equal_dict(report, report_comp, gen_config['align']): continue metric_offset = 0 for metric in available_algos_and_metrics[algo]: @@ -309,7 +309,8 @@ def get_ratio_string(a: str, b: str, comparison_method: str) -> str: ws, LEFT_OFFSET + original_index + metric_offset, HEAD_OFFSET + y_offset + 1, - get_property(report_comp, metric) + get_property(report_comp, metric), + number_format='0.00', ) metric_offset += JSON_RESULTS_LEN * (JSON_RESULTS_LEN + 1) // 2 used[original_index][report_comp_ind] = True @@ -320,7 +321,7 @@ def get_ratio_string(a: str, b: str, comparison_method: str) -> str: continue write_header_of_sheet( ws, - gen_config['header'], + gen_config['align'], HEAD_OFFSET + begin_y_offset, available_algos_and_metrics[algo], HEAD_OFFSET + y_offset + 1, @@ -358,7 +359,8 @@ def get_ratio_string(a: str, b: str, comparison_method: str) -> str: xy_to_excel_cell(first_offset, y), xy_to_excel_cell(second_offset, y), gen_config['comparison_method'][metric], - ) + ), + number_format='0.00', ) # fill comparison range by color rule ws.conditional_formatting.add( @@ -444,7 +446,8 @@ def get_ratio_string(a: str, b: str, comparison_method: str) -> str: ws, metric_ind + 1, y_offset + algo_ind + 2, - summary[agg_metric][name][algo][metric] + summary[agg_metric][name][algo][metric], + number_format='0.00', ) # color some range by color rule @@ -465,15 +468,15 @@ def get_ratio_string(a: str, b: str, comparison_method: str) -> str: ws[xy_to_excel_cell(0, 0)] = \ f"Software configuration {i} (hash: {json_res['software_hash']})" sw_conf = json.dumps(json_res['software'], indent=4).split('\n') - for j, val in enumerate(sw_conf): - ws[xy_to_excel_cell(0, 1 + j)] = val + for j in range(len(sw_conf)): + ws[xy_to_excel_cell(0, 1 + j)] = sw_conf[j] ws = wb.create_sheet(title=f"HW config n{i}_{json_res['hardware_hash']}") ws[xy_to_excel_cell(0, 0)] = \ f"Hardware configuration {i} (hash: {json_res['hardware_hash']})" hw_conf = json.dumps(json_res['hardware'], indent=4).split('\n') - for j, val in enumerate(hw_conf): - ws[xy_to_excel_cell(0, 1 + j)] = val + for j in range(len(hw_conf)): + ws[xy_to_excel_cell(0, 1 + j)] = hw_conf[j] 
wb.remove(wb['Sheet']) wb.save(args.report_file) From 135e28192596b94d1cc735df4c90701f1ce26cfd Mon Sep 17 00:00:00 2001 From: Kulandin Denis Date: Thu, 5 Aug 2021 10:48:40 +0300 Subject: [PATCH 05/17] Update default_report_gen_config.json --- report_generator/default_report_gen_config.json | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/report_generator/default_report_gen_config.json b/report_generator/default_report_gen_config.json index d39b56d27..2b1b4843b 100755 --- a/report_generator/default_report_gen_config.json +++ b/report_generator/default_report_gen_config.json @@ -25,7 +25,6 @@ "n_clusters": "2 / 1" }, "aggregation_metrics": [ - "geomean", - "average" + "geomean" ] } From 120c02ae01ac122480fdc258b1e7cf32e87e796b Mon Sep 17 00:00:00 2001 From: Kulandin Denis Date: Thu, 5 Aug 2021 10:49:11 +0300 Subject: [PATCH 06/17] Update README.md --- report_generator/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/report_generator/README.md b/report_generator/README.md index 3a6faed68..49040e31f 100755 --- a/report_generator/README.md +++ b/report_generator/README.md @@ -12,4 +12,4 @@ runner options: config parameters: * ``header`` : Column names in the table header. These parameters are also used to compare reports. If the name is compound, then it should be separated by the '':'' symbol * ``comparison_method`` : Comparison of two results. The options are: ``1 operation 2`` or ``2 operation 1`` -* ``aggregation_metrics`` : Metric applied to columns with comparisons of two reports. For each of these metrics, a separate sheet with a summary is compiled. It is important that the function is in Excel. \ No newline at end of file +* ``aggregation_metrics`` : Metric applied to columns with comparisons of two reports. For each of these metrics, a separate sheet with a summary is compiled. It is important that the function is in Excel. For example: ``geomean`` or ``average``. 
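Each ``comparison_method`` string is translated into an Excel formula; when the method starts with ``2``, the operands are swapped so the second result comes first. A minimal sketch of that translation, mirroring the ``get_ratio_string`` helper as it stands at this point in the series (the cell references are illustrative):

```python
def get_ratio_string(a: str, b: str, comparison_method: str) -> str:
    # a is the cell holding the first result, b the cell holding the second
    parts = comparison_method.split(' ')  # e.g. "2 / 1" -> ["2", "/", "1"]
    if parts[0] == "2":
        a, b = b, a  # put the second result's cell first
    return '=' + a + parts[1] + b


print(get_ratio_string("K5", "N5", "2 / 1"))  # "=N5/K5": second result over first
```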
From 4640f0435b54999c970b0410b8a6d0a0533ec5e7 Mon Sep 17 00:00:00 2001 From: Kulandin Denis Date: Thu, 5 Aug 2021 10:54:41 +0300 Subject: [PATCH 07/17] Update report_generator.py --- report_generator/report_generator.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/report_generator/report_generator.py b/report_generator/report_generator.py index 34cff73aa..46214963f 100755 --- a/report_generator/report_generator.py +++ b/report_generator/report_generator.py @@ -141,6 +141,7 @@ def write_header_of_sheet( ) # write names of metrics and jsons metric_offset = 0 + json_results_len = len(json_results) for metric in metrics: write_cell( work_sheet, @@ -158,8 +159,8 @@ def write_header_of_sheet( bold=True, ) metric_offset += 1 - for i in range(len(json_results)): - for j in range(i + 1, len(json_results)): + for i in range(json_results_len): + for j in range(i + 1, json_results_len): write_cell( work_sheet, LEFT_OFFSET + metric_offset, @@ -238,7 +239,7 @@ def get_ratio_string(a: str, b: str, comparison_method: str) -> str: HEAD_OFFSET = 4 -LEFT_OFFSET = len(gen_config['align']) +LEFT_OFFSET = len(gen_config['header']) JSON_RESULTS_LEN = len(json_results) stages: List[str] = [ @@ -280,7 +281,7 @@ def get_ratio_string(a: str, b: str, comparison_method: str) -> str: used[json_res_ind][report_ind] is True: continue # write parameters - for offset, config in enumerate(gen_config['align']): + for offset, config in enumerate(gen_config['header']): write_cell(ws, offset, HEAD_OFFSET + 1 + y_offset, get_property(report, config)) # write all metrics in report metric_offset = 0 @@ -301,7 +302,7 @@ def get_ratio_string(a: str, b: str, comparison_method: str) -> str: if report_comp['stage'] != stage_key or \ report_comp['algorithm'] != algo or \ used[original_index][report_comp_ind] is True or \ - not is_equal_dict(report, report_comp, gen_config['align']): + not is_equal_dict(report, report_comp, gen_config['header']): continue metric_offset = 0 for metric in available_algos_and_metrics[algo]: @@ -321,7 +322,7 @@ def get_ratio_string(a: str, b: str, comparison_method: str) -> str: continue write_header_of_sheet( ws, - gen_config['align'], + gen_config['header'], HEAD_OFFSET + begin_y_offset, available_algos_and_metrics[algo], HEAD_OFFSET + y_offset + 1, @@ -468,15 +469,15 @@ def get_ratio_string(a: str, b: str, comparison_method: str) -> str: ws[xy_to_excel_cell(0, 0)] = \ f"Software configuration {i} (hash: {json_res['software_hash']})" sw_conf = json.dumps(json_res['software'], indent=4).split('\n') - for j in range(len(sw_conf)): - ws[xy_to_excel_cell(0, 1 + j)] = sw_conf[j] + for j, val in enumerate(sw_conf): + ws[xy_to_excel_cell(0, 1 + j)] = val ws = wb.create_sheet(title=f"HW config n{i}_{json_res['hardware_hash']}") ws[xy_to_excel_cell(0, 0)] = \ f"Hardware configuration {i} (hash: {json_res['hardware_hash']})" hw_conf = json.dumps(json_res['hardware'], indent=4).split('\n') - for j in range(len(hw_conf)): - ws[xy_to_excel_cell(0, 1 + j)] = hw_conf[j] + for j, val in enumerate(hw_conf): + ws[xy_to_excel_cell(0, 1 + j)] = val wb.remove(wb['Sheet']) wb.save(args.report_file) From 0066e174d4ecc5dfc9c1833dcdcc36fdf991306c Mon Sep 17 00:00:00 2001 From: Kulandin Denis Date: Thu, 5 Aug 2021 11:03:48 +0300 Subject: [PATCH 08/17] Update report_generator.py --- report_generator/report_generator.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/report_generator/report_generator.py b/report_generator/report_generator.py index 
46214963f..38f96fd4b 100755 --- a/report_generator/report_generator.py +++ b/report_generator/report_generator.py @@ -44,7 +44,15 @@ def get_excel_cell(work_sheet, x: int, y: int): return work_sheet[xy_to_excel_cell(x, y)] -def write_cell(work_sheet, x: int, y: int, value: str, *, bold=False, number_format='General') -> None: +def write_cell( + work_sheet, + x: int, + y: int, + value: str, + *, + bold=False, + number_format='General', +) -> None: work_sheet[xy_to_excel_cell(x, y)] = value work_sheet[xy_to_excel_cell(x, y)].number_format = number_format if bold: @@ -137,11 +145,11 @@ def write_header_of_sheet( LEFT_OFFSET + len(json_results) - 1, agg_offset + ind, val, - bold=True + bold=True, ) # write names of metrics and jsons metric_offset = 0 - json_results_len = len(json_results) + json_results_len = len(json_results) for metric in metrics: write_cell( work_sheet, From ceffffc39c9b90549b2a907025fdd1d704b124e2 Mon Sep 17 00:00:00 2001 From: OnlyDeniko Date: Thu, 5 Aug 2021 22:09:19 +0300 Subject: [PATCH 09/17] apply comments --- report_generator/README.md | 4 +- .../default_report_gen_config.json | 13 +--- report_generator/report_generator.py | 77 ++++++++++++------- 3 files changed, 53 insertions(+), 41 deletions(-) diff --git a/report_generator/README.md b/report_generator/README.md index 49040e31f..9114a83f2 100755 --- a/report_generator/README.md +++ b/report_generator/README.md @@ -10,6 +10,6 @@ runner options: * ``generation-config`` : generation configuration file path config parameters: -* ``header`` : Column names in the table header. These parameters are also used to compare reports. If the name is compound, then it should be separated by the '':'' symbol -* ``comparison_method`` : Comparison of two results. The options are: ``1 operation 2`` or ``2 operation 1`` +* ``header`` : Column names in the table header. These parameters are also used to compare reports. If the name is compound, then it should be separated by the '':'' symbol. +* ``comparison_method`` : Comparison of two results. The options are: ``1 operation 2`` or ``2 operation 1``. Default is ``2 / 1``. * ``aggregation_metrics`` : Metric applied to columns with comparisons of two reports. For each of these metrics, a separate sheet with a summary is compiled. It is important that the function is in Excel. For example: ``geomean`` or ``average``. 
diff --git a/report_generator/default_report_gen_config.json b/report_generator/default_report_gen_config.json index 2b1b4843b..a4f75a5ec 100755 --- a/report_generator/default_report_gen_config.json +++ b/report_generator/default_report_gen_config.json @@ -11,18 +11,7 @@ "input_data:n_clusters" ], "comparison_method": { - "time[s]": "2 / 1", - "davies_bouldin_score": "2 / 1", - "inertia": "2 / 1", - "iter": "2 / 1", - "noise_variance": "2 / 1", - "accuracy": "2 / 1", - "log_loss": "2 / 1", - "roc_auc": "2 / 1", - "rmse": "2 / 1", - "r2_score": "2 / 1", - "n_sv": "2 / 1", - "n_clusters": "2 / 1" + "default": "2 / 1" }, "aggregation_metrics": [ "geomean" diff --git a/report_generator/report_generator.py b/report_generator/report_generator.py index 38f96fd4b..b74f6946a 100755 --- a/report_generator/report_generator.py +++ b/report_generator/report_generator.py @@ -126,23 +126,24 @@ def write_aggregation_metric( def write_header_of_sheet( work_sheet, + algorithm: str, header_columns: List[str], y_offset: int, metrics: List[str], agg_offset: int, agg_metrics: List[str], json_results: List[Dict[str, Any]], - LEFT_OFFSET: int, + left_offset: int, ) -> None: # write header for ind, val in enumerate(header_columns): - write_cell(work_sheet, ind, y_offset, val, bold=True) + write_cell(work_sheet, ind, y_offset, val.split(':')[-1], bold=True) # write aggregation metrics if len(json_results) >= 2: for ind, val in enumerate(agg_metrics): write_cell( work_sheet, - LEFT_OFFSET + len(json_results) - 1, + left_offset + len(json_results) - 1, agg_offset + ind, val, bold=True, @@ -153,7 +154,7 @@ def write_header_of_sheet( for metric in metrics: write_cell( work_sheet, - LEFT_OFFSET + metric_offset, + left_offset + metric_offset, y_offset - 1, metric, bold=True, @@ -161,7 +162,7 @@ def write_header_of_sheet( for json_res in json_results: write_cell( work_sheet, - LEFT_OFFSET + metric_offset, + left_offset + metric_offset, y_offset, json_res["file_name"], bold=True, @@ -171,7 +172,7 @@ def write_header_of_sheet( for j in range(i + 1, json_results_len): write_cell( work_sheet, - LEFT_OFFSET + metric_offset, + left_offset + metric_offset, y_offset, json_results[i]['file_name'] + ' vs ' + json_results[j]['file_name'], bold=True, @@ -180,27 +181,51 @@ def write_header_of_sheet( def get_color_rule(metric: str) -> Any: + red = 'F85D5E' + yellow = 'FAF52E' + green = '58C144' if metric in ['geomean', 'time[s]']: return ColorScaleRule( - start_type='num', start_value=0.5, start_color='FF0000', - mid_type='num', mid_value=1, mid_color='FFFF00', - end_type='num', end_value=5, end_color='00FF00') + start_type='num', start_value=0.5, start_color=red, + mid_type='num', mid_value=1, mid_color=yellow, + end_type='num', end_value=5, end_color=green) if metric == 'average': return ColorScaleRule( - start_type='num', start_value=-3, start_color='FF0000', - mid_type='num', mid_value=0, mid_color='FFFF00', - end_type='num', end_value=3, end_color='00FF00') + start_type='num', start_value=-3, start_color=red, + mid_type='num', mid_value=0, mid_color=yellow, + end_type='num', end_value=3, end_color=green) return ColorScaleRule( - start_type='percentile', start_value=10, start_color='FF0000', - mid_type='percentile', mid_value=50, mid_color='FFFF00', - end_type='percentile', end_value=90, end_color='00FF00') + start_type='percentile', start_value=10, start_color=red, + mid_type='percentile', mid_value=50, mid_color=yellow, + end_type='percentile', end_value=90, end_color=green) + + +def get_comparison_method(config: Dict[str, str], 
metric: str) -> str: + return config[metric] if metric in config else config['default'] def get_ratio_string(a: str, b: str, comparison_method: str) -> str: - splited_comparison_method = comparison_method.split(' ') - if splited_comparison_method[0] == "2": + splitted_comparison_method = comparison_method.split(' ') + if splitted_comparison_method[0] == "2": a, b = b, a - return '=' + a + splited_comparison_method[1] + b + return '=' + a + splitted_comparison_method[1] + b + + +def get_header_parameters( + json_results: List[Dict[str, Any]], + full_header_parameters: List[str], + algorithm: str, +) -> List[str]: + for json_res in json_results: + for report in json_res['results']: + if report['algorithm'] != algorithm: + continue + result = list() + for param in full_header_parameters: + if get_property(report, param) is not None: + result.append(param) + return result + raise ValueError(f'There is no {algorithm} in input json(s)') parser = argparse.ArgumentParser() @@ -247,7 +272,6 @@ def get_ratio_string(a: str, b: str, comparison_method: str) -> str: HEAD_OFFSET = 4 -LEFT_OFFSET = len(gen_config['header']) JSON_RESULTS_LEN = len(json_results) stages: List[str] = [ @@ -268,6 +292,8 @@ def get_ratio_string(a: str, b: str, comparison_method: str) -> str: for algo in available_algos_and_metrics: # algo[:31] because excel warning about length of sheet name no more than 31 symbols ws = wb.create_sheet(title=f'{algo[:31]}') + header_params = get_header_parameters(json_results, gen_config['header'], algo) + LEFT_OFFSET = len(header_params) # writing table header for offset, val in enumerate(['file_name', 'software_hash', 'hardware_hash']): write_cell(ws, 0, offset, val) @@ -289,7 +315,7 @@ def get_ratio_string(a: str, b: str, comparison_method: str) -> str: used[json_res_ind][report_ind] is True: continue # write parameters - for offset, config in enumerate(gen_config['header']): + for offset, config in enumerate(header_params): write_cell(ws, offset, HEAD_OFFSET + 1 + y_offset, get_property(report, config)) # write all metrics in report metric_offset = 0 @@ -310,7 +336,7 @@ def get_ratio_string(a: str, b: str, comparison_method: str) -> str: if report_comp['stage'] != stage_key or \ report_comp['algorithm'] != algo or \ used[original_index][report_comp_ind] is True or \ - not is_equal_dict(report, report_comp, gen_config['header']): + not is_equal_dict(report, report_comp, header_params): continue metric_offset = 0 for metric in available_algos_and_metrics[algo]: @@ -330,7 +356,8 @@ def get_ratio_string(a: str, b: str, comparison_method: str) -> str: continue write_header_of_sheet( ws, - gen_config['header'], + algo, + header_params, HEAD_OFFSET + begin_y_offset, available_algos_and_metrics[algo], HEAD_OFFSET + y_offset + 1, @@ -356,10 +383,6 @@ def get_ratio_string(a: str, b: str, comparison_method: str) -> str: not can_convert_to_float(str(first_cell.value)) or \ not can_convert_to_float(str(second_cell.value)): continue - if metric not in gen_config['comparison_method']: - raise ValueError( - f'Please add comparison_method ' - f'for {metric} in configuration file') write_cell( ws, LEFT_OFFSET + metric_offset + comparison_offset, @@ -367,7 +390,7 @@ def get_ratio_string(a: str, b: str, comparison_method: str) -> str: get_ratio_string( xy_to_excel_cell(first_offset, y), xy_to_excel_cell(second_offset, y), - gen_config['comparison_method'][metric], + get_comparison_method(gen_config['comparison_method'], metric), ), number_format='0.00', ) From a1020ab0442b2dad07dd8e85c12e042edc626a00 Mon Sep 
17 00:00:00 2001 From: OnlyDeniko Date: Fri, 6 Aug 2021 07:52:46 +0300 Subject: [PATCH 10/17] round ratio cell --- report_generator/report_generator.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/report_generator/report_generator.py b/report_generator/report_generator.py index b74f6946a..d486e88a3 100755 --- a/report_generator/report_generator.py +++ b/report_generator/report_generator.py @@ -204,11 +204,11 @@ def get_comparison_method(config: Dict[str, str], metric: str) -> str: return config[metric] if metric in config else config['default'] -def get_ratio_string(a: str, b: str, comparison_method: str) -> str: +def get_ratio_string(a: str, b: str, comparison_method: str, num_digits=3) -> str: splitted_comparison_method = comparison_method.split(' ') if splitted_comparison_method[0] == "2": a, b = b, a - return '=' + a + splitted_comparison_method[1] + b + return '=ROUND(' + a + splitted_comparison_method[1] + b + f',{num_digits})' def get_header_parameters( @@ -392,7 +392,7 @@ def get_header_parameters( xy_to_excel_cell(second_offset, y), get_comparison_method(gen_config['comparison_method'], metric), ), - number_format='0.00', + number_format='0.000', ) # fill comparison range by color rule ws.conditional_formatting.add( From ef4849e90f81abca3f4e4bebc0e04ccc3fb7842a Mon Sep 17 00:00:00 2001 From: Kulandin Denis Date: Fri, 6 Aug 2021 11:49:19 +0300 Subject: [PATCH 11/17] Update report_generator/README.md Co-authored-by: Ekaterina Mekhnetsova --- report_generator/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/report_generator/README.md b/report_generator/README.md index 9114a83f2..08d0e7f92 100755 --- a/report_generator/README.md +++ b/report_generator/README.md @@ -11,5 +11,5 @@ runner options: config parameters: * ``header`` : Column names in the table header. These parameters are also used to compare reports. If the name is compound, then it should be separated by the '':'' symbol. -* ``comparison_method`` : Comparison of two results. The options are: ``1 operation 2`` or ``2 operation 1``. Default is ``2 / 1``. +* ``comparison_method``: The formula for the comparison of two results. The options are: ``1 operation 2`` or ``2 operation 1``, where ``1`` is the first result and ``2`` is the second result. The default is ``2 / 1``, which returns the ratio of the second result to the first one. * ``aggregation_metrics`` : Metric applied to columns with comparisons of two reports. For each of these metrics, a separate sheet with a summary is compiled. It is important that the function is in Excel. For example: ``geomean`` or ``average``. From cf481fc8f643956a5b2b4463c8797cfa6173be34 Mon Sep 17 00:00:00 2001 From: Kulandin Denis Date: Fri, 6 Aug 2021 11:49:52 +0300 Subject: [PATCH 12/17] Update report_generator/README.md Co-authored-by: Ekaterina Mekhnetsova --- report_generator/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/report_generator/README.md b/report_generator/README.md index 08d0e7f92..485c95f51 100755 --- a/report_generator/README.md +++ b/report_generator/README.md @@ -10,6 +10,6 @@ runner options: * ``generation-config`` : generation configuration file path config parameters: -* ``header`` : Column names in the table header. These parameters are also used to compare reports. If the name is compound, then it should be separated by the '':'' symbol. +* ``header``: The column names in the table header. These parameters are also used to compare reports. 
If a name is compound, use the ``:`` symbol to separate its parts. * ``comparison_method``: The formula for the comparison of two results. The options are: ``1 operation 2`` or ``2 operation 1``, where ``1`` is the first result and ``2`` is the second result. The default is ``2 / 1``, which returns the ratio of the second result to the first one. * ``aggregation_metrics`` : Metric applied to columns with comparisons of two reports. For each of these metrics, a separate sheet with a summary is compiled. It is important that the function is in Excel. For example: ``geomean`` or ``average``. From f4f16b1c29118ea852520d926a8a0b866116c827 Mon Sep 17 00:00:00 2001 From: OnlyDeniko Date: Fri, 6 Aug 2021 12:42:12 +0300 Subject: [PATCH 13/17] apply readme comments --- report_generator/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/report_generator/README.md b/report_generator/README.md index 485c95f51..6e5a28eb0 100755 --- a/report_generator/README.md +++ b/report_generator/README.md @@ -12,4 +12,4 @@ runner options: config parameters: * ``header``: The column names in the table header. These parameters are also used to compare reports. If a name is compound, use the ``:`` symbol to separate its parts. * ``comparison_method``: The formula for the comparison of two results. The options are: ``1 operation 2`` or ``2 operation 1``, where ``1`` is the first result and ``2`` is the second result. The default is ``2 / 1``, which returns the ratio of the second result to the first one. -* ``aggregation_metrics`` : Metric applied to columns with comparisons of two reports. For each of these metrics, a separate sheet with a summary is compiled. It is important that the function is in Excel. For example: ``geomean`` or ``average``. +* ``aggregation_metrics`` : Metrics applied to columns with comparisons of two reports. You can use multiple metrics. For each of these metrics, a separate sheet with a summary is compiled. It is important that the function is in Excel. For example: ``"geomean", "average"``. From 0135bd5947885b42565aeff0c84597d6121fac53 Mon Sep 17 00:00:00 2001 From: Kulandin Denis Date: Fri, 6 Aug 2021 13:53:51 +0300 Subject: [PATCH 14/17] Update report_generator/README.md Co-authored-by: Ekaterina Mekhnetsova --- report_generator/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/report_generator/README.md b/report_generator/README.md index 6e5a28eb0..a5871d512 100755 --- a/report_generator/README.md +++ b/report_generator/README.md @@ -12,4 +12,4 @@ runner options: config parameters: * ``header``: The column names in the table header. These parameters are also used to compare reports. If a name is compound, use the ``:`` symbol to separate its parts. * ``comparison_method``: The formula for the comparison of two results. The options are: ``1 operation 2`` or ``2 operation 1``, where ``1`` is the first result and ``2`` is the second result. The default is ``2 / 1``, which returns the ratio of the second result to the first one. -* ``aggregation_metrics`` : Metrics applied to columns with comparisons of two reports. You can use multiple metrics. For each of these metrics, a separate sheet with a summary is compiled. It is important that the function is in Excel. For example: ``"geomean", "average"``. +* ``aggregation_metrics``: The metrics applied to the columns with the comparisons of two reports. You can use multiple metrics. For each of these metrics, a separate sheet with a summary is compiled. The metrics should be Excel functions. 
For example: ``"geomean", "average"``. From d776d3b96d91830de74881b3f84775feb5f0f7fa Mon Sep 17 00:00:00 2001 From: OnlyDeniko Date: Wed, 11 Aug 2021 13:37:05 +0300 Subject: [PATCH 15/17] elastic-net->elasticnet && add iter in lasso & elasticnet && pca n_components int->float --- sklearn_bench/elasticnet.py | 10 ++-- sklearn_bench/knn_regr.py | 100 ++++++++++++++++++++++++++++++++++++ sklearn_bench/lasso.py | 8 ++- sklearn_bench/pca.py | 2 +- 4 files changed, 114 insertions(+), 6 deletions(-) create mode 100755 sklearn_bench/knn_regr.py diff --git a/sklearn_bench/elasticnet.py b/sklearn_bench/elasticnet.py index 764995e0b..89e820e6f 100755 --- a/sklearn_bench/elasticnet.py +++ b/sklearn_bench/elasticnet.py @@ -44,13 +44,17 @@ def main(): bench.print_output( library='sklearn', - algorithm='elastic-net', + algorithm='elasticnet', stages=['training', 'prediction'], params=params, functions=['ElasticNet.fit', 'ElasticNet.predict'], times=[fit_time, predict_time], - metric_type=['rmse', 'r2_score'], - metrics=[[train_rmse, test_rmse], [train_r2, test_r2]], + metric_type=['rmse', 'r2_score', 'iter'], + metrics=[ + [train_rmse, test_rmse], + [train_r2, test_r2], + [int(regr.n_iter_), int(regr.n_iter_)], + ], data=[X_train, X_train], alg_instance=regr, ) diff --git a/sklearn_bench/knn_regr.py b/sklearn_bench/knn_regr.py new file mode 100755 index 000000000..afa7dafde --- /dev/null +++ b/sklearn_bench/knn_regr.py @@ -0,0 +1,100 @@ +# =============================================================================== +# Copyright 2020-2021 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# =============================================================================== + +import argparse + +import bench +import numpy as np + + +def main(): + from sklearn.neighbors import KNeighborsRegressor + + # Load generated data + X_train, X_test, y_train, y_test = bench.load_data(params) + params.n_classes = len(np.unique(y_train)) + + # Create regression object + knn_regr = KNeighborsRegressor(n_neighbors=params.n_neighbors, + weights=params.weights, + algorithm=params.method, + metric=params.metric, + n_jobs=params.n_jobs) + + # Measure time and accuracy on fitting + train_time, _ = bench.measure_function_time( + knn_regr.fit, X_train, y_train, params=params) + if params.task == 'regression': + y_pred = knn_regr.predict(X_train) + train_rmse = bench.rmse_score(y_train, y_pred) + train_r2 = bench.r2_score(y_train, y_pred) + + # Measure time and accuracy on prediction + if params.task == 'regression': + predict_time, yp = bench.measure_function_time(knn_regr.predict, X_test, + params=params) + test_rmse = bench.rmse_score(y_test, yp) + test_r2 = bench.r2_score(y_test, yp) + else: + predict_time, _ = bench.measure_function_time(knn_regr.kneighbors, X_test, + params=params) + + if params.task == 'regression': + bench.print_output( + library='sklearn', + algorithm=knn_regr._fit_method + '_knn_regression', + stages=['training', 'prediction'], + params=params, + functions=['knn_regr.fit', 'knn_regr.predict'], + times=[train_time, predict_time], + metric_type=['rmse', 'r2_score'], + metrics=[[train_rmse, test_rmse], [train_r2, test_r2]], + data=[X_train, X_test], + alg_instance=knn_regr, + ) + else: + bench.print_output( + library='sklearn', + algorithm=knn_regr._fit_method + '_knn_search', + stages=['training', 'search'], + params=params, + functions=['knn_regr.fit', 'knn_regr.kneighbors'], + times=[train_time, predict_time], + metric_type=None, + metrics=[], + data=[X_train, X_test], + alg_instance=knn_regr, + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description='scikit-learn kNN classifier benchmark') + + parser.add_argument('--task', default='regression', type=str, + choices=('search', 'regression'), + help='kNN task: search or regression') + parser.add_argument('--n-neighbors', default=5, type=int, + help='Number of neighbors to use') + parser.add_argument('--weights', type=str, default='uniform', + help='Weight function used in prediction') + parser.add_argument('--method', type=str, default='brute', + choices=('brute', 'kd_tree', 'ball_tree', 'auto'), + help='Algorithm used to compute the nearest neighbors') + parser.add_argument('--metric', type=str, default='euclidean', + help='Distance metric to use') + params = bench.parse_args(parser) + bench.run_with_context(params, main) diff --git a/sklearn_bench/lasso.py b/sklearn_bench/lasso.py index d0e10cb7c..6346d5e8a 100755 --- a/sklearn_bench/lasso.py +++ b/sklearn_bench/lasso.py @@ -49,8 +49,12 @@ def main(): params=params, functions=['Lasso.fit', 'Lasso.predict'], times=[fit_time, predict_time], - metric_type=['rmse', 'r2_score'], - metrics=[[train_rmse, test_rmse], [train_r2, test_r2]], + metric_type=['rmse', 'r2_score', 'iter'], + metrics=[ + [train_rmse, test_rmse], + [train_r2, test_r2], + [int(regr.n_iter_), int(regr.n_iter_)], + ], data=[X_train, X_test], alg_instance=regr, ) diff --git a/sklearn_bench/pca.py b/sklearn_bench/pca.py index 356bfed09..da08f7560 100644 --- a/sklearn_bench/pca.py +++ b/sklearn_bench/pca.py @@ -58,7 +58,7 @@ def main(): parser = 
argparse.ArgumentParser(description='scikit-learn PCA benchmark') parser.add_argument('--svd-solver', type=str, choices=['full'], default='full', help='SVD solver to use') - parser.add_argument('--n-components', type=int, default=None, + parser.add_argument('--n-components', type=float, default=None, help='Number of components to find') parser.add_argument('--whiten', action='store_true', default=False, help='Perform whitening') From fc30e9168d4ee0ee765d555e654bcece00c403d1 Mon Sep 17 00:00:00 2001 From: OnlyDeniko Date: Wed, 11 Aug 2021 13:39:46 +0300 Subject: [PATCH 16/17] remove knn_regr --- sklearn_bench/knn_regr.py | 100 -------------------------------------- 1 file changed, 100 deletions(-) delete mode 100755 sklearn_bench/knn_regr.py diff --git a/sklearn_bench/knn_regr.py b/sklearn_bench/knn_regr.py deleted file mode 100755 index afa7dafde..000000000 --- a/sklearn_bench/knn_regr.py +++ /dev/null @@ -1,100 +0,0 @@ -# =============================================================================== -# Copyright 2020-2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# =============================================================================== - -import argparse - -import bench -import numpy as np - - -def main(): - from sklearn.neighbors import KNeighborsRegressor - - # Load generated data - X_train, X_test, y_train, y_test = bench.load_data(params) - params.n_classes = len(np.unique(y_train)) - - # Create regression object - knn_regr = KNeighborsRegressor(n_neighbors=params.n_neighbors, - weights=params.weights, - algorithm=params.method, - metric=params.metric, - n_jobs=params.n_jobs) - - # Measure time and accuracy on fitting - train_time, _ = bench.measure_function_time( - knn_regr.fit, X_train, y_train, params=params) - if params.task == 'regression': - y_pred = knn_regr.predict(X_train) - train_rmse = bench.rmse_score(y_train, y_pred) - train_r2 = bench.r2_score(y_train, y_pred) - - # Measure time and accuracy on prediction - if params.task == 'regression': - predict_time, yp = bench.measure_function_time(knn_regr.predict, X_test, - params=params) - test_rmse = bench.rmse_score(y_test, yp) - test_r2 = bench.r2_score(y_test, yp) - else: - predict_time, _ = bench.measure_function_time(knn_regr.kneighbors, X_test, - params=params) - - if params.task == 'regression': - bench.print_output( - library='sklearn', - algorithm=knn_regr._fit_method + '_knn_regression', - stages=['training', 'prediction'], - params=params, - functions=['knn_regr.fit', 'knn_regr.predict'], - times=[train_time, predict_time], - metric_type=['rmse', 'r2_score'], - metrics=[[train_rmse, test_rmse], [train_r2, test_r2]], - data=[X_train, X_test], - alg_instance=knn_regr, - ) - else: - bench.print_output( - library='sklearn', - algorithm=knn_regr._fit_method + '_knn_search', - stages=['training', 'search'], - params=params, - functions=['knn_regr.fit', 'knn_regr.kneighbors'], - times=[train_time, predict_time], - metric_type=None, - metrics=[], - data=[X_train, X_test], - alg_instance=knn_regr, 
- ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description='scikit-learn kNN classifier benchmark') - - parser.add_argument('--task', default='regression', type=str, - choices=('search', 'regression'), - help='kNN task: search or regression') - parser.add_argument('--n-neighbors', default=5, type=int, - help='Number of neighbors to use') - parser.add_argument('--weights', type=str, default='uniform', - help='Weight function used in prediction') - parser.add_argument('--method', type=str, default='brute', - choices=('brute', 'kd_tree', 'ball_tree', 'auto'), - help='Algorithm used to compute the nearest neighbors') - parser.add_argument('--metric', type=str, default='euclidean', - help='Distance metric to use') - params = bench.parse_args(parser) - bench.run_with_context(params, main) From 9f59119a257802c273d4c0d05ab79fafe4eead2a Mon Sep 17 00:00:00 2001 From: Kulandin Denis Date: Wed, 11 Aug 2021 14:29:48 +0300 Subject: [PATCH 17/17] Update sklearn_bench/pca.py Co-authored-by: Ekaterina Mekhnetsova --- sklearn_bench/pca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn_bench/pca.py b/sklearn_bench/pca.py index da08f7560..7e4fcf366 100644 --- a/sklearn_bench/pca.py +++ b/sklearn_bench/pca.py @@ -59,7 +59,7 @@ def main(): parser.add_argument('--svd-solver', type=str, choices=['full'], default='full', help='SVD solver to use') parser.add_argument('--n-components', type=float, default=None, - help='Number of components to find') + help='The number of components to find') parser.add_argument('--whiten', action='store_true', default=False, help='Perform whitening') params = bench.parse_args(parser)
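With ``--n-components`` parsed as a float, scikit-learn's ``PCA`` (using ``svd_solver='full'``, the only solver this benchmark exposes) can also select the smallest number of components that explains a given fraction of variance. A hedged usage sketch of both modes; the data here is random and purely illustrative:

```python
import numpy as np
from sklearn.decomposition import PCA

X = np.random.rand(1000, 50)

# An integer keeps exactly that many components
pca_count = PCA(n_components=10, svd_solver='full').fit(X)

# A float in (0, 1) keeps the smallest number of components whose
# cumulative explained variance ratio exceeds that fraction
pca_fraction = PCA(n_components=0.9, svd_solver='full').fit(X)
print(pca_fraction.n_components_)
```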