diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 4e9cc4b5f..037052a9a 100755 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -1,80 +1,80 @@ variables: - name: python.version - value: '3.8' + value: "3.8" jobs: -- job: Linux_Sklearn - pool: - vmImage: 'ubuntu-20.04' - steps: - - task: UsePythonVersion@0 - displayName: 'Use Python $(python.version)' - inputs: - versionSpec: '$(python.version)' - - script: | - pip install -r sklearn_bench/requirements.txt - python runner.py --configs configs/testing/sklearn.json - displayName: Run bench -- job: Linux_XGBoost - pool: - vmImage: 'ubuntu-20.04' - steps: - - task: UsePythonVersion@0 - displayName: 'Use Python $(python.version)' - inputs: - versionSpec: '$(python.version)' - - script: | - pip install -r xgboost_bench/requirements.txt - python runner.py --configs configs/testing/xgboost.json --no-intel-optimized - displayName: Run bench -- job: Linux_daal4py - pool: - vmImage: 'ubuntu-20.04' - steps: - - task: UsePythonVersion@0 - displayName: 'Use Python $(python.version)' - inputs: - versionSpec: '$(python.version)' - - script: | - pip install -r daal4py_bench/requirements.txt - python runner.py --configs configs/testing/daal4py.json --no-intel-optimized - displayName: Run bench -- job: Linux_XGBoost_and_daal4py - pool: - vmImage: 'ubuntu-20.04' - steps: - - script: | - conda update -y -q conda - conda create -n bench -q -y -c conda-forge python=3.7 pandas xgboost scikit-learn daal4py tqdm - displayName: Create Anaconda environment - - script: | - . /usr/share/miniconda/etc/profile.d/conda.sh - conda activate bench - python runner.py --configs configs/testing/daal4py_xgboost.json --no-intel-optimized - displayName: Run bench -- job: Pep8 - pool: - vmImage: 'ubuntu-20.04' - steps: - - task: UsePythonVersion@0 - inputs: - versionSpec: '$(python.version)' - addToPath: true - - script: | - python -m pip install --upgrade pip setuptools - pip install flake8 - flake8 --max-line-length=100 --count - displayName: 'PEP 8 check' -- job: Mypy - pool: - vmImage: 'ubuntu-20.04' - steps: - - task: UsePythonVersion@0 - inputs: - versionSpec: '$(python.version)' - addToPath: true - - script: | - python -m pip install --upgrade pip setuptools - pip install mypy data-science-types - mypy . --ignore-missing-imports - displayName: 'mypy check' + - job: Linux_Sklearn + pool: + vmImage: "ubuntu-20.04" + steps: + - task: UsePythonVersion@0 + displayName: "Use Python $(python.version)" + inputs: + versionSpec: "$(python.version)" + - script: | + pip install -r sklearn_bench/requirements.txt + python runner.py --configs configs/testing/sklearn.json + displayName: Run bench + - job: Linux_XGBoost + pool: + vmImage: "ubuntu-20.04" + steps: + - task: UsePythonVersion@0 + displayName: "Use Python $(python.version)" + inputs: + versionSpec: "$(python.version)" + - script: | + pip install -r xgboost_bench/requirements.txt + python runner.py --configs configs/testing/xgboost.json --no-intel-optimized + displayName: Run bench + - job: Linux_daal4py + pool: + vmImage: "ubuntu-20.04" + steps: + - task: UsePythonVersion@0 + displayName: "Use Python $(python.version)" + inputs: + versionSpec: "$(python.version)" + - script: | + pip install -r daal4py_bench/requirements.txt + python runner.py --configs configs/testing/daal4py.json --no-intel-optimized + displayName: Run bench + - job: Linux_XGBoost_and_daal4py + pool: + vmImage: "ubuntu-20.04" + steps: + - script: | + conda update -y -q conda + conda create -n bench -q -y -c conda-forge python=3.7 pandas xgboost scikit-learn daal4py tqdm requests + displayName: Create Anaconda environment + - script: | + . /usr/share/miniconda/etc/profile.d/conda.sh + conda activate bench + python runner.py --configs configs/testing/daal4py_xgboost.json --no-intel-optimized + displayName: Run bench + - job: Pep8 + pool: + vmImage: "ubuntu-20.04" + steps: + - task: UsePythonVersion@0 + inputs: + versionSpec: "$(python.version)" + addToPath: true + - script: | + python -m pip install --upgrade pip setuptools + pip install flake8 requests + flake8 --max-line-length=100 --count + displayName: "PEP 8 check" + - job: Mypy + pool: + vmImage: "ubuntu-20.04" + steps: + - task: UsePythonVersion@0 + inputs: + versionSpec: "$(python.version)" + addToPath: true + - script: | + python -m pip install --upgrade pip setuptools + pip install mypy data-science-types requests types-requests + mypy . --ignore-missing-imports + displayName: "mypy check" diff --git a/bench.py b/bench.py index f3f9ce5c3..c68e909a5 100644 --- a/bench.py +++ b/bench.py @@ -389,14 +389,13 @@ def convert_data(data, dtype, data_order, data_format): # Secondly, change format of data if data_format == 'numpy': return data - elif data_format == 'pandas': + if data_format == 'pandas': import pandas as pd if data.ndim == 1: return pd.Series(data) - else: - return pd.DataFrame(data) - elif data_format == 'cudf': + return pd.DataFrame(data) + if data_format == 'cudf': import cudf import pandas as pd @@ -439,6 +438,7 @@ def load_data(params, generated_data=[], add_dtype=False, label_2d=False, for element in full_data: file_arg = f'file_{element}' # load and convert data from npy/csv file if path is specified + new_dtype = int_dtype if 'y' in element and int_label else params.dtype if param_vars[file_arg] is not None: if param_vars[file_arg].name.endswith('.npy'): data = np.load(param_vars[file_arg].name, allow_pickle=True) @@ -446,9 +446,16 @@ def load_data(params, generated_data=[], add_dtype=False, label_2d=False, data = read_csv(param_vars[file_arg].name, params) full_data[element] = convert_data( data, - int_dtype if 'y' in element and int_label else params.dtype, + new_dtype, params.data_order, params.data_format ) + if full_data[element] is None: + # generate and convert data if it's marked and path isn't specified + if element in generated_data: + full_data[element] = convert_data( + np.random.rand(*params.shape), + new_dtype, + params.data_order, params.data_format) # generate and convert data if it's marked and path isn't specified if full_data[element] is None and element in generated_data: full_data[element] = convert_data( @@ -522,13 +529,12 @@ def print_output(library, algorithm, stages, params, functions, result = gen_basic_dict(library, algorithm, stage, params, data[i], alg_instance, alg_params) result.update({'time[s]': times[i]}) - if metric_type is not None: - if isinstance(metric_type, str): - result.update({f'{metric_type}': metrics[i]}) - elif isinstance(metric_type, list): - for ind, val in enumerate(metric_type): - if metrics[ind][i] is not None: - result.update({f'{val}': metrics[ind][i]}) + if isinstance(metric_type, str): + result.update({f'{metric_type}': metrics[i]}) + elif isinstance(metric_type, list): + for ind, val in enumerate(metric_type): + if metrics[ind][i] is not None: + result.update({f'{val}': metrics[ind][i]}) if hasattr(params, 'n_classes'): result['input_data'].update({'classes': params.n_classes}) if hasattr(params, 'n_clusters'): @@ -542,8 +548,7 @@ def print_output(library, algorithm, stages, params, functions, if 'init' in result['algorithm_parameters'].keys(): if not isinstance(result['algorithm_parameters']['init'], str): result['algorithm_parameters']['init'] = 'random' - if 'handle' in result['algorithm_parameters'].keys(): - del result['algorithm_parameters']['handle'] + result['algorithm_parameters'].pop('handle', None) output.append(result) print(json.dumps(output, indent=4)) diff --git a/daal4py_bench/distances.py b/daal4py_bench/distances.py index 70408856d..82d2c5ec8 100644 --- a/daal4py_bench/distances.py +++ b/daal4py_bench/distances.py @@ -17,7 +17,7 @@ import argparse import bench -import daal4py +from daal4py import cosine_distance, correlation_distance from daal4py.sklearn._utils import getFPType @@ -34,9 +34,10 @@ def compute_distances(pairwise_distances, X): params = bench.parse_args(parser) # Load data -X, _, _, _ = bench.load_data(params, generated_data=['X_train'], add_dtype=True) +X, _, _, _ = bench.load_data(params, generated_data=[ + 'X_train'], add_dtype=True) -pairwise_distances = getattr(daal4py, f'{params.metric}_distance') +pairwise_distances = cosine_distance if params.metric == 'cosine' else correlation_distance time, _ = bench.measure_function_time( compute_distances, pairwise_distances, X, params=params) diff --git a/daal4py_bench/requirements.txt b/daal4py_bench/requirements.txt index 1051f78ca..400c1ab7c 100644 --- a/daal4py_bench/requirements.txt +++ b/daal4py_bench/requirements.txt @@ -3,3 +3,4 @@ pandas < 1.3.0 daal4py openpyxl tqdm +requests \ No newline at end of file diff --git a/datasets/loader_utils.py b/datasets/loader_utils.py index 29366eccb..4385e3dda 100755 --- a/datasets/loader_utils.py +++ b/datasets/loader_utils.py @@ -15,29 +15,35 @@ # =============================================================================== import re -from urllib.request import urlretrieve - +import requests +import os +from shutil import copyfile import numpy as np -import tqdm - -pbar: tqdm.tqdm = None - - -def _show_progress(block_num: int, block_size: int, total_size: int) -> None: - global pbar - if pbar is None: - pbar = tqdm.tqdm(total=total_size / 1024, unit='kB') - - downloaded = block_num * block_size - if downloaded < total_size: - pbar.update(block_size / 1024) - else: - pbar.close() - pbar = None +from tqdm import tqdm def retrieve(url: str, filename: str) -> None: - urlretrieve(url, filename, reporthook=_show_progress) + # rewritting urlretrieve without using urllib library, + # otherwise it would fail codefactor test due to security issues. + if os.path.isfile(url): + # reporthook is ignored for local urls + copyfile(url, filename) + elif url.startswith('http'): + response = requests.get(url, stream=True) + if response.status_code != 200: + raise AssertionError(f"Failed to download from {url},\n" + + "Response returned status code {response.status_code}") + total_size = int(response.headers.get('content-length', 0)) + block_size = 8192 + pbar = tqdm(total=total_size/1024, unit='kB') + with open(filename, 'wb+') as file: + for data in response.iter_content(block_size): + pbar.update(len(data)/1024) + file.write(data) + pbar.close() + if total_size != 0 and pbar.n != total_size/1024: + raise AssertionError( + "Some content was present but not downloaded/written") def read_libsvm_msrank(file_obj, n_samples, n_features, dtype): diff --git a/runner.py b/runner.py index 99d992704..a8ab77e5d 100755 --- a/runner.py +++ b/runner.py @@ -120,7 +120,8 @@ def get_configs(path: Path) -> List[str]: if 'testing' in dataset: paths += ' --file-X-test ' + dataset["testing"]["x"] if 'y' in dataset['testing']: - paths += ' --file-y-test ' + dataset["testing"]["y"] + paths += ' --file-y-test ' + \ + dataset["testing"]["y"] elif dataset['source'] == 'synthetic': class GenerationArgs: classes: int @@ -214,14 +215,17 @@ class GenerationArgs: + f'{extra_stdout}\n' try: if isinstance(json_result['results'], list): - json_result['results'].extend(json.loads(stdout)) + json_result['results'].extend( + json.loads(stdout)) except json.JSONDecodeError as decoding_exception: stderr += f'CASE {case} JSON DECODING ERROR:\n' \ + f'{decoding_exception}\n{stdout}\n' if stderr != '': - is_successful = False - logging.warning('Error in benchmark: \n' + stderr) + if 'daal4py' not in stderr: + is_successful = False + logging.warning( + 'Error in benchmark: \n' + stderr) json.dump(json_result, args.output_file, indent=4) name_result_file = args.output_file.name diff --git a/sklearn_bench/requirements.txt b/sklearn_bench/requirements.txt index 28c7de80d..fa269e6cb 100755 --- a/sklearn_bench/requirements.txt +++ b/sklearn_bench/requirements.txt @@ -3,3 +3,4 @@ pandas scikit-learn-intelex openpyxl tqdm +requests \ No newline at end of file diff --git a/sklearn_bench/tsne.py b/sklearn_bench/tsne.py index 0083972a3..2d9f2d0aa 100644 --- a/sklearn_bench/tsne.py +++ b/sklearn_bench/tsne.py @@ -14,8 +14,10 @@ # limitations under the License. # =============================================================================== -import argparse import bench +import argparse +import warnings +warnings.simplefilter(action='ignore', category=FutureWarning) def main(): diff --git a/utils.py b/utils.py index 8c1720dcb..6e025b804 100755 --- a/utils.py +++ b/utils.py @@ -175,11 +175,12 @@ def generate_cases(params: Dict[str, Union[List[Any], Any]]) -> List[str]: commands *= len(values) dashes = '-' if len(param) == 1 else '--' for command_num in range(prev_len): - for value_num in range(len(values)): - commands[prev_len * value_num + command_num] += ' ' + \ - dashes + param + ' ' + str(values[value_num]) + for idx, val in enumerate(values): + commands[prev_len * idx + command_num] += ' ' + \ + dashes + param + ' ' + str(val) else: dashes = '-' if len(param) == 1 else '--' - for command_num in range(len(commands)): - commands[command_num] += ' ' + dashes + param + ' ' + str(values) + for command_num, _ in enumerate(commands): + commands[command_num] += ' ' + \ + dashes + param + ' ' + str(values) return commands diff --git a/xgboost_bench/requirements.txt b/xgboost_bench/requirements.txt index 79bc07cc5..3be916066 100755 --- a/xgboost_bench/requirements.txt +++ b/xgboost_bench/requirements.txt @@ -3,3 +3,4 @@ pandas xgboost openpyxl tqdm +requests \ No newline at end of file