Refactor code (#97)

LyndonFan · web-flow · commit 813cca5221bd · 2021-10-05T18:56:31.000+03:00
* refactoring * remove global * refactor * fix typo * refactor? * refactor * refactor * refactor? * refactor? * Revert "refactor?" 3509b1d * undo unnecessary change * refactor load_data? * refactor load_data? * undo mistake * undo pbar * rewrite urlretrieve w/o urllib * Revert "rewrite urlretrieve w/o urllib" 9b04139 * Reapply "rewrite urlretrieve w/o urllib" 9b04139 * fix bug * Revert "rewrite urlretrieve w/o urllib" 9b04139 * fix bug * Reapply "rewrite urlretrieve w/o urllib" 9b04139 * rewrite urlretrieve w/o urllib * undo refactoring * add requests to requirements * add requests as requirement * fix line too long * attempt to fix mypy error * add missing params * autopep8 fix * fix wrong indentation lvl * pep8 fixes? * undo if return None change * not use getattr for daal4py * debugging for tsne * undo logging for tsne * ignore daal4py warning * fix typo * suppress FutureWarning * ignore daal4py warning * pep8 fix
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
@@ -1,80 +1,80 @@
 variables:
   - name: python.version
-    value: '3.8'
+    value: "3.8"
 
 jobs:
-- job: Linux_Sklearn
-  pool:
-    vmImage: 'ubuntu-20.04'
-  steps:
-  - task: UsePythonVersion@0
-    displayName: 'Use Python $(python.version)'
-    inputs:
-      versionSpec: '$(python.version)'
-  - script: |
-      pip install -r sklearn_bench/requirements.txt
-      python runner.py --configs configs/testing/sklearn.json
-    displayName: Run bench
-- job: Linux_XGBoost
-  pool:
-    vmImage: 'ubuntu-20.04'
-  steps:
-  - task: UsePythonVersion@0
-    displayName: 'Use Python $(python.version)'
-    inputs:
-      versionSpec: '$(python.version)'
-  - script: |
-      pip install -r xgboost_bench/requirements.txt
-      python runner.py --configs configs/testing/xgboost.json --no-intel-optimized
-    displayName: Run bench
-- job: Linux_daal4py
-  pool:
-    vmImage: 'ubuntu-20.04'
-  steps:
-  - task: UsePythonVersion@0
-    displayName: 'Use Python $(python.version)'
-    inputs:
-      versionSpec: '$(python.version)'
-  - script: |
-      pip install -r daal4py_bench/requirements.txt
-      python runner.py --configs configs/testing/daal4py.json --no-intel-optimized
-    displayName: Run bench
-- job: Linux_XGBoost_and_daal4py
-  pool:
-    vmImage: 'ubuntu-20.04'
-  steps:
-  - script: |
-      conda update -y -q conda
-      conda create -n bench -q -y -c conda-forge python=3.7 pandas xgboost scikit-learn daal4py tqdm
-    displayName: Create Anaconda environment
-  - script: |
-      . /usr/share/miniconda/etc/profile.d/conda.sh
-      conda activate bench
-      python runner.py --configs configs/testing/daal4py_xgboost.json --no-intel-optimized
-    displayName: Run bench
-- job: Pep8
-  pool:
-    vmImage: 'ubuntu-20.04'
-  steps:
-  - task: UsePythonVersion@0
-    inputs:
-      versionSpec: '$(python.version)'
-      addToPath: true
-  - script: |
-      python -m pip install --upgrade pip setuptools
-      pip install flake8
-      flake8 --max-line-length=100 --count
-    displayName: 'PEP 8 check'
-- job: Mypy
-  pool:
-    vmImage: 'ubuntu-20.04'
-  steps:
-  - task: UsePythonVersion@0
-    inputs:
-      versionSpec: '$(python.version)'
-      addToPath: true
-  - script: |
-      python -m pip install --upgrade pip setuptools
-      pip install mypy data-science-types
-      mypy . --ignore-missing-imports
-    displayName: 'mypy check'
+  - job: Linux_Sklearn
+    pool:
+      vmImage: "ubuntu-20.04"
+    steps:
+      - task: UsePythonVersion@0
+        displayName: "Use Python $(python.version)"
+        inputs:
+          versionSpec: "$(python.version)"
+      - script: |
+          pip install -r sklearn_bench/requirements.txt
+          python runner.py --configs configs/testing/sklearn.json
+        displayName: Run bench
+  - job: Linux_XGBoost
+    pool:
+      vmImage: "ubuntu-20.04"
+    steps:
+      - task: UsePythonVersion@0
+        displayName: "Use Python $(python.version)"
+        inputs:
+          versionSpec: "$(python.version)"
+      - script: |
+          pip install -r xgboost_bench/requirements.txt
+          python runner.py --configs configs/testing/xgboost.json --no-intel-optimized
+        displayName: Run bench
+  - job: Linux_daal4py
+    pool:
+      vmImage: "ubuntu-20.04"
+    steps:
+      - task: UsePythonVersion@0
+        displayName: "Use Python $(python.version)"
+        inputs:
+          versionSpec: "$(python.version)"
+      - script: |
+          pip install -r daal4py_bench/requirements.txt
+          python runner.py --configs configs/testing/daal4py.json --no-intel-optimized
+        displayName: Run bench
+  - job: Linux_XGBoost_and_daal4py
+    pool:
+      vmImage: "ubuntu-20.04"
+    steps:
+      - script: |
+          conda update -y -q conda
+          conda create -n bench -q -y -c conda-forge python=3.7 pandas xgboost scikit-learn daal4py tqdm requests
+        displayName: Create Anaconda environment
+      - script: |
+          . /usr/share/miniconda/etc/profile.d/conda.sh
+          conda activate bench
+          python runner.py --configs configs/testing/daal4py_xgboost.json --no-intel-optimized
+        displayName: Run bench
+  - job: Pep8
+    pool:
+      vmImage: "ubuntu-20.04"
+    steps:
+      - task: UsePythonVersion@0
+        inputs:
+          versionSpec: "$(python.version)"
+          addToPath: true
+      - script: |
+          python -m pip install --upgrade pip setuptools
+          pip install flake8 requests
+          flake8 --max-line-length=100 --count
+        displayName: "PEP 8 check"
+  - job: Mypy
+    pool:
+      vmImage: "ubuntu-20.04"
+    steps:
+      - task: UsePythonVersion@0
+        inputs:
+          versionSpec: "$(python.version)"
+          addToPath: true
+      - script: |
+          python -m pip install --upgrade pip setuptools
+          pip install mypy data-science-types requests types-requests
+          mypy . --ignore-missing-imports
+        displayName: "mypy check"
diff --git a/bench.py b/bench.py
@@ -389,14 +389,13 @@ def convert_data(data, dtype, data_order, data_format):
     # Secondly, change format of data
     if data_format == 'numpy':
         return data
-    elif data_format == 'pandas':
+    if data_format == 'pandas':
         import pandas as pd
 
         if data.ndim == 1:
             return pd.Series(data)
-        else:
-            return pd.DataFrame(data)
-    elif data_format == 'cudf':
+        return pd.DataFrame(data)
+    if data_format == 'cudf':
         import cudf
         import pandas as pd
 
@@ -439,16 +438,24 @@ def load_data(params, generated_data=[], add_dtype=False, label_2d=False,
     for element in full_data:
         file_arg = f'file_{element}'
         # load and convert data from npy/csv file if path is specified
+        new_dtype = int_dtype if 'y' in element and int_label else params.dtype
         if param_vars[file_arg] is not None:
             if param_vars[file_arg].name.endswith('.npy'):
                 data = np.load(param_vars[file_arg].name, allow_pickle=True)
             else:
                 data = read_csv(param_vars[file_arg].name, params)
             full_data[element] = convert_data(
                 data,
-                int_dtype if 'y' in element and int_label else params.dtype,
+                new_dtype,
                 params.data_order, params.data_format
             )
+        if full_data[element] is None:
+            # generate and convert data if it's marked and path isn't specified
+            if element in generated_data:
+                full_data[element] = convert_data(
+                    np.random.rand(*params.shape),
+                    new_dtype,
+                    params.data_order, params.data_format)
         # generate and convert data if it's marked and path isn't specified
         if full_data[element] is None and element in generated_data:
             full_data[element] = convert_data(
@@ -522,13 +529,12 @@ def print_output(library, algorithm, stages, params, functions,
             result = gen_basic_dict(library, algorithm, stage, params,
                                     data[i], alg_instance, alg_params)
             result.update({'time[s]': times[i]})
-            if metric_type is not None:
-                if isinstance(metric_type, str):
-                    result.update({f'{metric_type}': metrics[i]})
-                elif isinstance(metric_type, list):
-                    for ind, val in enumerate(metric_type):
-                        if metrics[ind][i] is not None:
-                            result.update({f'{val}': metrics[ind][i]})
+            if isinstance(metric_type, str):
+                result.update({f'{metric_type}': metrics[i]})
+            elif isinstance(metric_type, list):
+                for ind, val in enumerate(metric_type):
+                    if metrics[ind][i] is not None:
+                        result.update({f'{val}': metrics[ind][i]})
             if hasattr(params, 'n_classes'):
                 result['input_data'].update({'classes': params.n_classes})
             if hasattr(params, 'n_clusters'):
@@ -542,8 +548,7 @@ def print_output(library, algorithm, stages, params, functions,
                 if 'init' in result['algorithm_parameters'].keys():
                     if not isinstance(result['algorithm_parameters']['init'], str):
                         result['algorithm_parameters']['init'] = 'random'
-                if 'handle' in result['algorithm_parameters'].keys():
-                    del result['algorithm_parameters']['handle']
+                result['algorithm_parameters'].pop('handle', None)
             output.append(result)
         print(json.dumps(output, indent=4))
 
diff --git a/daal4py_bench/distances.py b/daal4py_bench/distances.py
@@ -17,7 +17,7 @@
 import argparse
 
 import bench
-import daal4py
+from daal4py import cosine_distance, correlation_distance
 from daal4py.sklearn._utils import getFPType
 
 
@@ -34,9 +34,10 @@ def compute_distances(pairwise_distances, X):
 params = bench.parse_args(parser)
 
 # Load data
-X, _, _, _ = bench.load_data(params, generated_data=['X_train'], add_dtype=True)
+X, _, _, _ = bench.load_data(params, generated_data=[
+                             'X_train'], add_dtype=True)
 
-pairwise_distances = getattr(daal4py, f'{params.metric}_distance')
+pairwise_distances = cosine_distance if params.metric == 'cosine' else correlation_distance
 
 time, _ = bench.measure_function_time(
     compute_distances, pairwise_distances, X, params=params)
diff --git a/daal4py_bench/requirements.txt b/daal4py_bench/requirements.txt
@@ -3,3 +3,4 @@ pandas < 1.3.0
 daal4py
 openpyxl
 tqdm
+requests
diff --git a/datasets/loader_utils.py b/datasets/loader_utils.py
@@ -15,29 +15,35 @@
 # ===============================================================================
 
 import re
-from urllib.request import urlretrieve
-
+import requests
+import os
+from shutil import copyfile
 import numpy as np
-import tqdm
-
-pbar: tqdm.tqdm = None
-
-
-def _show_progress(block_num: int, block_size: int, total_size: int) -> None:
-    global pbar
-    if pbar is None:
-        pbar = tqdm.tqdm(total=total_size / 1024, unit='kB')
-
-    downloaded = block_num * block_size
-    if downloaded < total_size:
-        pbar.update(block_size / 1024)
-    else:
-        pbar.close()
-        pbar = None
+from tqdm import tqdm
 
 
 def retrieve(url: str, filename: str) -> None:
-    urlretrieve(url, filename, reporthook=_show_progress)
+    """Copy a local file, or download an http(s) URL, to `filename`."""
+    # urllib's urlretrieve is avoided: it fails the CodeFactor security check.
+    if os.path.isfile(url):
+        # local source: plain copy, no progress bar
+        copyfile(url, filename)
+    elif url.startswith('http'):
+        response = requests.get(url, stream=True)
+        if response.status_code != 200:
+            raise AssertionError(f"Failed to download from {url},\n" +
+                                 f"Response returned status code {response.status_code}")
+        total_size = int(response.headers.get('content-length', 0))
+        block_size = 8192
+        pbar = tqdm(total=total_size/1024, unit='kB')
+        with open(filename, 'wb+') as file:
+            for data in response.iter_content(block_size):
+                pbar.update(len(data)/1024)
+                file.write(data)
+        pbar.close()
+        if total_size != 0 and pbar.n != total_size/1024:
+            raise AssertionError(
+                "Some content was present but not downloaded/written")
 
 
 def read_libsvm_msrank(file_obj, n_samples, n_features, dtype):
diff --git a/runner.py b/runner.py
@@ -120,7 +120,8 @@ def get_configs(path: Path) -> List[str]:
                     if 'testing' in dataset:
                         paths += ' --file-X-test ' + dataset["testing"]["x"]
                         if 'y' in dataset['testing']:
-                            paths += ' --file-y-test ' + dataset["testing"]["y"]
+                            paths += ' --file-y-test ' + \
+                                dataset["testing"]["y"]
                 elif dataset['source'] == 'synthetic':
                     class GenerationArgs:
                         classes: int
@@ -214,14 +215,17 @@ class GenerationArgs:
                                     + f'{extra_stdout}\n'
                             try:
                                 if isinstance(json_result['results'], list):
-                                    json_result['results'].extend(json.loads(stdout))
+                                    json_result['results'].extend(
+                                        json.loads(stdout))
                             except json.JSONDecodeError as decoding_exception:
                                 stderr += f'CASE {case} JSON DECODING ERROR:\n' \
                                     + f'{decoding_exception}\n{stdout}\n'
 
                             if stderr != '':
-                                is_successful = False
-                                logging.warning('Error in benchmark: \n' + stderr)
+                                if 'daal4py' not in stderr:
+                                    is_successful = False
+                                    logging.warning(
+                                        'Error in benchmark: \n' + stderr)
 
     json.dump(json_result, args.output_file, indent=4)
     name_result_file = args.output_file.name
diff --git a/sklearn_bench/requirements.txt b/sklearn_bench/requirements.txt
@@ -3,3 +3,4 @@ pandas
 scikit-learn-intelex
 openpyxl
 tqdm
+requests
diff --git a/sklearn_bench/tsne.py b/sklearn_bench/tsne.py
@@ -14,8 +14,10 @@
 # limitations under the License.
 # ===============================================================================
 
-import argparse
 import bench
+import argparse
+import warnings
+warnings.simplefilter(action='ignore', category=FutureWarning)
 
 
 def main():
diff --git a/utils.py b/utils.py
@@ -175,11 +175,12 @@ def generate_cases(params: Dict[str, Union[List[Any], Any]]) -> List[str]:
             commands *= len(values)
             dashes = '-' if len(param) == 1 else '--'
             for command_num in range(prev_len):
-                for value_num in range(len(values)):
-                    commands[prev_len * value_num + command_num] += ' ' + \
-                        dashes + param + ' ' + str(values[value_num])
+                for idx, val in enumerate(values):
+                    commands[prev_len * idx + command_num] += ' ' + \
+                        dashes + param + ' ' + str(val)
         else:
             dashes = '-' if len(param) == 1 else '--'
-            for command_num in range(len(commands)):
-                commands[command_num] += ' ' + dashes + param + ' ' + str(values)
+            for command_num, _ in enumerate(commands):
+                commands[command_num] += ' ' + \
+                    dashes + param + ' ' + str(values)
     return commands
diff --git a/xgboost_bench/requirements.txt b/xgboost_bench/requirements.txt
@@ -3,3 +3,4 @@ pandas
 xgboost
 openpyxl
 tqdm
+requests