From 88565f8a8c646fc8f83a242e52adc43d3c26979f Mon Sep 17 00:00:00 2001
From: dmitrii-kriukov
Date: Wed, 29 Dec 2021 14:09:23 +0300
Subject: [PATCH 1/7] sizes in configs

---
 runner.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/runner.py b/runner.py
index 980e40b87..5e57a2794 100755
--- a/runner.py
+++ b/runner.py
@@ -58,6 +58,9 @@ def get_configs(path: Path) -> List[str]:
                         help='Available floating point data types'
                              'This parameter only marks dtype as available, '
                              'make sure to add the dtype parameter to the config file ')
+    parser.add_argument('--size', type=str, default="small medium large", nargs='+',
+                        choices=("small", "medium", "large"),
+                        help='Available dataset sizes')
     parser.add_argument('--no-intel-optimized', default=False, action='store_true',
                         help='Use Scikit-learn without Intel optimizations')
     parser.add_argument('--output-file', default='results.json',
@@ -105,6 +108,11 @@ def get_configs(path: Path) -> List[str]:
             params = common_params.copy()
             params.update(params_set.copy())
 
+            if 'size' in params:
+                if params['size'] not in args.size:
+                    continue
+                del params['size']
+
             device = []
             if 'device' not in params:
                 if 'sklearn' in params['lib']:

From e668d94480be5fc149e38ddb28df9a0649d00b46 Mon Sep 17 00:00:00 2001
From: dmitrii-kriukov
Date: Wed, 12 Jan 2022 16:45:30 +0300
Subject: [PATCH 2/7] test config

---
 configs/skl_config.json | 71 +++++++++++++++++++++++++++++++++++++++--
 runner.py               |  8 ++---
 2 files changed, 72 insertions(+), 7 deletions(-)

diff --git a/configs/skl_config.json b/configs/skl_config.json
index f3f1fa93f..486177949 100644
--- a/configs/skl_config.json
+++ b/configs/skl_config.json
@@ -19,6 +19,7 @@
                     }
                 }
             ],
+            "workload-size": "small",
             "time-method": "box_filter",
             "time-limit": 50,
             "n-clusters": 1000,
@@ -38,6 +39,7 @@
                     }
                 }
             ],
+            "workload-size": "small",
             "time-method": "box_filter",
             "time-limit": 50,
             "n-clusters": 5,
@@ -47,6 +49,7 @@
         },
         {
             "algorithm": "kmeans",
+            "workload-size": "small",
             "dataset": [
                 {
                     "source": "synthetic",
@@ -66,6 +69,7 @@
         },
         {
             "algorithm": "pca",
+            "workload-size": "small",
             "dataset": [
                 {
                     "source": "synthetic",
@@ -113,6 +117,7 @@
         {
             "algorithm": "df_clsf",
             "dtype": "float32",
+            "workload-size": "small",
             "dataset": [
                 {
                     "source": "npy",
@@ -150,6 +155,7 @@
         },
         {
             "algorithm": "df_regr",
+            "workload-size": "small",
             "dtype": "float32",
             "dataset": [
                 {
@@ -184,6 +190,7 @@
         },
         {
             "algorithm": "ridge",
+            "workload-size": "small",
             "dataset": [
                 {
                     "source": "synthetic",
@@ -206,6 +213,7 @@
         },
         {
             "algorithm": "linear",
+            "workload-size": "small",
             "dataset": [
                 {
                     "source": "synthetic",
@@ -227,6 +235,7 @@
         },
         {
             "algorithm": "log_reg",
+            "workload-size": "small",
             "dataset": [
                 {
                     "source": "synthetic",
@@ -270,6 +279,7 @@
         },
         {
             "algorithm": "svm",
+            "workload-size": "medium",
             "dataset": [
                 {
                     "source": "npy",
@@ -291,6 +301,7 @@
         },
         {
             "algorithm": "svm",
+            "workload-size": "small",
             "dataset": [
                 {
                     "source": "npy",
@@ -312,6 +323,7 @@
         },
         {
             "algorithm": "svm",
+            "workload-size": "medium",
             "dataset": [
                 {
                     "source": "npy",
@@ -333,6 +345,7 @@
         },
         {
             "algorithm": "svm",
+            "workload-size": "medium",
             "dataset": [
                 {
                     "source": "npy",
@@ -354,6 +367,7 @@
         },
         {
             "algorithm": "svm",
+            "workload-size": "medium",
             "dataset": [
                 {
                     "source": "npy",
@@ -375,6 +389,7 @@
         },
         {
             "algorithm": "svm",
+            "workload-size": "large",
             "dataset": [
                 {
                     "source": "npy",
@@ -396,6 +411,7 @@
         },
         {
             "algorithm": "nusvc",
+            "workload-size": "small",
             "dataset": [
                 {
                     "source": "npy",
@@ -417,6 +433,7 @@
         },
         {
             "algorithm": "nusvc",
+            "workload-size": "large",
             "dataset": [
                 {
                     "source": "npy",
@@ -438,6 +455,7 @@
         },
         {
             "algorithm": "nusvc",
+            "workload-size": "medium",
             "dataset": [
                 {
                     "source": "npy",
@@ -459,6 +477,7 @@
         },
         {
             "algorithm": "nusvc",
+            "workload-size": "medium",
             "dataset": [
                 {
                     "source": "npy",
@@ -480,6 +499,7 @@
         },
         {
             "algorithm": "svr",
+            "workload-size": "small",
             "dataset": [
                 {
                     "source": "npy",
@@ -501,6 +521,7 @@
         },
         {
             "algorithm": "svr",
+            "workload-size": "medium",
             "dataset": [
                 {
                     "source": "npy",
@@ -522,6 +543,7 @@
         },
         {
             "algorithm": "nusvr",
+            "workload-size": "small",
             "dataset": [
                 {
                     "source": "npy",
@@ -544,6 +566,7 @@
         },
         {
             "algorithm": "nusvr",
+            "workload-size": "medium",
             "dataset": [
                 {
                     "source": "npy",
@@ -567,6 +590,7 @@
         },
         {
             "algorithm": "nusvr",
+            "workload-size": "large",
             "dataset": [
                 {
                     "source": "npy",
@@ -589,6 +613,7 @@
         },
         {
             "algorithm": "dbscan",
+            "workload-size": "small",
             "dataset": [
                 {
                     "source": "synthetic",
@@ -607,7 +632,13 @@
                     "training": {
                         "n_samples": 500000
                     }
-                },
+                }
+            ]
+        },
+        {
+            "algorithm": "dbscan",
+            "workload-size": "medium",
+            "dataset": [
                 {
                     "source": "synthetic",
                     "type": "blobs",
@@ -621,6 +652,7 @@
         },
         {
             "algorithm": "knn_clsf",
+            "workload-size": "small",
             "dtype": "float32",
             "dataset": [
                 {
@@ -658,7 +690,15 @@
                     "testing": {
                         "n_samples": 20000
                     }
-                },
+                }
+            ],
+            "method": ["brute", "kd_tree"]
+        },
+        {
+            "algorithm": "knn_clsf",
+            "workload-size": "small",
+            "dtype": "float32",
+            "dataset": [
                 {
                     "source": "synthetic",
                     "type": "classification",
@@ -672,10 +712,31 @@
                     }
                 }
             ],
-            "method": ["brute", "kd_tree"]
+            "method": "kd_tree"
+        },
+        {
+            "algorithm": "knn_clsf",
+            "workload-size": "medium",
+            "dtype": "float32",
+            "dataset": [
+                {
+                    "source": "synthetic",
+                    "type": "classification",
+                    "n_classes": 10,
+                    "n_features": 16,
+                    "training": {
+                        "n_samples": 250000
+                    },
+                    "testing": {
+                        "n_samples": 250000
+                    }
+                }
+            ],
+            "method": "brute"
         },
         {
             "algorithm": "train_test_split",
+            "workload-size": "small",
             "dataset": [
                 {
                     "source": "synthetic",
@@ -711,6 +772,7 @@
         },
         {
             "algorithm": "train_test_split",
+            "workload-size": "small",
             "dataset": [
                 {
                     "source": "npy",
@@ -730,6 +792,7 @@
         },
         {
             "algorithm": "lasso",
+            "workload-size": "small",
             "dataset": [
                 {
                     "source": "npy",
@@ -746,6 +809,7 @@
         },
         {
             "algorithm": "elasticnet",
+            "workload-size": "small",
             "dataset": [
                 {
                     "source": "npy",
@@ -768,6 +832,7 @@
         },
         {
             "algorithm": "tsne",
+            "workload-size": "large",
             "dataset": [
                 {
                     "source": "npy",
diff --git a/runner.py b/runner.py
index 5e57a2794..130b3c01d 100755
--- a/runner.py
+++ b/runner.py
@@ -58,7 +58,7 @@ def get_configs(path: Path) -> List[str]:
                         help='Available floating point data types'
                              'This parameter only marks dtype as available, '
                              'make sure to add the dtype parameter to the config file ')
-    parser.add_argument('--size', type=str, default="small medium large", nargs='+',
+    parser.add_argument('--workload-size', type=str, default="small medium large", nargs='+',
                         choices=("small", "medium", "large"),
                         help='Available dataset sizes')
     parser.add_argument('--no-intel-optimized', default=False, action='store_true',
@@ -108,10 +108,10 @@ def get_configs(path: Path) -> List[str]:
             params = common_params.copy()
             params.update(params_set.copy())
 
-            if 'size' in params:
-                if params['size'] not in args.size:
+            if 'workload-size' in params:
+                if params['workload-size'] not in args.workload_size:
                     continue
-                del params['size']
+                del params['workload-size']
 
             device = []
             if 'device' not in params:

From 6272ea525c8155efb896525e6d1672e779fca3b3 Mon Sep 17 00:00:00 2001
From: dmitrii-kriukov
Date: Mon, 17 Jan 2022 19:11:30 +0300
Subject: [PATCH 3/7] knn_svm

---
 configs/xpu/knn_clsf.json         | 162 +++++++++++++++++
 configs/xpu/knn_regr.json         |  69 +++++++
 configs/xpu/svm.json              | 192 ++++++++++++++++++++
 datasets/load_datasets.py         |  14 +-
 datasets/loader_classification.py | 289 +++++++++++++++++++++++++++++-
 5 files changed, 722 insertions(+), 4 deletions(-)
 create mode 100644 configs/xpu/knn_clsf.json
 create mode 100644 configs/xpu/knn_regr.json
 create mode 100644 configs/xpu/svm.json

diff --git a/configs/xpu/knn_clsf.json b/configs/xpu/knn_clsf.json
new file mode 100644
index 000000000..2d72c4ade
--- /dev/null
+++ b/configs/xpu/knn_clsf.json
@@ -0,0 +1,162 @@
+{
+    "common": {
+        "lib": "sklearn",
+        "algorithm": "knn_clsf",
+        "data-format": "pandas",
+        "data-order": "F",
+        "dtype": ["float32", "float64"],
+        "device": ["host", "cpu", "gpu", "none"]
+    },
+    "cases": [
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "epsilon_100K",
+                    "training":
+                    {
+                        "x": "data/epsilon_100K_x_train.npy",
+                        "y": "data/epsilon_100K_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/epsilon_100K_x_test.npy",
+                        "y": "data/epsilon_100K_y_test.npy"
+                    }
+                }
+            ],
+            "n-neighbors": [2, 100]
+        },
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "higgs_150K",
+                    "training":
+                    {
+                        "x": "data/higgs_150K_x_train.npy",
+                        "y": "data/higgs_150K_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/higgs_150K_x_test.npy",
+                        "y": "data/higgs_150K_y_test.npy"
+                    }
+                },
+                {
+                    "source": "npy",
+                    "name": "hepmass_150K",
+                    "training":
+                    {
+                        "x": "data/hepmass_150K_x_train.npy",
+                        "y": "data/hepmass_150K_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/hepmass_150K_x_test.npy",
+                        "y": "data/hepmass_150K_y_test.npy"
+                    }
+                }
+            ],
+            "n-neighbors": [5, 100]
+        },
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "cifar",
+                    "training":
+                    {
+                        "x": "data/cifar_x_train.npy",
+                        "y": "data/cifar_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/cifar_x_test.npy",
+                        "y": "data/cifar_y_test.npy"
+                    }
+                }
+            ],
+            "n-neighbors": 7
+        },
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "mnist",
+                    "training":
+                    {
+                        "x": "data/mnist_x_train.npy",
+                        "y": "data/mnist_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/mnist_x_test.npy",
+                        "y": "data/mnist_y_test.npy"
+                    }
+                }
+            ],
+            "n-neighbors": 5
+        },
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "epsilon_100K",
+                    "training":
+                    {
+                        "x": "data/epsilon_100K_x_train.npy",
+                        "y": "data/epsilon_100K_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/epsilon_100K_x_test.npy",
+                        "y": "data/epsilon_100K_y_test.npy"
+                    }
+                }
+            ],
+            "task": "search",
+            "n-neighbors": 2
+        },
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "higgs_150K",
+                    "training":
+                    {
+                        "x": "data/higgs_150K_x_train.npy",
+                        "y": "data/higgs_150K_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/higgs_150K_x_test.npy",
+                        "y": "data/higgs_150K_y_test.npy"
+                    }
+                }
+            ],
+            "task": "search",
+            "n-neighbors": 5
+        },
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "cifar",
+                    "training":
+                    {
+                        "x": "data/cifar_x_train.npy",
+                        "y": "data/cifar_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/cifar_x_test.npy",
+                        "y": "data/cifar_y_test.npy"
+                    }
+                }
+            ],
+            "task": "search",
+            "n-neighbors": 7
+        }
+    ]
+}
diff --git a/configs/xpu/knn_regr.json b/configs/xpu/knn_regr.json
new file mode 100644
index 000000000..ec1fbc9a9
--- /dev/null
+++ b/configs/xpu/knn_regr.json
@@ -0,0 +1,69 @@
+{
+    "common": {
+        "lib": "sklearn",
+        "algorithm": "knn_regr",
+        "data-format": "pandas",
+        "data-order": "F",
+        "dtype": ["float32", "float64"],
+        "device": ["host", "cpu", "gpu", "none"]
+    },
+    "cases": [
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "year_prediction_msd",
+                    "training":
+                    {
+                        "x": "data/year_prediction_msd_x_train.npy",
+                        "y": "data/year_prediction_msd_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/year_prediction_msd_x_test.npy",
+                        "y": "data/year_prediction_msd_y_test.npy"
+                    }
+                }
+            ],
+            "n-neighbors": 2
+        },
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "higgs_150K",
+                    "training":
+                    {
+                        "x": "data/higgs_150K_x_train.npy",
+                        "y": "data/higgs_150K_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/higgs_150K_x_test.npy",
+                        "y": "data/higgs_150K_y_test.npy"
+                    }
+                }
+            ],
+            "n-neighbors": 5
+        },
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "cifar",
+                    "training":
+                    {
+                        "x": "data/cifar_x_train.npy",
+                        "y": "data/cifar_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/cifar_x_test.npy",
+                        "y": "data/cifar_y_test.npy"
+                    }
+                }
+            ],
+            "n-neighbors": 7
+        }
+    ]
+}
diff --git a/configs/xpu/svm.json b/configs/xpu/svm.json
new file mode 100644
index 000000000..a98377532
--- /dev/null
+++ b/configs/xpu/svm.json
@@ -0,0 +1,192 @@
+{
+    "common": {
+        "lib": "sklearn",
+        "algorithm": "svm",
+        "data-format": "pandas",
+        "data-order": "F",
+        "dtype": ["float32", "float64"],
+        "device": ["host", "cpu", "gpu", "none"]
+    },
+    "cases": [
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "gisette",
+                    "training":
+                    {
+                        "x": "data/gisette_x_train.npy",
+                        "y": "data/gisette_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/gisette_x_test.npy",
+                        "y": "data/gisette_y_test.npy"
+                    }
+                }
+            ],
+            "C": 1.5e-3,
+            "kernel": "linear"
+        },
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "higgs_150K",
+                    "training":
+                    {
+                        "x": "data/higgs_150K_x_train.npy",
+                        "y": "data/higgs_150K_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/higgs_150K_x_test.npy",
+                        "y": "data/higgs_150K_y_test.npy"
+                    }
+                }
+            ],
+            "C": 1.0,
+            "kernel": "linear"
+        },
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "epsilon_80K",
+                    "training":
+                    {
+                        "x": "data/epsilon_80K_x_train.npy",
+                        "y": "data/epsilon_80K_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/epsilon_80K_x_test.npy",
+                        "y": "data/epsilon_80K_y_test.npy"
+                    }
+                }
+            ],
+            "C": 1.0,
+            "kernel": "linear"
+        },
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "cifar",
+                    "training":
+                    {
+                        "x": "data/cifar_x_train.npy",
+                        "y": "data/cifar_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/cifar_x_test.npy",
+                        "y": "data/cifar_y_test.npy"
+                    }
+                }
+            ],
+            "C": 1.0e-7,
+            "kernel": "linear"
+        },
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "imb_drama",
+                    "training":
+                    {
+                        "x": "data/imb_drama_x_train.npy",
+                        "y": "data/imb_drama_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/imb_drama_x_train.npy",
+                        "y": "data/imb_drama_y_train.npy"
+                    }
+                }
+            ],
+            "C": 1e-3,
+            "kernel": "linear"
+        },
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "epsilon_16K",
+                    "training":
+                    {
+                        "x": "data/epsilon_16K_x_train.npy",
+                        "y": "data/epsilon_16K_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/epsilon_16K_x_test.npy",
+                        "y": "data/epsilon_16K_y_test.npy"
+                    }
+                }
+            ],
+            "C": 9.0e2,
+            "kernel": "rbf"
+        },
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "covtype_binary",
+                    "training":
+                    {
+                        "x": "data/covtype_binary_x_train.npy",
+                        "y": "data/covtype_binary_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/covtype_binary_x_test.npy",
+                        "y": "data/covtype_binary_y_test.npy"
+                    }
+                }
+            ],
+            "C": 1000.0,
+            "kernel": "rbf"
+        },
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "epsilon_80K",
+                    "training":
+                    {
+                        "x": "data/epsilon_80K_x_train.npy",
+                        "y": "data/epsilon_80K_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/epsilon_80K_x_test.npy",
+                        "y": "data/epsilon_80K_y_test.npy"
+                    }
+                }
+            ],
+            "C": 1000.0,
+            "kernel": "rbf"
+        },
+        {
+            "dataset": [
+                {
+                    "source": "npy",
+                    "name": "imb_drama",
+                    "training":
+                    {
+                        "x": "data/imb_drama_x_train.npy",
+                        "y": "data/imb_drama_y_train.npy"
+                    },
+                    "testing":
+                    {
+                        "x": "data/imb_drama_x_train.npy",
+                        "y": "data/imb_drama_y_train.npy"
+                    }
+                }
+            ],
+            "C": 50,
+            "kernel": "rbf"
+        }
+    ]
+}
diff --git a/datasets/load_datasets.py b/datasets/load_datasets.py
index 0a7874d92..fbd7685d4 100644
--- a/datasets/load_datasets.py
+++ b/datasets/load_datasets.py
@@ -22,8 +22,10 @@ from typing import Callable, Dict
 
 from .loader_classification import (a_nine_a, airline, airline_ohe, bosch,
-                                    census, codrnanorm, creditcard, epsilon, fraud,
-                                    gisette, hepmass_150K, higgs, higgs_one_m, ijcnn,
+                                    census, cifar, codrnanorm, covtype_binary, creditcard,
+                                    epsilon_16K, epsilon_80K, epsilon, epsilon_100K,
+                                    fraud, gisette, hepmass_150K,
+                                    higgs, higgs_one_m, higgs_150K, ijcnn, imb_drama,
                                     klaverjas, santander, skin_segmentation, susy)
 from .loader_multiclass import (connect, covertype, covtype, letters, mlsr,
                                 mnist, msrank, plasticc, sensit)
@@ -40,19 +42,25 @@
     "bosch": bosch,
     "california_housing": california_housing,
     "census": census,
+    "cifar": cifar,
     "codrnanorm": codrnanorm,
     "connect": connect,
-    "covertype": covertype,
+    "covtype_binary": covtype_binary,
     "covtype": covtype,
     "creditcard": creditcard,
     "epsilon": epsilon,
+    "epsilon_16K": epsilon_16K,
+    "epsilon_80K": epsilon_80K,
+    "epsilon_100K": epsilon_100K,
     "fraud": fraud,
     "fried": fried,
     "gisette": gisette,
     "hepmass_150K": hepmass_150K,
     "higgs": higgs,
     "higgs1m": higgs_one_m,
+    "higgs_150K": higgs_150K,
     "ijcnn": ijcnn,
+    "imb_drama": imb_drama,
     "klaverjas": klaverjas,
     "letters": letters,
     "mlsr": mlsr,
diff --git a/datasets/loader_classification.py b/datasets/loader_classification.py
index fc3cb892d..ffb84f12f 100644
--- a/datasets/loader_classification.py
+++ b/datasets/loader_classification.py
@@ -22,7 +22,7 @@
 
 import numpy as np
 import pandas as pd
-from sklearn.datasets import fetch_openml, load_svmlight_file
+from sklearn.datasets import fetch_openml, load_svmlight_file, fetch_covtype
 from sklearn.model_selection import train_test_split
 
 from .loader_utils import retrieve
@@ -261,6 +261,41 @@ def codrnanorm(dataset_dir: Path) -> bool:
     return True
 
 
+def covtype_binary(dataset_dir: Path) -> bool:
+    """
+    Cover type dataset from UCI machine learning repository
+    https://archive.ics.uci.edu/ml/datasets/covertype
+
+    The original 7 class labels are binarized to y = (y > 3).
+    Classification task. n_classes = 2.
+    covtype_binary X train dataset (100000, 54)
+    covtype_binary y train dataset (100000, 1)
+    covtype_binary X test dataset (100000, 54)
+    covtype_binary y test dataset (100000, 1)
+    """
+    dataset_name = 'covtype_binary'
+    os.makedirs(dataset_dir, exist_ok=True)
+
+    nrows_train, nrows_test = 100000, 100000
+    logging.info(f'Started loading {dataset_name}')
+    X, y = fetch_covtype(return_X_y=True)  # pylint: disable=unexpected-keyword-arg
+    logging.info(f'{dataset_name} is loaded, started parsing...')
+
+    y = (y > 3).astype(int)
+
+    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=77,
+                                                        train_size=nrows_train,
+                                                        test_size=nrows_test,
+                                                        shuffle=False
+                                                        )
+    for data, name in zip((X_train, X_test, y_train, y_test),
+                          ('x_train', 'x_test', 'y_train', 'y_test')):
+        filename = f'{dataset_name}_{name}.npy'
+        np.save(os.path.join(dataset_dir, filename), data)
+    logging.info(f'dataset {dataset_name} is ready.')
+    return True
+
+
 def creditcard(dataset_dir: Path) -> bool:
     """
     Classification task. n_classes = 2.
@@ -334,6 +369,150 @@ def epsilon(dataset_dir: Path) -> bool:
     return True
 
 
+def epsilon_16K(dataset_dir: Path) -> bool:
+    """
+    Epsilon dataset
+    https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html
+
+    Classification task. n_classes = 2.
+    epsilon_16K x train dataset (16000, 2000)
+    epsilon_16K y train dataset (16000, 1)
+    epsilon_16K x test dataset (16000, 2000)
+    epsilon_16K y test dataset (16000, 1)
+    """
+    dataset_name = 'epsilon_16K'
+    os.makedirs(dataset_dir, exist_ok=True)
+
+    url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \
+                '/epsilon_normalized.bz2'
+    url_test = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \
+               '/epsilon_normalized.t.bz2'
+    local_url_train = os.path.join(dataset_dir, os.path.basename(url_train))
+    local_url_test = os.path.join(dataset_dir, os.path.basename(url_test))
+
+    num_train, num_test, dtype = 16000, 16000, np.float32
+    if not os.path.isfile(local_url_train):
+        logging.info(f'Started loading {dataset_name}, train')
+        retrieve(url_train, local_url_train)
+    if not os.path.isfile(local_url_test):
+        logging.info(f'Started loading {dataset_name}, test')
+        retrieve(url_test, local_url_test)
+    logging.info(f'{dataset_name} is loaded, started parsing...')
+    X_train, y_train = load_svmlight_file(local_url_train,
+                                          dtype=dtype)
+    X_test, y_test = load_svmlight_file(local_url_test,
+                                        dtype=dtype)
+    X_train = X_train.toarray()[:num_train]
+    X_test = X_test.toarray()[:num_test]
+    y_train = y_train[:num_train]
+    y_train[y_train <= 0] = 0
+    y_test = y_test[:num_test]
+    y_test[y_test <= 0] = 0
+
+    for data, name in zip((X_train, X_test, y_train, y_test),
+                          ('x_train', 'x_test', 'y_train', 'y_test')):
+        filename = f'{dataset_name}_{name}.npy'
+        np.save(os.path.join(dataset_dir, filename), data)
+    logging.info(f'dataset {dataset_name} is ready.')
+    return True
+
+
+def epsilon_100K(dataset_dir: Path) -> bool:
+    """
+    Epsilon dataset
+    https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html
+
+    Classification task. n_classes = 2.
+    epsilon_100K x train dataset (50000, 2000)
+    epsilon_100K y train dataset (50000, 1)
+    epsilon_100K x test dataset (50000, 2000)
+    epsilon_100K y test dataset (50000, 1)
+    """
+    dataset_name = 'epsilon_100K'
+    os.makedirs(dataset_dir, exist_ok=True)
+
+    url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \
+                '/epsilon_normalized.bz2'
+    url_test = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \
+               '/epsilon_normalized.t.bz2'
+    local_url_train = os.path.join(dataset_dir, os.path.basename(url_train))
+    local_url_test = os.path.join(dataset_dir, os.path.basename(url_test))
+
+    num_train, num_test, dtype = 50000, 50000, np.float32
+    if not os.path.isfile(local_url_train):
+        logging.info(f'Started loading {dataset_name}, train')
+        retrieve(url_train, local_url_train)
+    if not os.path.isfile(local_url_test):
+        logging.info(f'Started loading {dataset_name}, test')
+        retrieve(url_test, local_url_test)
+    logging.info(f'{dataset_name} is loaded, started parsing...')
+    X_train, y_train = load_svmlight_file(local_url_train,
+                                          dtype=dtype)
+    X_test, y_test = load_svmlight_file(local_url_test,
+                                        dtype=dtype)
+    X_train = X_train.toarray()[:num_train]
+    X_test = X_test.toarray()[:num_test]
+    y_train = y_train[:num_train]
+    y_train[y_train <= 0] = 0
+    y_test = y_test[:num_test]
+    y_test[y_test <= 0] = 0
+
+    for data, name in zip((X_train, X_test, y_train, y_test),
+                          ('x_train', 'x_test', 'y_train', 'y_test')):
+        filename = f'{dataset_name}_{name}.npy'
+        np.save(os.path.join(dataset_dir, filename), data)
+    logging.info(f'dataset {dataset_name} is ready.')
+    return True
+
+
+def epsilon_80K(dataset_dir: Path) -> bool:
+    """
+    Epsilon dataset
+    https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html
+
+    Classification task. n_classes = 2.
+    epsilon_80K x train dataset (80000, 2000)
+    epsilon_80K y train dataset (80000, 1)
+    epsilon_80K x test dataset (80000, 2000)
+    epsilon_80K y test dataset (80000, 1)
+    """
+    dataset_name = 'epsilon_80K'
+    os.makedirs(dataset_dir, exist_ok=True)
+
+    url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \
+                '/epsilon_normalized.bz2'
+    url_test = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \
+               '/epsilon_normalized.t.bz2'
+    local_url_train = os.path.join(dataset_dir, os.path.basename(url_train))
+    local_url_test = os.path.join(dataset_dir, os.path.basename(url_test))
+
+    num_train, num_test, dtype = 80000, 80000, np.float32
+    if not os.path.isfile(local_url_train):
+        logging.info(f'Started loading {dataset_name}, train')
+        retrieve(url_train, local_url_train)
+    if not os.path.isfile(local_url_test):
+        logging.info(f'Started loading {dataset_name}, test')
+        retrieve(url_test, local_url_test)
+    logging.info(f'{dataset_name} is loaded, started parsing...')
+    X_train, y_train = load_svmlight_file(local_url_train,
+                                          dtype=dtype)
+    X_test, y_test = load_svmlight_file(local_url_test,
+                                        dtype=dtype)
+    X_train = X_train.toarray()[:num_train]
+    X_test = X_test.toarray()[:num_test]
+    y_train = y_train[:num_train]
+    y_train[y_train <= 0] = 0
+    y_test = y_test[:num_test]
+    y_test[y_test <= 0] = 0
+
+    for data, name in zip((X_train, X_test, y_train, y_test),
+                          ('x_train', 'x_test', 'y_train', 'y_test')):
+        filename = f'{dataset_name}_{name}.npy'
+        np.save(os.path.join(dataset_dir, filename), data)
+    logging.info(f'dataset {dataset_name} is ready.')
+    return True
+
+
 def fraud(dataset_dir: Path) -> bool:
     """
     Credit Card Fraud Detection contest
@@ -576,6 +755,46 @@ def higgs_one_m(dataset_dir: Path) -> bool:
     return True
 
 
+def higgs_150K(dataset_dir: Path) -> bool:
+    """
+    Higgs dataset from UCI machine learning repository
+    https://archive.ics.uci.edu/ml/datasets/HIGGS
+
+    Classification task. n_classes = 2.
+    higgs_150K X train dataset (100000, 28)
+    higgs_150K y train dataset (100000, 1)
+    higgs_150K X test dataset (50000, 28)
+    higgs_150K y test dataset (50000, 1)
+    """
+    dataset_name = 'higgs_150K'
+    os.makedirs(dataset_dir, exist_ok=True)
+
+    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz'
+    local_url = os.path.join(dataset_dir, os.path.basename(url))
+    if not os.path.isfile(local_url):
+        logging.info(f'Started loading {dataset_name}')
+        retrieve(url, local_url)
+    logging.info(f'{dataset_name} is loaded, started parsing...')
+
+    nrows_train, nrows_test, dtype = 100000, 50000, np.float32
+    data: Any = pd.read_csv(local_url, delimiter=",", header=None,
+                            compression="gzip", dtype=dtype,
+                            nrows=nrows_train + nrows_test)
+
+    X = data[data.columns[1:]]
+    y = data[data.columns[0:1]]
+
+    x_train, x_test, y_train, y_test = train_test_split(
+        X, y, train_size=nrows_train, test_size=nrows_test, shuffle=False)
+
+    for data, name in zip((x_train, x_test, y_train, y_test),
+                          ('x_train', 'x_test', 'y_train', 'y_test')):
+        filename = f'{dataset_name}_{name}.npy'
+        np.save(os.path.join(dataset_dir, filename), data)
+    logging.info(f'dataset {dataset_name} is ready.')
+    return True
+
+
 def ijcnn(dataset_dir: Path) -> bool:
     """
     Author: Danil Prokhorov.
@@ -611,6 +830,28 @@ def ijcnn(dataset_dir: Path) -> bool:
     logging.info(f'dataset {dataset_name} is ready.')
     return True
 
+def imb_drama(dataset_dir: Path) -> bool:
+    """
+    IMDB.drama dataset from OpenML Datasets (
+    https://www.openml.org/d/273)
+
+    Classification task.
+    Number of features: 1001
+    Number of instances: 120919
+    """
+    dataset_name = 'imb_drama'
+    os.makedirs(dataset_dir, exist_ok=True)
+
+    x_train, y_train = fetch_openml('IMDB.drama', return_X_y=True,
+                                    as_frame=False, data_home=dataset_dir)
+    logging.info(f'{dataset_name} is loaded, started parsing...')
+    for data, name in zip((x_train.todense(), y_train),
+                          ('x_train', 'y_train')):
+        filename = f'{dataset_name}_{name}.npy'
+        np.save(os.path.join(dataset_dir, filename), data)
+    logging.info(f'dataset {dataset_name} is ready.')
+    return True
+
 
 def klaverjas(dataset_dir: Path) -> bool:
     """
@@ -726,3 +967,49 @@ def susy(dataset_dir: Path) -> bool:
         np.save(os.path.join(dataset_dir, filename), data)
     logging.info(f'dataset {dataset_name} is ready.')
     return True
+
+
+def cifar(dataset_dir: Path) -> bool:
+    """
+    CIFAR-10 dataset from LIBSVM Datasets (original source:
+    https://www.cs.toronto.edu/~kriz/cifar.html)
+    TaskType: Classification
+    cifar x train dataset (50000, 3072)
+    cifar y train dataset (50000, 1)
+    cifar x test dataset (10000, 3072)
+    cifar y test dataset (10000, 1)
+    """
+    dataset_name = 'cifar'
+    os.makedirs(dataset_dir, exist_ok=True)
+
+    url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/cifar10.bz2'
+    url_test = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/cifar10.t.bz2'
+    local_url_train = os.path.join(dataset_dir, os.path.basename(url_train))
+    local_url_test = os.path.join(dataset_dir, os.path.basename(url_test))
+
+    if not os.path.isfile(local_url_train):
+        logging.info(f'Started loading {dataset_name}, train')
+        retrieve(url_train, local_url_train)
+    logging.info(f'{dataset_name} is loaded, started parsing...')
+    x_train, y_train = load_svmlight_file(local_url_train,
+                                          dtype=np.float32)
+
+    if not os.path.isfile(local_url_test):
+        logging.info(f'Started loading {dataset_name}, test')
+        retrieve(url_test, local_url_test)
+    logging.info(f'{dataset_name} is loaded, started parsing...')
+    x_test, y_test = load_svmlight_file(local_url_test,
+                                        dtype=np.float32)
+
+    x_train = x_train.toarray()
+    y_train[y_train <= 0] = 0
+
+    x_test = x_test.toarray()
+    y_test[y_test <= 0] = 0
+
+    for data, name in zip((x_train, x_test, y_train, y_test),
+                          ('x_train', 'x_test', 'y_train', 'y_test')):
+        filename = f'{dataset_name}_{name}.npy'
+        np.save(os.path.join(dataset_dir, filename), data)
+    return True
+
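The three epsilon_*K loaders added in PATCH 3 differ only in their row counts, and higgs_150K and cifar follow the same retrieve-parse-cache shape. A minimal sketch of that shared pattern, assuming the repository's datasets/loader_utils.py retrieve helper; the consolidated epsilon_subset function below is illustrative and not part of the patch:

import logging
import os
from pathlib import Path

import numpy as np
from sklearn.datasets import load_svmlight_file

from datasets.loader_utils import retrieve  # existing helper used by the loaders above


def epsilon_subset(dataset_dir: Path, dataset_name: str, num_rows: int) -> bool:
    # Download the LIBSVM epsilon archive once, then cache a truncated .npy copy.
    os.makedirs(dataset_dir, exist_ok=True)
    url = ('https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary'
           '/epsilon_normalized.bz2')
    local_path = os.path.join(dataset_dir, os.path.basename(url))
    if not os.path.isfile(local_path):
        logging.info(f'Started loading {dataset_name}')
        retrieve(url, local_path)
    X, y = load_svmlight_file(local_path, dtype=np.float32)
    X = X.toarray()[:num_rows]  # densify the sparse matrix, keep only num_rows rows
    y = y[:num_rows]
    y[y <= 0] = 0               # remap the {-1, +1} labels to {0, 1}
    for data, name in zip((X, y), ('x_train', 'y_train')):
        np.save(os.path.join(dataset_dir, f'{dataset_name}_{name}.npy'), data)
    return True
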
From 81868d37ec55ec0be33d1ebe62849e7c97305eb4 Mon Sep 17 00:00:00 2001
From: dmitrii-kriukov
Date: Mon, 17 Jan 2022 19:16:10 +0300
Subject: [PATCH 4/7] Revert "knn_svm"

This reverts commit 6272ea525c8155efb896525e6d1672e779fca3b3.
---
 configs/xpu/knn_clsf.json         | 162 -----------------
 configs/xpu/knn_regr.json         |  69 -------
 configs/xpu/svm.json              | 192 --------------------
 datasets/load_datasets.py         |  14 +-
 datasets/loader_classification.py | 289 +-----------------------------
 5 files changed, 4 insertions(+), 722 deletions(-)
 delete mode 100644 configs/xpu/knn_clsf.json
 delete mode 100644 configs/xpu/knn_regr.json
 delete mode 100644 configs/xpu/svm.json

diff --git a/configs/xpu/knn_clsf.json b/configs/xpu/knn_clsf.json
deleted file mode 100644
index 2d72c4ade..000000000
--- a/configs/xpu/knn_clsf.json
+++ /dev/null
@@ -1,162 +0,0 @@
-{
-    "common": {
-        "lib": "sklearn",
-        "algorithm": "knn_clsf",
-        "data-format": "pandas",
-        "data-order": "F",
-        "dtype": ["float32", "float64"],
-        "device": ["host", "cpu", "gpu", "none"]
-    },
-    "cases": [
-        {
-            "dataset": [
-                {
-                    "source": "npy",
-                    "name": "epsilon_100K",
-                    "training":
-                    {
-                        "x": "data/epsilon_100K_x_train.npy",
-                        "y": "data/epsilon_100K_y_train.npy"
-                    },
-                    "testing":
-                    {
-                        "x": "data/epsilon_100K_x_test.npy",
-                        "y": "data/epsilon_100K_y_test.npy"
-                    }
-                }
-            ],
-            "n-neighbors": [2, 100]
-        },
-        {
-            "dataset": [
-                {
-                    "source": "npy",
-                    "name": "higgs_150K",
-                    "training":
-                    {
-                        "x": "data/higgs_150K_x_train.npy",
-                        "y": "data/higgs_150K_y_train.npy"
-                    },
-                    "testing":
-                    {
-                        "x": "data/higgs_150K_x_test.npy",
-                        "y": "data/higgs_150K_y_test.npy"
-                    }
-                },
-                {
-                    "source": "npy",
-                    "name": "hepmass_150K",
-                    "training":
-                    {
-                        "x": "data/hepmass_150K_x_train.npy",
-                        "y": "data/hepmass_150K_y_train.npy"
-                    },
-                    "testing":
-                    {
-                        "x": "data/hepmass_150K_x_test.npy",
-                        "y": "data/hepmass_150K_y_test.npy"
-                    }
-                }
-            ],
-            "n-neighbors": [5, 100]
-        },
-        {
-            "dataset": [
-                {
-                    "source": "npy",
-                    "name": "cifar",
-                    "training":
-                    {
-                        "x": "data/cifar_x_train.npy",
-                        "y": "data/cifar_y_train.npy"
-                    },
-                    "testing":
-                    {
-                        "x": "data/cifar_x_test.npy",
-                        "y": "data/cifar_y_test.npy"
-                    }
-                }
-            ],
-            "n-neighbors": 7
-        },
-        {
-            "dataset": [
-                {
-                    "source": "npy",
-                    "name": "mnist",
-                    "training":
-                    {
-                        "x": "data/mnist_x_train.npy",
-                        "y": "data/mnist_y_train.npy"
-                    },
-                    "testing":
-                    {
-                        "x": "data/mnist_x_test.npy",
-                        "y": "data/mnist_y_test.npy"
-                    }
-                }
-            ],
-            "n-neighbors": 5
-        },
-        {
-            "dataset": [
-                {
-                    "source": "npy",
-                    "name": "epsilon_100K",
-                    "training":
-                    {
-                        "x": "data/epsilon_100K_x_train.npy",
-                        "y": "data/epsilon_100K_y_train.npy"
-                    },
-                    "testing":
-                    {
-                        "x": "data/epsilon_100K_x_test.npy",
-                        "y": "data/epsilon_100K_y_test.npy"
-                    }
-                }
-            ],
-            "task": "search",
-            "n-neighbors": 2
-        },
-        {
-            "dataset": [
-                {
-                    "source": "npy",
-                    "name": "higgs_150K",
-                    "training":
-                    {
-                        "x": "data/higgs_150K_x_train.npy",
-                        "y": "data/higgs_150K_y_train.npy"
-                    },
-                    "testing":
-                    {
-                        "x": "data/higgs_150K_x_test.npy",
-                        "y": "data/higgs_150K_y_test.npy"
-                    }
-                }
-            ],
-            "task": "search",
-            "n-neighbors": 5
-        },
-        {
-            "dataset": [
-                {
-                    "source": "npy",
-                    "name": "cifar",
-                    "training":
-                    {
-                        "x": "data/cifar_x_train.npy",
-                        "y": "data/cifar_y_train.npy"
-                    },
-                    "testing":
-                    {
-                        "x": "data/cifar_x_test.npy",
-                        "y": "data/cifar_y_test.npy"
-                    }
-                }
-            ],
-            "task": "search",
-            "n-neighbors": 7
-        }
-    ]
-}
diff --git a/configs/xpu/knn_regr.json b/configs/xpu/knn_regr.json
deleted file mode 100644
index ec1fbc9a9..000000000
--- a/configs/xpu/knn_regr.json
+++ /dev/null
@@ -1,69 +0,0 @@
-{
-    "common": {
-        "lib": "sklearn",
-        "algorithm": "knn_regr",
-        "data-format": "pandas",
-        "data-order": "F",
-        "dtype": ["float32", "float64"],
-        "device": ["host", "cpu", "gpu", "none"]
-    },
-    "cases": [
-        {
-            "dataset": [
-                {
-                    "source": "npy",
-                    "name": "year_prediction_msd",
-                    "training":
-                    {
-                        "x": "data/year_prediction_msd_x_train.npy",
-                        "y": "data/year_prediction_msd_y_train.npy"
-                    },
-                    "testing":
-                    {
-                        "x": "data/year_prediction_msd_x_test.npy",
-                        "y": "data/year_prediction_msd_y_test.npy"
-                    }
-                }
-            ],
-            "n-neighbors": 2
-        },
-        {
-            "dataset": [
-                {
-                    "source": "npy",
-                    "name": "higgs_150K",
-                    "training":
-                    {
-                        "x": "data/higgs_150K_x_train.npy",
-                        "y": "data/higgs_150K_y_train.npy"
-                    },
-                    "testing":
-                    {
-                        "x": "data/higgs_150K_x_test.npy",
-                        "y": "data/higgs_150K_y_test.npy"
-                    }
-                }
-            ],
-            "n-neighbors": 5
-        },
-        {
-            "dataset": [
-                {
-                    "source": "npy",
-                    "name": "cifar",
-                    "training":
-                    {
-                        "x": "data/cifar_x_train.npy",
-                        "y": "data/cifar_y_train.npy"
-                    },
-                    "testing":
-                    {
-                        "x": "data/cifar_x_test.npy",
-                        "y": "data/cifar_y_test.npy"
-                    }
-                }
-            ],
-            "n-neighbors": 7
-        }
-    ]
-}
diff --git a/configs/xpu/svm.json b/configs/xpu/svm.json
deleted file mode 100644
index a98377532..000000000
--- a/configs/xpu/svm.json
+++ /dev/null
@@ -1,192 +0,0 @@
-{
-    "common": {
-        "lib": "sklearn",
-        "algorithm": "svm",
-        "data-format": "pandas",
-        "data-order": "F",
-        "dtype": ["float32", "float64"],
-        "device": ["host", "cpu", "gpu", "none"]
-    },
-    "cases": [
-        {
-            "dataset": [
-                {
-                    "source": "npy",
-                    "name": "gisette",
-                    "training":
-                    {
-                        "x": "data/gisette_x_train.npy",
-                        "y": "data/gisette_y_train.npy"
-                    },
-                    "testing":
-                    {
-                        "x": "data/gisette_x_test.npy",
-                        "y": "data/gisette_y_test.npy"
-                    }
-                }
-            ],
-            "C": 1.5e-3,
-            "kernel": "linear"
-        },
-        {
-            "dataset": [
-                {
-                    "source": "npy",
-                    "name": "higgs_150K",
-                    "training":
-                    {
-                        "x": "data/higgs_150K_x_train.npy",
-                        "y": "data/higgs_150K_y_train.npy"
-                    },
-                    "testing":
-                    {
-                        "x": "data/higgs_150K_x_test.npy",
-                        "y": "data/higgs_150K_y_test.npy"
-                    }
-                }
-            ],
-            "C": 1.0,
-            "kernel": "linear"
-        },
-        {
-            "dataset": [
-                {
-                    "source": "npy",
-                    "name": "epsilon_80K",
-                    "training":
-                    {
-                        "x": "data/epsilon_80K_x_train.npy",
-                        "y": "data/epsilon_80K_y_train.npy"
-                    },
-                    "testing":
-                    {
-                        "x": "data/epsilon_80K_x_test.npy",
-                        "y": "data/epsilon_80K_y_test.npy"
-                    }
-                }
-            ],
-            "C": 1.0,
-            "kernel": "linear"
-        },
-        {
-            "dataset": [
-                {
-                    "source": "npy",
-                    "name": "cifar",
-                    "training":
-                    {
-                        "x": "data/cifar_x_train.npy",
-                        "y": "data/cifar_y_train.npy"
-                    },
-                    "testing":
-                    {
-                        "x": "data/cifar_x_test.npy",
-                        "y": "data/cifar_y_test.npy"
-                    }
-                }
-            ],
-            "C": 1.0e-7,
-            "kernel": "linear"
-        },
-        {
-            "dataset": [
-                {
-                    "source": "npy",
-                    "name": "imb_drama",
-                    "training":
-                    {
-                        "x": "data/imb_drama_x_train.npy",
-                        "y": "data/imb_drama_y_train.npy"
-                    },
-                    "testing":
-                    {
-                        "x": "data/imb_drama_x_train.npy",
-                        "y": "data/imb_drama_y_train.npy"
-                    }
-                }
-            ],
-            "C": 1e-3,
-            "kernel": "linear"
-        },
-        {
-            "dataset": [
-                {
-                    "source": "npy",
-                    "name": "epsilon_16K",
-                    "training":
-                    {
-                        "x": "data/epsilon_16K_x_train.npy",
-                        "y": "data/epsilon_16K_y_train.npy"
-                    },
-                    "testing":
-                    {
-                        "x": "data/epsilon_16K_x_test.npy",
-                        "y": "data/epsilon_16K_y_test.npy"
-                    }
-                }
-            ],
-            "C": 9.0e2,
-            "kernel": "rbf"
-        },
-        {
-            "dataset": [
-                {
-                    "source": "npy",
-                    "name": "covtype_binary",
-                    "training":
-                    {
-                        "x": "data/covtype_binary_x_train.npy",
-                        "y": "data/covtype_binary_y_train.npy"
-                    },
-                    "testing":
-                    {
-                        "x": "data/covtype_binary_x_test.npy",
-                        "y": "data/covtype_binary_y_test.npy"
-                    }
-                }
-            ],
-            "C": 1000.0,
-            "kernel": "rbf"
-        },
-        {
-            "dataset": [
-                {
-                    "source": "npy",
-                    "name": "epsilon_80K",
-                    "training":
-                    {
-                        "x": "data/epsilon_80K_x_train.npy",
-                        "y": "data/epsilon_80K_y_train.npy"
-                    },
-                    "testing":
-                    {
-                        "x": "data/epsilon_80K_x_test.npy",
-                        "y": "data/epsilon_80K_y_test.npy"
-                    }
-                }
-            ],
-            "C": 1000.0,
-            "kernel": "rbf"
-        },
-        {
-            "dataset": [
-                {
-                    "source": "npy",
-                    "name": "imb_drama",
-                    "training":
-                    {
-                        "x": "data/imb_drama_x_train.npy",
-                        "y": "data/imb_drama_y_train.npy"
-                    },
-                    "testing":
-                    {
-                        "x": "data/imb_drama_x_train.npy",
-                        "y": "data/imb_drama_y_train.npy"
-                    }
-                }
-            ],
-            "C": 50,
-            "kernel": "rbf"
-        }
-    ]
-}
diff --git a/datasets/load_datasets.py b/datasets/load_datasets.py
index fbd7685d4..0a7874d92 100644
--- a/datasets/load_datasets.py
+++ b/datasets/load_datasets.py
@@ -22,10 +22,8 @@ from typing import Callable, Dict
 
 from .loader_classification import (a_nine_a, airline, airline_ohe, bosch,
-                                    census, cifar, codrnanorm, covtype_binary, creditcard,
-                                    epsilon_16K, epsilon_80K, epsilon, epsilon_100K,
-                                    fraud, gisette, hepmass_150K,
-                                    higgs, higgs_one_m, higgs_150K, ijcnn, imb_drama,
+                                    census, codrnanorm, creditcard, epsilon, fraud,
+                                    gisette, hepmass_150K, higgs, higgs_one_m, ijcnn,
                                     klaverjas, santander, skin_segmentation, susy)
 from .loader_multiclass import (connect, covertype, covtype, letters, mlsr,
                                 mnist, msrank, plasticc, sensit)
@@ -42,25 +40,19 @@
     "bosch": bosch,
     "california_housing": california_housing,
     "census": census,
-    "cifar": cifar,
     "codrnanorm": codrnanorm,
     "connect": connect,
-    "covtype_binary": covtype_binary,
+    "covertype": covertype,
     "covtype": covtype,
     "creditcard": creditcard,
     "epsilon": epsilon,
-    "epsilon_16K": epsilon_16K,
-    "epsilon_80K": epsilon_80K,
-    "epsilon_100K": epsilon_100K,
     "fraud": fraud,
     "fried": fried,
     "gisette": gisette,
     "hepmass_150K": hepmass_150K,
     "higgs": higgs,
     "higgs1m": higgs_one_m,
-    "higgs_150K": higgs_150K,
     "ijcnn": ijcnn,
-    "imb_drama": imb_drama,
     "klaverjas": klaverjas,
     "letters": letters,
     "mlsr": mlsr,
diff --git a/datasets/loader_classification.py b/datasets/loader_classification.py
index ffb84f12f..fc3cb892d 100644
--- a/datasets/loader_classification.py
+++ b/datasets/loader_classification.py
@@ -22,7 +22,7 @@
 
 import numpy as np
 import pandas as pd
-from sklearn.datasets import fetch_openml, load_svmlight_file, fetch_covtype
+from sklearn.datasets import fetch_openml, load_svmlight_file
 from sklearn.model_selection import train_test_split
 
 from .loader_utils import retrieve
@@ -261,41 +261,6 @@ def codrnanorm(dataset_dir: Path) -> bool:
     return True
 
 
-def covtype_binary(dataset_dir: Path) -> bool:
-    """
-    Cover type dataset from UCI machine learning repository
-    https://archive.ics.uci.edu/ml/datasets/covertype
-
-    The original 7 class labels are binarized to y = (y > 3).
-    Classification task. n_classes = 2.
-    covtype_binary X train dataset (100000, 54)
-    covtype_binary y train dataset (100000, 1)
-    covtype_binary X test dataset (100000, 54)
-    covtype_binary y test dataset (100000, 1)
-    """
-    dataset_name = 'covtype_binary'
-    os.makedirs(dataset_dir, exist_ok=True)
-
-    nrows_train, nrows_test = 100000, 100000
-    logging.info(f'Started loading {dataset_name}')
-    X, y = fetch_covtype(return_X_y=True)  # pylint: disable=unexpected-keyword-arg
-    logging.info(f'{dataset_name} is loaded, started parsing...')
-
-    y = (y > 3).astype(int)
-
-    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=77,
-                                                        train_size=nrows_train,
-                                                        test_size=nrows_test,
-                                                        shuffle=False
-                                                        )
-    for data, name in zip((X_train, X_test, y_train, y_test),
-                          ('x_train', 'x_test', 'y_train', 'y_test')):
-        filename = f'{dataset_name}_{name}.npy'
-        np.save(os.path.join(dataset_dir, filename), data)
-    logging.info(f'dataset {dataset_name} is ready.')
-    return True
-
-
 def creditcard(dataset_dir: Path) -> bool:
     """
     Classification task. n_classes = 2.
@@ -369,150 +334,6 @@ def epsilon(dataset_dir: Path) -> bool:
     return True
 
 
-def epsilon_16K(dataset_dir: Path) -> bool:
-    """
-    Epsilon dataset
-    https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html
-
-    Classification task. n_classes = 2.
-    epsilon_16K x train dataset (16000, 2000)
-    epsilon_16K y train dataset (16000, 1)
-    epsilon_16K x test dataset (16000, 2000)
-    epsilon_16K y test dataset (16000, 1)
-    """
-    dataset_name = 'epsilon_16K'
-    os.makedirs(dataset_dir, exist_ok=True)
-
-    url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \
-                '/epsilon_normalized.bz2'
-    url_test = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \
-               '/epsilon_normalized.t.bz2'
-    local_url_train = os.path.join(dataset_dir, os.path.basename(url_train))
-    local_url_test = os.path.join(dataset_dir, os.path.basename(url_test))
-
-    num_train, num_test, dtype = 16000, 16000, np.float32
-    if not os.path.isfile(local_url_train):
-        logging.info(f'Started loading {dataset_name}, train')
-        retrieve(url_train, local_url_train)
-    if not os.path.isfile(local_url_test):
-        logging.info(f'Started loading {dataset_name}, test')
-        retrieve(url_test, local_url_test)
-    logging.info(f'{dataset_name} is loaded, started parsing...')
-    X_train, y_train = load_svmlight_file(local_url_train,
-                                          dtype=dtype)
-    X_test, y_test = load_svmlight_file(local_url_test,
-                                        dtype=dtype)
-    X_train = X_train.toarray()[:num_train]
-    X_test = X_test.toarray()[:num_test]
-    y_train = y_train[:num_train]
-    y_train[y_train <= 0] = 0
-    y_test = y_test[:num_test]
-    y_test[y_test <= 0] = 0
-
-    for data, name in zip((X_train, X_test, y_train, y_test),
-                          ('x_train', 'x_test', 'y_train', 'y_test')):
-        filename = f'{dataset_name}_{name}.npy'
-        np.save(os.path.join(dataset_dir, filename), data)
-    logging.info(f'dataset {dataset_name} is ready.')
-    return True
-
-
-def epsilon_100K(dataset_dir: Path) -> bool:
-    """
-    Epsilon dataset
-    https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html
-
-    Classification task. n_classes = 2.
-    epsilon_100K x train dataset (50000, 2000)
-    epsilon_100K y train dataset (50000, 1)
-    epsilon_100K x test dataset (50000, 2000)
-    epsilon_100K y test dataset (50000, 1)
-    """
-    dataset_name = 'epsilon_100K'
-    os.makedirs(dataset_dir, exist_ok=True)
-
-    url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \
-                '/epsilon_normalized.bz2'
-    url_test = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \
-               '/epsilon_normalized.t.bz2'
-    local_url_train = os.path.join(dataset_dir, os.path.basename(url_train))
-    local_url_test = os.path.join(dataset_dir, os.path.basename(url_test))
-
-    num_train, num_test, dtype = 50000, 50000, np.float32
-    if not os.path.isfile(local_url_train):
-        logging.info(f'Started loading {dataset_name}, train')
-        retrieve(url_train, local_url_train)
-    if not os.path.isfile(local_url_test):
-        logging.info(f'Started loading {dataset_name}, test')
-        retrieve(url_test, local_url_test)
-    logging.info(f'{dataset_name} is loaded, started parsing...')
-    X_train, y_train = load_svmlight_file(local_url_train,
-                                          dtype=dtype)
-    X_test, y_test = load_svmlight_file(local_url_test,
-                                        dtype=dtype)
-    X_train = X_train.toarray()[:num_train]
-    X_test = X_test.toarray()[:num_test]
-    y_train = y_train[:num_train]
-    y_train[y_train <= 0] = 0
-    y_test = y_test[:num_test]
-    y_test[y_test <= 0] = 0
-
-    for data, name in zip((X_train, X_test, y_train, y_test),
-                          ('x_train', 'x_test', 'y_train', 'y_test')):
-        filename = f'{dataset_name}_{name}.npy'
-        np.save(os.path.join(dataset_dir, filename), data)
-    logging.info(f'dataset {dataset_name} is ready.')
-    return True
-
-
-def epsilon_80K(dataset_dir: Path) -> bool:
-    """
-    Epsilon dataset
-    https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html
-
-    Classification task. n_classes = 2.
-    epsilon_80K x train dataset (80000, 2000)
-    epsilon_80K y train dataset (80000, 1)
-    epsilon_80K x test dataset (80000, 2000)
-    epsilon_80K y test dataset (80000, 1)
-    """
-    dataset_name = 'epsilon_80K'
-    os.makedirs(dataset_dir, exist_ok=True)
-
-    url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \
-                '/epsilon_normalized.bz2'
-    url_test = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \
-               '/epsilon_normalized.t.bz2'
-    local_url_train = os.path.join(dataset_dir, os.path.basename(url_train))
-    local_url_test = os.path.join(dataset_dir, os.path.basename(url_test))
-
-    num_train, num_test, dtype = 80000, 80000, np.float32
-    if not os.path.isfile(local_url_train):
-        logging.info(f'Started loading {dataset_name}, train')
-        retrieve(url_train, local_url_train)
-    if not os.path.isfile(local_url_test):
-        logging.info(f'Started loading {dataset_name}, test')
-        retrieve(url_test, local_url_test)
-    logging.info(f'{dataset_name} is loaded, started parsing...')
-    X_train, y_train = load_svmlight_file(local_url_train,
-                                          dtype=dtype)
-    X_test, y_test = load_svmlight_file(local_url_test,
-                                        dtype=dtype)
-    X_train = X_train.toarray()[:num_train]
-    X_test = X_test.toarray()[:num_test]
-    y_train = y_train[:num_train]
-    y_train[y_train <= 0] = 0
-    y_test = y_test[:num_test]
-    y_test[y_test <= 0] = 0
-
-    for data, name in zip((X_train, X_test, y_train, y_test),
-                          ('x_train', 'x_test', 'y_train', 'y_test')):
-        filename = f'{dataset_name}_{name}.npy'
-        np.save(os.path.join(dataset_dir, filename), data)
-    logging.info(f'dataset {dataset_name} is ready.')
-    return True
-
-
 def fraud(dataset_dir: Path) -> bool:
     """
     Credit Card Fraud Detection contest
@@ -755,46 +576,6 @@ def higgs_one_m(dataset_dir: Path) -> bool:
     return True
 
 
-def higgs_150K(dataset_dir: Path) -> bool:
-    """
-    Higgs dataset from UCI machine learning repository
-    https://archive.ics.uci.edu/ml/datasets/HIGGS
-
-    Classification task. n_classes = 2.
-    higgs_150K X train dataset (100000, 28)
-    higgs_150K y train dataset (100000, 1)
-    higgs_150K X test dataset (50000, 28)
-    higgs_150K y test dataset (50000, 1)
-    """
-    dataset_name = 'higgs_150K'
-    os.makedirs(dataset_dir, exist_ok=True)
-
-    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz'
-    local_url = os.path.join(dataset_dir, os.path.basename(url))
-    if not os.path.isfile(local_url):
-        logging.info(f'Started loading {dataset_name}')
-        retrieve(url, local_url)
-    logging.info(f'{dataset_name} is loaded, started parsing...')
-
-    nrows_train, nrows_test, dtype = 100000, 50000, np.float32
-    data: Any = pd.read_csv(local_url, delimiter=",", header=None,
-                            compression="gzip", dtype=dtype,
-                            nrows=nrows_train + nrows_test)
-
-    X = data[data.columns[1:]]
-    y = data[data.columns[0:1]]
-
-    x_train, x_test, y_train, y_test = train_test_split(
-        X, y, train_size=nrows_train, test_size=nrows_test, shuffle=False)
-
-    for data, name in zip((x_train, x_test, y_train, y_test),
-                          ('x_train', 'x_test', 'y_train', 'y_test')):
-        filename = f'{dataset_name}_{name}.npy'
-        np.save(os.path.join(dataset_dir, filename), data)
-    logging.info(f'dataset {dataset_name} is ready.')
-    return True
-
-
 def ijcnn(dataset_dir: Path) -> bool:
     """
     Author: Danil Prokhorov.
@@ -830,28 +611,6 @@ def ijcnn(dataset_dir: Path) -> bool:
     logging.info(f'dataset {dataset_name} is ready.')
     return True
 
-def imb_drama(dataset_dir: Path) -> bool:
-    """
-    IMDB.drama dataset from OpenML Datasets (
-    https://www.openml.org/d/273)
-
-    Classification task.
-    Number of features: 1001
-    Number of instances: 120919
-    """
-    dataset_name = 'imb_drama'
-    os.makedirs(dataset_dir, exist_ok=True)
-
-    x_train, y_train = fetch_openml('IMDB.drama', return_X_y=True,
-                                    as_frame=False, data_home=dataset_dir)
-    logging.info(f'{dataset_name} is loaded, started parsing...')
-    for data, name in zip((x_train.todense(), y_train),
-                          ('x_train', 'y_train')):
-        filename = f'{dataset_name}_{name}.npy'
-        np.save(os.path.join(dataset_dir, filename), data)
-    logging.info(f'dataset {dataset_name} is ready.')
-    return True
-
 
 def klaverjas(dataset_dir: Path) -> bool:
     """
@@ -967,49 +726,3 @@ def susy(dataset_dir: Path) -> bool:
         np.save(os.path.join(dataset_dir, filename), data)
     logging.info(f'dataset {dataset_name} is ready.')
     return True
-
-
-def cifar(dataset_dir: Path) -> bool:
-    """
-    CIFAR-10 dataset from LIBSVM Datasets (original source:
-    https://www.cs.toronto.edu/~kriz/cifar.html)
-    TaskType: Classification
-    cifar x train dataset (50000, 3072)
-    cifar y train dataset (50000, 1)
-    cifar x test dataset (10000, 3072)
-    cifar y test dataset (10000, 1)
-    """
-    dataset_name = 'cifar'
-    os.makedirs(dataset_dir, exist_ok=True)
-
-    url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/cifar10.bz2'
-    url_test = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/cifar10.t.bz2'
-    local_url_train = os.path.join(dataset_dir, os.path.basename(url_train))
-    local_url_test = os.path.join(dataset_dir, os.path.basename(url_test))
-
-    if not os.path.isfile(local_url_train):
-        logging.info(f'Started loading {dataset_name}, train')
-        retrieve(url_train, local_url_train)
-    logging.info(f'{dataset_name} is loaded, started parsing...')
-    x_train, y_train = load_svmlight_file(local_url_train,
-                                          dtype=np.float32)
-
-    if not os.path.isfile(local_url_test):
-        logging.info(f'Started loading {dataset_name}, test')
-        retrieve(url_test, local_url_test)
-    logging.info(f'{dataset_name} is loaded, started parsing...')
-    x_test, y_test = load_svmlight_file(local_url_test,
-                                        dtype=np.float32)
-
-    x_train = x_train.toarray()
-    y_train[y_train <= 0] = 0
-
-    x_test = x_test.toarray()
-    y_test[y_test <= 0] = 0
-
-    for data, name in zip((x_train, x_test, y_train, y_test),
-                          ('x_train', 'x_test', 'y_train', 'y_test')):
-        filename = f'{dataset_name}_{name}.npy'
-        np.save(os.path.join(dataset_dir, filename), data)
-    return True
-

From 669f82e3372c2dc9d40d157e8d745531981b0776 Mon Sep 17 00:00:00 2001
From: dmitrii-kriukov
Date: Wed, 19 Jan 2022 12:02:00 +0300
Subject: [PATCH 5/7] revert changing skl config

---
 configs/skl_config.json | 71 ++---------------------------------------
 1 file changed, 3 insertions(+), 68 deletions(-)

diff --git a/configs/skl_config.json b/configs/skl_config.json
index 486177949..f3f1fa93f 100644
--- a/configs/skl_config.json
+++ b/configs/skl_config.json
@@ -19,7 +19,6 @@
                     }
                 }
             ],
-            "workload-size": "small",
             "time-method": "box_filter",
             "time-limit": 50,
             "n-clusters": 1000,
@@ -39,7 +38,6 @@
                     }
                 }
             ],
-            "workload-size": "small",
             "time-method": "box_filter",
             "time-limit": 50,
             "n-clusters": 5,
@@ -49,7 +47,6 @@
         },
         {
             "algorithm": "kmeans",
-            "workload-size": "small",
             "dataset": [
                 {
                     "source": "synthetic",
@@ -69,7 +66,6 @@
         },
         {
             "algorithm": "pca",
-            "workload-size": "small",
             "dataset": [
                 {
                     "source": "synthetic",
@@ -117,7 +113,6 @@
         {
             "algorithm": "df_clsf",
             "dtype": "float32",
-            "workload-size": "small",
             "dataset": [
                 {
                     "source": "npy",
@@ -155,7 +150,6 @@
         },
         {
            "algorithm": "df_regr",
-            "workload-size": "small",
             "dtype": "float32",
             "dataset": [
                 {
@@ -190,7 +184,6 @@
         },
         {
             "algorithm": "ridge",
-            "workload-size": "small",
             "dataset": [
                 {
                     "source": "synthetic",
@@ -213,7 +206,6 @@
         },
         {
             "algorithm": "linear",
-            "workload-size": "small",
             "dataset": [
                 {
                     "source": "synthetic",
@@ -235,7 +227,6 @@
         },
         {
             "algorithm": "log_reg",
-            "workload-size": "small",
             "dataset": [
                 {
                     "source": "synthetic",
@@ -279,7 +270,6 @@
         },
         {
             "algorithm": "svm",
-            "workload-size": "medium",
             "dataset": [
                 {
                     "source": "npy",
@@ -301,7 +291,6 @@
         },
         {
             "algorithm": "svm",
-            "workload-size": "small",
             "dataset": [
                 {
                     "source": "npy",
@@ -323,7 +312,6 @@
         },
         {
             "algorithm": "svm",
-            "workload-size": "medium",
             "dataset": [
                 {
                     "source": "npy",
@@ -345,7 +333,6 @@
         },
         {
             "algorithm": "svm",
-            "workload-size": "medium",
             "dataset": [
                 {
                     "source": "npy",
@@ -367,7 +354,6 @@
         },
         {
             "algorithm": "svm",
-            "workload-size": "medium",
             "dataset": [
                 {
                     "source": "npy",
@@ -389,7 +375,6 @@
         },
         {
             "algorithm": "svm",
-            "workload-size": "large",
             "dataset": [
                 {
                     "source": "npy",
@@ -411,7 +396,6 @@
         },
         {
             "algorithm": "nusvc",
-            "workload-size": "small",
             "dataset": [
                 {
                     "source": "npy",
@@ -433,7 +417,6 @@
         },
         {
             "algorithm": "nusvc",
-            "workload-size": "large",
             "dataset": [
                 {
                     "source": "npy",
@@ -455,7 +438,6 @@
         },
         {
             "algorithm": "nusvc",
-            "workload-size": "medium",
             "dataset": [
                 {
                     "source": "npy",
@@ -477,7 +459,6 @@
         },
         {
             "algorithm": "nusvc",
-            "workload-size": "medium",
             "dataset": [
                 {
                     "source": "npy",
@@ -499,7 +480,6 @@
         },
         {
             "algorithm": "svr",
-            "workload-size": "small",
             "dataset": [
                 {
                     "source": "npy",
@@ -521,7 +501,6 @@
         },
         {
             "algorithm": "svr",
-            "workload-size": "medium",
             "dataset": [
                 {
                     "source": "npy",
@@ -543,7 +522,6 @@
         },
         {
             "algorithm": "nusvr",
-            "workload-size": "small",
             "dataset": [
                 {
                     "source": "npy",
@@ -566,7 +544,6 @@
         },
         {
             "algorithm": "nusvr",
-            "workload-size": "medium",
             "dataset": [
                 {
                     "source": "npy",
@@ -590,7 +567,6 @@
         },
         {
             "algorithm": "nusvr",
-            "workload-size": "large",
             "dataset": [
                 {
                     "source": "npy",
@@ -613,7 +589,6 @@
         },
         {
             "algorithm": "dbscan",
-            "workload-size": "small",
             "dataset": [
                 {
                     "source": "synthetic",
@@ -632,13 +607,7 @@
                     "training": {
                         "n_samples": 500000
                     }
-                }
-            ]
-        },
-        {
-            "algorithm": "dbscan",
-            "workload-size": "medium",
-            "dataset": [
+                },
                 {
                     "source": "synthetic",
                     "type": "blobs",
@@ -652,7 +621,6 @@
         },
         {
             "algorithm": "knn_clsf",
-            "workload-size": "small",
             "dtype": "float32",
             "dataset": [
                 {
@@ -690,15 +658,7 @@
                     "testing": {
                         "n_samples": 20000
                     }
-                }
-            ],
-            "method": ["brute", "kd_tree"]
-        },
-        {
-            "algorithm": "knn_clsf",
-            "workload-size": "small",
-            "dtype": "float32",
-            "dataset": [
+                },
                 {
                     "source": "synthetic",
                     "type": "classification",
@@ -712,31 +672,10 @@
                     }
                 }
             ],
-            "method": "kd_tree"
-        },
-        {
-            "algorithm": "knn_clsf",
-            "workload-size": "medium",
-            "dtype": "float32",
-            "dataset": [
-                {
-                    "source": "synthetic",
-                    "type": "classification",
-                    "n_classes": 10,
-                    "n_features": 16,
-                    "training": {
-                        "n_samples": 250000
-                    },
-                    "testing": {
-                        "n_samples": 250000
-                    }
-                }
-            ],
-            "method": "brute"
+            "method": ["brute", "kd_tree"]
         },
         {
             "algorithm": "train_test_split",
-            "workload-size": "small",
             "dataset": [
                 {
                     "source": "synthetic",
@@ -772,7 +711,6 @@
         },
         {
             "algorithm": "train_test_split",
-            "workload-size": "small",
             "dataset": [
                 {
                     "source": "npy",
@@ -792,7 +730,6 @@
         },
         {
             "algorithm": "lasso",
-            "workload-size": "small",
             "dataset": [
                 {
                     "source": "npy",
@@ -809,7 +746,6 @@
         },
         {
             "algorithm": "elasticnet",
-            "workload-size": "small",
             "dataset": [
                 {
                     "source": "npy",
@@ -832,7 +768,6 @@
         },
         {
             "algorithm": "tsne",
-            "workload-size": "large",
             "dataset": [
                 {
                     "source": "npy",

From 08a66ee845aa03b6b951fd9d182074e0ec35ca9d Mon Sep 17 00:00:00 2001
From: dmitrii-kriukov
Date: Wed, 19 Jan 2022 12:11:21 +0300
Subject: [PATCH 6/7] sizes for df_clsf

---
 configs/xpu/df_clsf.json | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/configs/xpu/df_clsf.json b/configs/xpu/df_clsf.json
index 4d14763b8..0c7c25d70 100644
--- a/configs/xpu/df_clsf.json
+++ b/configs/xpu/df_clsf.json
@@ -26,6 +26,7 @@
                     }
                 }
             ],
+            "workload-size": "medium",
             "num-trees": 10,
             "max-depth": 5
         },
@@ -46,6 +47,7 @@
                     }
                 }
             ],
+            "workload-size": "large",
             "num-trees": 100,
             "max-depth": 8
         },
@@ -66,6 +68,7 @@
                     }
                 }
             ],
+            "workload-size": "medium",
             "num-trees": 20,
             "max-depth": 16
         },
@@ -86,6 +89,7 @@
                     }
                 }
             ],
+            "workload-size": "large",
             "num-trees": 100,
             "max-depth": 10
         },
@@ -106,6 +110,7 @@
                     }
                 }
             ],
+            "workload-size": "medium",
             "num-trees": 50,
             "max-depth": 15
         }

From caf58888915ff527d3b736c39678c345d7618169 Mon Sep 17 00:00:00 2001
From: dmitrii-kriukov
Date: Wed, 19 Jan 2022 14:41:03 +0300
Subject: [PATCH 7/7] update help info

---
 runner.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/runner.py b/runner.py
index 130b3c01d..f030a76b5 100755
--- a/runner.py
+++ b/runner.py
@@ -60,7 +60,9 @@ def get_configs(path: Path) -> List[str]:
                              'make sure to add the dtype parameter to the config file ')
     parser.add_argument('--workload-size', type=str, default="small medium large", nargs='+',
                         choices=("small", "medium", "large"),
-                        help='Available dataset sizes')
+                        help='Available workload sizes; '
+                             'make sure to add the workload-size parameter to the config file. '
+                             'Unmarked workloads will be launched anyway')
     parser.add_argument('--no-intel-optimized', default=False, action='store_true',
                         help='Use Scikit-learn without Intel optimizations')
     parser.add_argument('--output-file', default='results.json',
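With the series applied, runner.py keeps or skips each benchmark case as sketched below. This is a condensed, standalone rendition of the PATCH 1/2/7 hunks: the argparse flag and the filtering block are taken from the patches, while the config parsing is replaced by inline stand-in data:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--workload-size', type=str, default="small medium large",
                    nargs='+', choices=("small", "medium", "large"),
                    help='Available workload sizes')
args = parser.parse_args()

common_params = {'lib': 'sklearn'}  # stand-in for the config's "common" block
cases = [
    {'algorithm': 'svm', 'workload-size': 'large'},  # runs only when "large" is allowed
    {'algorithm': 'pca'},                            # unmarked case: always runs
]

for params_set in cases:
    params = common_params.copy()
    params.update(params_set.copy())

    if 'workload-size' in params:
        # With an explicit --workload-size, args.workload_size is a list and this
        # is list membership; with the default it is the single string
        # "small medium large", so the check degrades to substring matching.
        # Both accept the three valid sizes, so the default admits everything.
        if params['workload-size'] not in args.workload_size:
            continue
        del params['workload-size']

    print('would run:', params)

Cases whose config carries no workload-size key never reach the filter, which matches the final help text: unmarked workloads are launched regardless of the flag.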