Skip to content

Commit aa4705e

Browse files
Configs for xpu: dbscan, kmeans, pca (#105)
* add new cases & fix * susy, disable float32, fixes * knn_svm * fix susy_cluster * fix datasets * add cifar * fix road network * epsilon dataset uses only train * fix epsilon * delete trash * pep8 * cifar binary * disable imb_drama * C-style format * C-style matrix * Latin C * fix warning with higgs * revert * cifar binary * indent * pandas version
1 parent 14cf153 commit aa4705e

File tree

7 files changed

+622
-4
lines changed

7 files changed

+622
-4
lines changed

configs/xpu/dbscan.json

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
{
2+
"common": {
3+
"lib": "sklearn",
4+
"algorithm": "dbscan",
5+
"data-format": "pandas",
6+
"data-order": "F",
7+
"dtype": ["float64"],
8+
"device": ["host", "cpu", "gpu", "none"]
9+
},
10+
"cases": [
11+
{
12+
"dataset": [
13+
{
14+
"source": "npy",
15+
"name": "hepmass_10K_cluster",
16+
"training":
17+
{
18+
"x": "data/hepmass_10K_cluster.npy"
19+
}
20+
}
21+
],
22+
"eps": 5,
23+
"min-samples": 3
24+
},
25+
{
26+
"dataset": [
27+
{
28+
"source": "npy",
29+
"name": "mnist_10K_cluster",
30+
"training":
31+
{
32+
"x": "data/mnist_10K_cluster.npy"
33+
}
34+
}
35+
],
36+
"eps": 1.7e3,
37+
"min-samples": 3
38+
},
39+
{
40+
"dataset": [
41+
{
42+
"source": "npy",
43+
"name": "road_network_20K_cluster",
44+
"training":
45+
{
46+
"x": "data/road_network_20K_cluster.npy"
47+
}
48+
}
49+
],
50+
"eps": 1.0e3,
51+
"min-samples": 220
52+
}
53+
]
54+
}

configs/xpu/kmeans.json

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
{
2+
"common": {
3+
"lib": "sklearn",
4+
"algorithm": "kmeans",
5+
"data-format": "pandas",
6+
"data-order": "C",
7+
"dtype": ["float64"],
8+
"device": ["host", "cpu", "gpu", "none"]
9+
},
10+
"cases": [
11+
{
12+
"dataset": [
13+
{
14+
"source": "npy",
15+
"name": "higgs_one_m_clustering",
16+
"training":
17+
{
18+
"x": "data/higgs_one_m_clustering.npy"
19+
}
20+
}
21+
],
22+
"n-clusters": 10,
23+
"maxiter": 100
24+
},
25+
{
26+
"dataset": [
27+
{
28+
"source": "npy",
29+
"name": "higgs_one_m_clustering",
30+
"training":
31+
{
32+
"x": "data/higgs_one_m_clustering.npy"
33+
}
34+
}
35+
],
36+
"n-clusters": [100, 250],
37+
"maxiter": 10
38+
},
39+
{
40+
"dataset": [
41+
{
42+
"source": "npy",
43+
"name": "epsilon_50K_cluster",
44+
"training":
45+
{
46+
"x": "data/epsilon_50K_cluster.npy"
47+
}
48+
}
49+
],
50+
"n-clusters": [512, 1024, 2048],
51+
"maxiter": 10
52+
},
53+
{
54+
"dataset": [
55+
{
56+
"source": "npy",
57+
"name": "hepmass_1M_cluster",
58+
"training":
59+
{
60+
"x": "data/hepmass_1M_cluster.npy"
61+
}
62+
}
63+
],
64+
"n-clusters": [100, 250],
65+
"maxiter": 10
66+
},
67+
{
68+
"dataset": [
69+
{
70+
"source": "npy",
71+
"name": "hepmass_1M_cluster",
72+
"training":
73+
{
74+
"x": "data/hepmass_1M_cluster.npy"
75+
}
76+
}
77+
],
78+
"n-clusters": 10,
79+
"maxiter": 100
80+
},
81+
{
82+
"dataset": [
83+
{
84+
"source": "npy",
85+
"name": "susy_cluster",
86+
"training":
87+
{
88+
"x": "data/susy_cluster.npy"
89+
}
90+
}
91+
],
92+
"n-clusters": 10,
93+
"maxiter": 100
94+
},
95+
{
96+
"dataset": [
97+
{
98+
"source": "npy",
99+
"name": "susy_cluster",
100+
"training":
101+
{
102+
"x": "data/susy_cluster.npy"
103+
}
104+
}
105+
],
106+
"n-clusters": [100, 250],
107+
"maxiter": 10
108+
},
109+
{
110+
"dataset": [
111+
{
112+
"source": "npy",
113+
"name": "cifar_cluster",
114+
"training":
115+
{
116+
"x": "data/cifar_cluster.npy"
117+
}
118+
}
119+
],
120+
"n-clusters": [512, 1024, 2048],
121+
"maxiter": 10
122+
}
123+
]
124+
}

configs/xpu/pca.json

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
{
2+
"common": {
3+
"lib": "sklearn",
4+
"algorithm": "pca",
5+
"data-format": "pandas",
6+
"data-order": "F",
7+
"dtype": ["float64"],
8+
"device": ["host", "cpu", "none"]
9+
},
10+
"cases": [
11+
{
12+
"dataset": [
13+
{
14+
"source": "npy",
15+
"name": "higgs1m",
16+
"training":
17+
{
18+
"x": "data/higgs1m_x_train.npy",
19+
"y": "data/higgs1m_y_train.npy"
20+
}
21+
}
22+
]
23+
},
24+
{
25+
"device": ["host", "cpu", "gpu", "none"],
26+
"dataset": [
27+
{
28+
"source": "npy",
29+
"name": "epsilon_30K",
30+
"training":
31+
{
32+
"x": "data/epsilon_30K_x_train.npy",
33+
"y": "data/epsilon_30K_y_train.npy"
34+
}
35+
}
36+
]
37+
},
38+
{
39+
"dataset": [
40+
{
41+
"source": "npy",
42+
"name": "cifar_binary",
43+
"training":
44+
{
45+
"x": "data/cifar_binary_x_train.npy",
46+
"y": "data/cifar_binary_y_train.npy"
47+
}
48+
}
49+
]
50+
}
51+
]
52+
}

datasets/load_datasets.py

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,14 +22,18 @@
2222
from typing import Callable, Dict
2323

2424
from .loader_classification import (a_nine_a, airline, airline_ohe, bosch,
25-
census, codrnanorm, creditcard, epsilon, fraud,
26-
gisette, hepmass_150K, higgs, higgs_one_m, ijcnn,
27-
klaverjas, santander, skin_segmentation, susy)
25+
census, codrnanorm, cifar_binary, creditcard, epsilon,
26+
epsilon_30K, fraud, gisette, hepmass_150K, higgs,
27+
higgs_one_m, ijcnn, klaverjas, santander, skin_segmentation,
28+
susy)
2829
from .loader_multiclass import (connect, covertype, covtype, letters, mlsr,
2930
mnist, msrank, plasticc, sensit)
3031
from .loader_regression import (abalone, california_housing, fried, higgs_10500K,
3132
medical_charges_nominal, mortgage_first_q,
3233
twodplanes, year_prediction_msd, yolanda, airline_regression)
34+
from .loader_clustering import (cifar_cluster, epsilon_50K_cluster, higgs_one_m_clustering,
35+
hepmass_1M_cluster, hepmass_10K_cluster, mnist_10K_cluster,
36+
road_network_20K_cluster, susy_cluster)
3337

3438
dataset_loaders: Dict[str, Callable[[Path], bool]] = {
3539
"a9a": a_nine_a,
@@ -40,31 +44,41 @@
4044
"bosch": bosch,
4145
"california_housing": california_housing,
4246
"census": census,
47+
"cifar_binary": cifar_binary,
48+
"cifar_cluster": cifar_cluster,
4349
"codrnanorm": codrnanorm,
4450
"connect": connect,
4551
"covertype": covertype,
4652
"covtype": covtype,
4753
"creditcard": creditcard,
4854
"epsilon": epsilon,
55+
"epsilon_30K": epsilon_30K,
56+
"epsilon_50K_cluster": epsilon_50K_cluster,
4957
"fraud": fraud,
5058
"fried": fried,
5159
"gisette": gisette,
5260
"hepmass_150K": hepmass_150K,
61+
"hepmass_1M_cluster": hepmass_1M_cluster,
62+
"hepmass_10K_cluster": hepmass_10K_cluster,
5363
"higgs": higgs,
5464
"higgs1m": higgs_one_m,
5565
"higgs_10500K": higgs_10500K,
66+
"higgs_one_m_clustering": higgs_one_m_clustering,
5667
"ijcnn": ijcnn,
5768
"klaverjas": klaverjas,
5869
"letters": letters,
5970
"mlsr": mlsr,
6071
"medical_charges_nominal": medical_charges_nominal,
6172
"mnist": mnist,
73+
"mnist_10K_cluster": mnist_10K_cluster,
6274
"mortgage1Q": mortgage_first_q,
6375
"msrank": msrank,
6476
"plasticc": plasticc,
77+
"road_network_20K_cluster": road_network_20K_cluster,
6578
"santander": santander,
6679
"sensit": sensit,
6780
"skin_segmentation": skin_segmentation,
81+
"susy_cluster": susy_cluster,
6882
"susy": susy,
6983
"twodplanes": twodplanes,
7084
"year_prediction_msd": year_prediction_msd,

datasets/loader_classification.py

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -334,6 +334,40 @@ def epsilon(dataset_dir: Path) -> bool:
334334
return True
335335

336336

337+
def epsilon_30K(dataset_dir: Path) -> bool:
    """
    Epsilon dataset
    https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html

    Classification task. n_classes = 2.
    epsilon_30K x train dataset (30000, 2000)
    epsilon_30K y train dataset (30000,)

    Downloads the raw archive into *dataset_dir* (skipped when already
    cached), keeps the first 30000 samples and saves them as .npy files.
    Returns True on success.
    """
    dataset_name = 'epsilon_30K'
    os.makedirs(dataset_dir, exist_ok=True)

    url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \
                '/epsilon_normalized.bz2'
    local_url_train = os.path.join(dataset_dir, os.path.basename(url_train))

    num_train, dtype = 30000, np.float32
    # Download only when the archive is not already cached locally.
    if not os.path.isfile(local_url_train):
        logging.info(f'Started loading {dataset_name}, train')
        retrieve(url_train, local_url_train)
    logging.info(f'{dataset_name} is loaded, started parsing...')
    X_train, y_train = load_svmlight_file(local_url_train,
                                          dtype=dtype)
    # Keep only the first num_train samples; densifying the full dataset
    # would be far larger than this benchmark config needs.
    X_train = X_train.toarray()[:num_train]
    y_train = y_train[:num_train]

    for data, name in zip((X_train, y_train),
                          ('x_train', 'y_train')):
        filename = f'{dataset_name}_{name}.npy'
        np.save(os.path.join(dataset_dir, filename), data)
    logging.info(f'dataset {dataset_name} is ready.')
    return True
369+
370+
337371
def fraud(dataset_dir: Path) -> bool:
338372
"""
339373
Credit Card Fraud Detection contest
@@ -688,6 +722,51 @@ def skin_segmentation(dataset_dir: Path) -> bool:
688722
return True
689723

690724

725+
def cifar_binary(dataset_dir: Path) -> bool:
    """
    Cifar dataset from LIBSVM Datasets (
    https://www.cs.toronto.edu/~kriz/cifar.html#cifar)
    TaskType: Classification
    cifar_binary x train dataset (50000, 3072)
    cifar_binary y train dataset (50000,)
    cifar_binary x test dataset (10000, 3072)
    cifar_binary y test dataset (10000,)

    The original 10-class labels are binarized (class 0 vs. the rest).
    Downloads the raw archives into *dataset_dir* (skipped when already
    cached) and saves the parsed splits as .npy files. Returns True on
    success.
    """
    dataset_name = 'cifar_binary'
    os.makedirs(dataset_dir, exist_ok=True)

    url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/cifar10.bz2'
    url_test = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/cifar10.t.bz2'
    local_url_train = os.path.join(dataset_dir, os.path.basename(url_train))
    local_url_test = os.path.join(dataset_dir, os.path.basename(url_test))

    # Download only when the archives are not already cached locally.
    if not os.path.isfile(local_url_train):
        logging.info(f'Started loading {dataset_name}, train')
        retrieve(url_train, local_url_train)
    logging.info(f'{dataset_name} is loaded, started parsing...')
    x_train, y_train = load_svmlight_file(local_url_train,
                                          dtype=np.float32)

    if not os.path.isfile(local_url_test):
        logging.info(f'Started loading {dataset_name}, test')
        retrieve(url_test, local_url_test)
    logging.info(f'{dataset_name} is loaded, started parsing...')
    x_test, y_test = load_svmlight_file(local_url_test,
                                        dtype=np.float32)

    # Densify features and binarize labels: any class > 0 becomes 1.
    x_train = x_train.toarray()
    y_train = (y_train > 0).astype(int)

    x_test = x_test.toarray()
    y_test = (y_test > 0).astype(int)

    for data, name in zip((x_train, x_test, y_train, y_test),
                          ('x_train', 'x_test', 'y_train', 'y_test')):
        filename = f'{dataset_name}_{name}.npy'
        np.save(os.path.join(dataset_dir, filename), data)
    # Final readiness log, consistent with the other loaders in this module.
    logging.info(f'dataset {dataset_name} is ready.')
    return True
768+
769+
691770
def susy(dataset_dir: Path) -> bool:
692771
"""
693772
SUSY dataset from UCI machine learning repository (

0 commit comments

Comments
 (0)