Added links and descriptions for new datasets

Igor Rukhovich · Igor Rukhovich · commit 6e4742363777 · 2021-04-15T09:42:57.000+03:00
diff --git a/configs/xgb_cpu_config.json b/configs/xgb_cpu_config.json
@@ -173,11 +173,6 @@
                     {
                         "x":    "data/plasticc_x_train.csv",
                         "y":    "data/plasticc_y_train.csv"
-                    },
-                    "testing":
-                    {
-                        "x":    "data/plasticc_x_test.csv",
-                        "y":    "data/plasticc_y_test.csv"
                     }
                 }
             ],
diff --git a/configs/xgb_gpu_config.json b/configs/xgb_gpu_config.json
@@ -77,18 +77,18 @@
                     }
                 }
             ],
-            "reg-alpha":                                0.9,
-            "max-bin":                                  256,
-            "scale-pos-weight":                         2,
-            "learning-rate":                            0.1,
-            "subsample":                                1,
-            "reg-lambda":                               1,
-            "min-child-weight":                         0,
-            "max-depth":                                8,
-            "max-leaves":                               256,
-            "n-estimators":                             1000,
-            "objective":                                "binary:logistic",
-            "inplace-predict":                          ""
+            "reg-alpha":        0.9,
+            "max-bin":          256,
+            "scale-pos-weight": 2,
+            "learning-rate":    0.1,
+            "subsample":        1,
+            "reg-lambda":       1,
+            "min-child-weight": 0,
+            "max-depth":        8,
+            "max-leaves":       256,
+            "n-estimators":     1000,
+            "objective":        "binary:logistic",
+            "inplace-predict":  ""
         },
         {
             "dataset": [
@@ -107,10 +107,10 @@
                     }
                 }
             ],
-            "learning-rate":    0.03,
-            "max-depth":        6,
-            "n-estimators":     1000,
-            "objective":        "multi:softprob"
+            "learning-rate":0.03,
+            "max-depth":    6,
+            "n-estimators": 1000,
+            "objective":    "multi:softprob"
         },
         {
             "dataset": [
@@ -152,15 +152,15 @@
                     }
                 }
             ],
-            "max-bin":                      256,
-            "learning-rate":                0.3,
-            "subsample":                    1,
-            "reg-lambda":                   2,
-            "min-child-weight":             1,
-            "min-split-loss":               0.1,
-            "max-depth":                    8,
-            "n-estimators":                 200,
-            "objective":                    "multi:softprob"
+            "max-bin":          256,
+            "learning-rate":    0.3,
+            "subsample":        1,
+            "reg-lambda":       2,
+            "min-child-weight": 1,
+            "min-split-loss":   0.1,
+            "max-depth":        8,
+            "n-estimators":     200,
+            "objective":        "multi:softprob"
         },
         {
             "dataset": [
@@ -171,11 +171,6 @@
                     {
                         "x":    "data/plasticc_x_train.csv",
                         "y":    "data/plasticc_y_train.csv"
-                    },
-                    "testing":
-                    {
-                        "x":    "data/plasticc_x_test.csv",
-                        "y":    "data/plasticc_y_test.csv"
                     }
                 }
             ],
@@ -197,12 +192,12 @@
                     }
                 }
             ],
-            "n-estimators":                 10000,
-            "objective":                    "binary:logistic",
-            "max-depth":                    1,
-            "subsample":                    0.5,
-            "eta":                          0.1,
-            "colsample-bytree":             0.05
+            "n-estimators":     10000,
+            "objective":        "binary:logistic",
+            "max-depth":        1,
+            "subsample":        0.5,
+            "eta":              0.1,
+            "colsample-bytree": 0.05
         }
     ]
 }
diff --git a/datasets/loader_clf.py b/datasets/loader_clf.py
@@ -32,7 +32,7 @@ def a_nine_a(dataset_dir: Path) -> bool:
     Author: Ronny Kohavi","Barry Becker
     libSVM","AAD group
     Source: original - Date unknown
-    Cite: http://archive.ics.uci.edu/ml/datasets/Adult
+    Site: http://archive.ics.uci.edu/ml/datasets/Adult
 
     Classification task. n_classes = 2.
     a9a X train dataset (39073, 123)
@@ -65,6 +65,14 @@ def a_nine_a(dataset_dir: Path) -> bool:
 
 
 def airline(dataset_dir: Path) -> bool:
+    """
+    Airline dataset
+    http://kt.ijs.si/elena_ikonomovska/data.html
+
+    TaskType:binclass
+    NumberOfFeatures:13
+    NumberOfInstances:115M
+    """
     dataset_name = 'airline'
     os.makedirs(dataset_dir, exist_ok=True)
 
@@ -131,10 +139,10 @@ def airline_ohe(dataset_dir: Path) -> bool:
     local_url_train = os.path.join(dataset_dir, os.path.basename(url_train))
     local_url_test = os.path.join(dataset_dir, os.path.basename(url_test))
     if not os.path.isfile(local_url_train):
-        logging.info(f'Started loading {dataset_name}')
+        logging.info(f'Started loading {dataset_name} train')
         retrieve(url_train, local_url_train)
     if not os.path.isfile(local_url_test):
-        logging.info(f'Started loading {dataset_name}')
+        logging.info(f'Started loading {dataset_name} test')
         retrieve(url_test, local_url_test)
     logging.info(f'{dataset_name} is loaded, started parsing...')
 
@@ -170,6 +178,17 @@ def airline_ohe(dataset_dir: Path) -> bool:
 
 
 def bosch(dataset_dir: Path) -> bool:
+    """
+    Bosch Production Line Performance data set
+    https://www.kaggle.com/c/bosch-production-line-performance
+
+    Requires Kaggle API and API token (https://github.com/Kaggle/kaggle-api)
+    Contains missing values as NaN.
+
+    TaskType:binclass
+    NumberOfFeatures:968
+    NumberOfInstances:1.184M
+    """
     dataset_name = 'bosch'
     os.makedirs(dataset_dir, exist_ok=True)
 
@@ -233,6 +252,14 @@ def codrnanorm(dataset_dir: Path) -> bool:
 
 
 def epsilon(dataset_dir: Path) -> bool:
+    """
+    Epsilon dataset
+    https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html
+
+    TaskType:binclass
+    NumberOfFeatures:2000
+    NumberOfInstances:500K
+    """
     dataset_name = 'epsilon'
     os.makedirs(dataset_dir, exist_ok=True)
 
@@ -268,6 +295,17 @@ def epsilon(dataset_dir: Path) -> bool:
 
 
 def fraud(dataset_dir: Path) -> bool:
+    """
+    Credit Card Fraud Detection contest
+    https://www.kaggle.com/mlg-ulb/creditcardfraud
+
+    Requires Kaggle API and API token (https://github.com/Kaggle/kaggle-api)
+    Contains missing values as NaN.
+
+    TaskType:binclass
+    NumberOfFeatures:30
+    NumberOfInstances:285K
+    """
     dataset_name = 'fraud'
     os.makedirs(dataset_dir, exist_ok=True)
 
@@ -371,6 +409,14 @@ def gisette(dataset_dir: Path) -> bool:
 
 
 def higgs(dataset_dir: Path) -> bool:
+    """
+    Higgs dataset from UCI machine learning repository
+    https://archive.ics.uci.edu/ml/datasets/HIGGS
+
+    TaskType:binclass
+    NumberOfFeatures:28
+    NumberOfInstances:11M
+    """
     dataset_name = 'higgs'
     os.makedirs(dataset_dir, exist_ok=True)
 
@@ -397,11 +443,14 @@ def higgs(dataset_dir: Path) -> bool:
 
 def higgs_one_m(dataset_dir: Path) -> bool:
     """
-    Higgs dataset from UCI machine learning repository (
-    https://archive.ics.uci.edu/ml/datasets/HIGGS).
+    Higgs dataset from UCI machine learning repository
+    https://archive.ics.uci.edu/ml/datasets/HIGGS
+
+    Only the first 1.5M samples are taken
+
     TaskType:binclass
     NumberOfFeatures:28
-    NumberOfInstances:11M
+    NumberOfInstances:1.5M
     """
     dataset_name = 'higgs1m'
     os.makedirs(dataset_dir, exist_ok=True)
@@ -511,6 +560,9 @@ def klaverjas(dataset_dir: Path) -> bool:
 
 
 def santander(dataset_dir: Path) -> bool:
+    """
+    Still doesn't have a loading instruction
+    """
     return False
 
 
diff --git a/datasets/loader_mul.py b/datasets/loader_mul.py
@@ -98,6 +98,15 @@ def covertype(dataset_dir: Path) -> bool:
 
 
 def covtype(dataset_dir: Path) -> bool:
+    """
+    Cover type dataset from UCI machine learning repository
+    https://archive.ics.uci.edu/ml/datasets/covertype
+
+    y contains 7 unique class labels from 1 to 7 inclusive.
+    TaskType:multiclass
+    NumberOfFeatures:54
+    NumberOfInstances:581012
+    """
     dataset_name = 'covtype'
     os.makedirs(dataset_dir, exist_ok=True)
 
@@ -192,6 +201,7 @@ def mnist(dataset_dir: Path) -> bool:
 def msrank(dataset_dir: Path) -> bool:
     """
     Dataset from szilard benchmarks: https://github.com/szilard/GBM-perf
+
     TaskType:binclass
     NumberOfFeatures:700
     NumberOfInstances:10100000
@@ -200,21 +210,23 @@ def msrank(dataset_dir: Path) -> bool:
     os.makedirs(dataset_dir, exist_ok=True)
     url = "https://storage.mds.yandex.net/get-devtools-opensource/471749/msrank.tar.gz"
     local_url = os.path.join(dataset_dir, os.path.basename(url))
+    unzipped_url = os.path.join(dataset_dir, "MSRank")
     if not os.path.isfile(local_url):
         logging.info(f'Started loading {dataset_name}')
         retrieve(url, local_url)
+    if not os.path.isdir(unzipped_url):
         logging.info(f'{dataset_name} is loaded, unzipping...')
         tar = tarfile.open(local_url, "r:gz")
         tar.extractall(dataset_dir)
         tar.close()
-        logging.info(f'{dataset_name} is unzipped, started parsing...')
+    logging.info(f'{dataset_name} is unzipped, started parsing...')
 
     sets = []
     labels = []
     n_features = 137
 
     for set_name in ['train.txt', 'vali.txt', 'test.txt']:
-        file_name = str(dataset_dir) + os.path.join('MSRank', set_name)
+        file_name = os.path.join(unzipped_url, set_name)
 
         n_samples = count_lines(file_name)
         with open(file_name, 'r') as file_obj:
@@ -238,6 +250,9 @@ def msrank(dataset_dir: Path) -> bool:
 
 
 def plasticc(dataset_dir: Path) -> bool:
+    """
+    Still doesn't have a loading instruction
+    """
     return False
 
 
diff --git a/datasets/loader_reg.py b/datasets/loader_reg.py
@@ -60,10 +60,21 @@ def abalone(dataset_dir: Path) -> bool:
 
 
 def mortgage_first_q(dataset_dir: Path) -> bool:
+    """
+    Still doesn't have a loading instruction
+    """
     return False
 
 
 def year_prediction_msd(dataset_dir: Path) -> bool:
+    """
+    YearPredictionMSD dataset from UCI repository
+    https://archive.ics.uci.edu/ml/datasets/yearpredictionmsd
+
+    TaskType:regression
+    NumberOfFeatures:90
+    NumberOfInstances:515345
+    """
     dataset_name = 'year_prediction_msd'
     os.makedirs(dataset_dir, exist_ok=True)
 

Original file line number	Diff line number	Diff line change
`@@ -173,11 +173,6 @@`
`173`	`173`	`{`
`174`	`174`	`"x": "data/plasticc_x_train.csv",`
`175`	`175`	`"y": "data/plasticc_y_train.csv"`
`176`		`- },`
`177`		`- "testing":`
`178`		`- {`
`179`		`- "x": "data/plasticc_x_test.csv",`
`180`		`- "y": "data/plasticc_y_test.csv"`
`181`	`176`	`}`
`182`	`177`	`}`
`183`	`178`	`],`