Update xgboost configs;

Alexsandruss · Alexsandruss · commit c44943ca1ce3 · 2024-05-21T10:00:24.000-07:00
Update conda envs;
Fix data loaders: concatenate train/test labels with np.hstack instead of np.vstack
diff --git a/configs/common/xgboost.json b/configs/common/xgboost.json
@@ -5,7 +5,7 @@
                 "algorithm": {
                     "device": "cpu",
                     "estimator_params": { "tree_method": "hist" },
-                    "enable_modelbuilders": [true, false]
+                    "enable_modelbuilders": false
                 }
             },
             {
diff --git a/configs/regular/xgboost_binary.json b/configs/regular/xgboost_binary.json
@@ -65,8 +65,8 @@
                 "data": {
                     "dataset": "epsilon",
                     "split_kwargs": {
-                        "train_size": 4000,
-                        "test_size": 80000
+                        "train_size": 10000,
+                        "test_size": 100000
                     }
                 },
                 "algorithm": {
@@ -77,12 +77,34 @@
                         "n_estimators": 200
                     }
                 }
+            },
+            {
+                "data": {
+                    "dataset": "gisette",
+                    "split_kwargs": {
+                        "train_size": 2000,
+                        "test_size": 5000
+                    }
+                },
+                "algorithm": {
+                    "estimator_params": {
+                        "learning_rate": 0.15,
+                        "max_leaves": 256,
+                        "colsample_bytree": 0.1,
+                        "colsample_bynode": 0.1,
+                        "n_estimators": 100
+                    }
+                }
             }
         ]
 	},
 	"TEMPLATES": {
         "binary classification": {
-			"SETS": ["xgboost binary classification", "xgboost implementations", "binary classification data"]
+			"SETS": [
+                "xgboost binary classification",
+                "xgboost implementations",
+                "binary classification data"
+            ]
 		}
 	}
 }
diff --git a/configs/regular/xgboost_multi.json b/configs/regular/xgboost_multi.json
@@ -63,7 +63,11 @@
 	},
 	"TEMPLATES": {
         "multi classification": {
-			"SETS": ["xgboost multiclassification", "xgboost implementations", "multiclassification data"]
+			"SETS": [
+                "xgboost multiclassification",
+                "xgboost implementations",
+                "multiclassification data"
+            ]
 		}
 	}
 }
diff --git a/configs/regular/xgboost_regression.json b/configs/regular/xgboost_regression.json
@@ -71,12 +71,34 @@
                         "n_estimators": 500
                     }
                 }
+            },
+            {
+                "data": {
+                    "dataset": "gisette",
+                    "split_kwargs": {
+                        "train_size": 2000,
+                        "test_size": 5000
+                    }
+                },
+                "algorithm": {
+                    "estimator_params": {
+                        "learning_rate": 0.15,
+                        "max_leaves": 256,
+                        "colsample_bytree": 0.1,
+                        "colsample_bynode": 0.1,
+                        "n_estimators": 100
+                    }
+                }
             }
         ]
 	},
 	"TEMPLATES": {
 		"regression": {
-			"SETS": ["xgboost regression", "xgboost implementations", "regression data"]
+			"SETS": [
+                "xgboost regression",
+                "xgboost implementations",
+                "regression data"
+            ]
 		}
 	}
 }
diff --git a/envs/conda-env-rapids.yml b/envs/conda-env-rapids.yml
@@ -6,10 +6,16 @@ dependencies:
   - python=3.10
   - rapids
   - cudatoolkit
+  # blas and openmp
+  - libblas=*=*mkl
+  - _openmp_mutex=*=*llvm
   # sklbench dependencies
+  - scikit-learn
   - pandas
   - tabulate
   - fastparquet
+  - h5py
+  - kaggle
   - openpyxl
   - tqdm
   - psutil
diff --git a/envs/conda-env-sklearn.yml b/envs/conda-env-sklearn.yml
@@ -11,10 +11,16 @@ dependencies:
   - faiss-cpu
   - intel::scikit-learn-intelex
   - intel::daal4py
+  # blas and openmp
+  - libblas=*=*mkl
+  - _openmp_mutex=*=*llvm
   # sklbench dependencies
+  - scikit-learn
   - pandas
   - tabulate
   - fastparquet
+  - h5py
+  - kaggle
   - openpyxl
   - tqdm
   - psutil
diff --git a/sklbench/datasets/loaders.py b/sklbench/datasets/loaders.py
@@ -361,7 +361,7 @@ def load_epsilon(
     x_test, y_test = load_svmlight_file(local_url_test, dtype=np.float32)
 
     x = sparse.vstack([x_train, x_test])
-    y = np.vstack([y_train, y_test])
+    y = np.hstack([y_train, y_test])
     y[y <= 0] = 0
 
     data_desc = {
@@ -423,7 +423,7 @@ def convert_y(y, n_samples):
     y_test = convert_y(data["y_test"], test_size)
 
     x = np.vstack([x_train, x_test])
-    y = np.vstack([y_train, y_test])
+    y = np.hstack([y_train, y_test])
 
     data_desc = {
         "n_classes": 2,

Original file line number	Diff line number	Diff line change
`@@ -5,7 +5,7 @@`
`5`	`5`	`"algorithm": {`
`6`	`6`	`"device": "cpu",`
`7`	`7`	`"estimator_params": { "tree_method": "hist" },`
`8`		`- "enable_modelbuilders": [true, false]`
	`8`	`+ "enable_modelbuilders": false`
`9`	`9`	`}`
`10`	`10`	`},`
`11`	`11`	`{`
Original file line number	Diff line number	Diff line change
`@@ -63,7 +63,11 @@`
`63`	`63`	`},`
`64`	`64`	`"TEMPLATES": {`
`65`	`65`	`"multi classification": {`
`66`		`- "SETS": ["xgboost multiclassification", "xgboost implementations", "multiclassification data"]`
	`66`	`+ "SETS": [`
	`67`	`+ "xgboost multiclassification",`
	`68`	`+ "xgboost implementations",`
	`69`	`+ "multiclassification data"`
	`70`	`+ ]`
`67`	`71`	`}`
`68`	`72`	`}`
`69`	`73`	`}`
Original file line number	Diff line number	Diff line change
`@@ -71,12 +71,34 @@`
`71`	`71`	`"n_estimators": 500`
`72`	`72`	`}`
`73`	`73`	`}`
	`74`	`+ },`
	`75`	`+ {`
	`76`	`+ "data": {`
	`77`	`+ "dataset": "gisette",`
	`78`	`+ "split_kwargs": {`
	`79`	`+ "train_size": 2000,`
	`80`	`+ "test_size": 5000`
	`81`	`+ }`
	`82`	`+ },`
	`83`	`+ "algorithm": {`
	`84`	`+ "estimator_params": {`
	`85`	`+ "learning_rate": 0.15,`
	`86`	`+ "max_leaves": 256,`
	`87`	`+ "colsample_bytree": 0.1,`
	`88`	`+ "colsample_bynode": 0.1,`
	`89`	`+ "n_estimators": 100`
	`90`	`+ }`
	`91`	`+ }`
`74`	`92`	`}`
`75`	`93`	`]`
`76`	`94`	`},`
`77`	`95`	`"TEMPLATES": {`
`78`	`96`	`"regression": {`
`79`		`- "SETS": ["xgboost regression", "xgboost implementations", "regression data"]`
	`97`	`+ "SETS": [`
	`98`	`+ "xgboost regression",`
	`99`	`+ "xgboost implementations",`
	`100`	`+ "regression data"`
	`101`	`+ ]`
`80`	`102`	`}`
`81`	`103`	`}`
`82`	`104`	`}`