Skip to content

Extend output result & minor fixes #81

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 27 commits into from
Aug 4, 2021
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion configs/blogs/skl_2021_3.json
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,7 @@
}
],
"nu": [0.25],
"kernel": ["sigmoid"]
"kernel": ["poly"]
},
{
"algorithm": "svr",
Expand Down
41 changes: 26 additions & 15 deletions datasets/loader_classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,9 +68,11 @@ def airline(dataset_dir: Path) -> bool:
Airline dataset
http://kt.ijs.si/elena_ikonomovska/data.html

TaskType:binclass
NumberOfFeatures:13
NumberOfInstances:115M
Classification task. n_classes = 2.
airline X train dataset (92055213, 13)
airline y train dataset (92055213, 1)
airline X test dataset (23013804, 13)
airline y test dataset (23013804, 1)
"""
dataset_name = 'airline'
os.makedirs(dataset_dir, exist_ok=True)
Expand Down Expand Up @@ -126,9 +128,12 @@ def airline(dataset_dir: Path) -> bool:
def airline_ohe(dataset_dir: Path) -> bool:
"""
Dataset from szilard benchmarks: https://github.com/szilard/GBM-perf
TaskType:binclass
NumberOfFeatures:700
NumberOfInstances:10100000

Classification task. n_classes = 2.
airline-ohe X train dataset (1000000, 692)
airline-ohe y train dataset (1000000, 1)
airline-ohe X test dataset (100000, 692)
airline-ohe y test dataset (100000, 1)
"""
dataset_name = 'airline-ohe'
os.makedirs(dataset_dir, exist_ok=True)
Expand Down Expand Up @@ -289,9 +294,11 @@ def epsilon(dataset_dir: Path) -> bool:
Epsilon dataset
https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html

TaskType:binclass
NumberOfFeatures:2000
NumberOfInstances:500K
Classification task. n_classes = 2.
epsilon X train dataset (400000, 2000)
epsilon y train dataset (400000, 1)
epsilon X test dataset (100000, 2000)
epsilon y test dataset (100000, 1)
"""
dataset_name = 'epsilon'
os.makedirs(dataset_dir, exist_ok=True)
Expand Down Expand Up @@ -444,9 +451,11 @@ def higgs(dataset_dir: Path) -> bool:
Higgs dataset from UCI machine learning repository
https://archive.ics.uci.edu/ml/datasets/HIGGS

TaskType:binclass
NumberOfFeatures:28
NumberOfInstances:11M
Classification task. n_classes = 2.
higgs X train dataset (8799999, 28)
higgs y train dataset (8799999, 1)
higgs X test dataset (2200000, 28)
higgs y test dataset (2200000, 1)
"""
dataset_name = 'higgs'
os.makedirs(dataset_dir, exist_ok=True)
Expand Down Expand Up @@ -479,9 +488,11 @@ def higgs_one_m(dataset_dir: Path) -> bool:

Only the first 1.5M samples are taken

TaskType:binclass
NumberOfFeatures:28
NumberOfInstances:1.5M
Classification task. n_classes = 2.
higgs1m X train dataset (1000000, 28)
higgs1m y train dataset (1000000, 1)
higgs1m X test dataset (500000, 28)
higgs1m y test dataset (500000, 1)
"""
dataset_name = 'higgs1m'
os.makedirs(dataset_dir, exist_ok=True)
Expand Down
24 changes: 15 additions & 9 deletions datasets/loader_multiclass.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,9 +99,11 @@ def covtype(dataset_dir: Path) -> bool:
https://archive.ics.uci.edu/ml/datasets/covertype

y contains 7 unique class labels from 1 to 7 inclusive.
TaskType:multiclass
NumberOfFeatures:54
NumberOfInstances:581012
Classification task. n_classes = 7.
covtype X train dataset (464809, 54)
covtype y train dataset (464809, 1)
covtype X test dataset (116203, 54)
covtype y test dataset (116203, 1)
"""
dataset_name = 'covtype'
os.makedirs(dataset_dir, exist_ok=True)
Expand All @@ -125,9 +127,11 @@ def letters(dataset_dir: Path) -> bool:
"""
http://archive.ics.uci.edu/ml/datasets/Letter+Recognition

TaskType:multiclass
NumberOfFeatures:16
NumberOfInstances:20.000
Classification task. n_classes = 26.
letters X train dataset (16000, 16)
letters y train dataset (16000, 1)
letters X test dataset (4000, 16)
letters y test dataset (4000, 1)
"""
dataset_name = 'letters'
os.makedirs(dataset_dir, exist_ok=True)
Expand Down Expand Up @@ -204,9 +208,11 @@ def msrank(dataset_dir: Path) -> bool:
"""
Dataset from szilard benchmarks: https://github.com/szilard/GBM-perf

TaskType:multiclass
NumberOfFeatures:137
NumberOfInstances:1.2M
Multiclass classification task
msrank X train dataset (958671, 137)
msrank y train dataset (958671, 1)
msrank X test dataset (241521, 137)
msrank y test dataset (241521, 1)
"""
dataset_name = 'msrank'
os.makedirs(dataset_dir, exist_ok=True)
Expand Down
14 changes: 8 additions & 6 deletions datasets/loader_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,10 @@ def abalone(dataset_dir: Path) -> bool:
"""
https://archive.ics.uci.edu/ml/machine-learning-databases/abalone

TaskType:regression
NumberOfFeatures:8
NumberOfInstances:4177
abalone x train dataset (3341, 8)
abalone y train dataset (3341, 1)
abalone x test dataset (836, 8)
abalone y test dataset (836, 1)
"""
dataset_name = 'abalone'
os.makedirs(dataset_dir, exist_ok=True)
Expand Down Expand Up @@ -196,9 +197,10 @@ def year_prediction_msd(dataset_dir: Path) -> bool:
YearPredictionMSD dataset from UCI repository
https://archive.ics.uci.edu/ml/datasets/yearpredictionmsd

TaskType:regression
NumberOfFeatures:90
NumberOfInstances:515345
year_prediction_msd x train dataset (463715, 90)
year_prediction_msd y train dataset (463715, 1)
year_prediction_msd x test dataset (51630, 90)
year_prediction_msd y test dataset (51630, 1)
"""
dataset_name = 'year_prediction_msd'
os.makedirs(dataset_dir, exist_ok=True)
Expand Down
10 changes: 9 additions & 1 deletion sklearn_bench/kmeans.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@ def main():

def fit_kmeans(X, X_init):
alg = KMeans(n_clusters=params.n_clusters, tol=params.tol,
max_iter=params.maxiter, init=X_init, n_init=1)
max_iter=params.maxiter, init=X_init, n_init=params.n_init,
algorithm=params.algorithm, random_state=params.random_state)
alg.fit(X)
return alg

Expand Down Expand Up @@ -83,5 +84,12 @@ def fit_kmeans(X, X_init):
parser.add_argument('--maxiter', type=int, default=100,
help='Maximum number of iterations')
parser.add_argument('--n-clusters', type=int, help='Number of clusters')
parser.add_argument('--algorithm', type=str, default='full',
help='K-means algorithm to use')
parser.add_argument('--n_init', type=int, default=10,
help='Number of time the k-means algorithm '
'will be run with different centroid seeds')
parser.add_argument('--random_state', type=int, default=777,
help='Random state')
params = bench.parse_args(parser)
bench.run_with_context(params, main)