-
Notifications
You must be signed in to change notification settings - Fork 73
Extend output result & minor fixes #81
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 10 commits
c9f6165
7c33dce
46bf478
e739eba
957465f
bc7e964
a37921c
4b22fb6
33794ed
28b8185
069e15f
e0a6d74
4a49d18
2310156
be16c37
6508ca2
a4f3b70
3afb8b7
501d328
4412444
6f58886
9fa4a94
070c4d2
c7b7a5c
8c8f964
ff9da3b
9871f94
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -30,9 +30,9 @@ def get_dtype(data): | |
''' | ||
if hasattr(data, 'dtype'): | ||
return data.dtype | ||
elif hasattr(data, 'dtypes'): | ||
if hasattr(data, 'dtypes'): | ||
return str(data.dtypes[0]) | ||
elif hasattr(data, 'values'): | ||
if hasattr(data, 'values'): | ||
return data.values.dtype | ||
else: | ||
raise ValueError(f'Impossible to get data type of {type(data)}') | ||
|
@@ -66,10 +66,7 @@ def _parse_size(string, dim=2): | |
|
||
|
||
def float_or_int(string): | ||
if '.' in string: | ||
return float(string) | ||
else: | ||
return int(string) | ||
return float(string) if '.' in string else int(string) | ||
|
||
|
||
def get_optimal_cache_size(n_rows, dtype=np.double, max_cache=64): | ||
|
@@ -90,10 +87,8 @@ def get_optimal_cache_size(n_rows, dtype=np.double, max_cache=64): | |
optimal_cache_size_bytes = byte_size * (n_rows ** 2) | ||
one_gb = 2 ** 30 | ||
max_cache_bytes = max_cache * one_gb | ||
if optimal_cache_size_bytes > max_cache_bytes: | ||
return max_cache_bytes | ||
else: | ||
return optimal_cache_size_bytes | ||
return max_cache_bytes if optimal_cache_size_bytes > max_cache_bytes \ | ||
else optimal_cache_size_bytes | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. There is an extra comma at the end of line 175. |
||
|
||
|
||
def parse_args(parser, size=None, loop_types=(), | ||
|
@@ -324,34 +319,47 @@ def convert_to_numpy(data): | |
return data | ||
|
||
|
||
def columnwise_score(y, yp, score_func): | ||
def accuracy_score(y, yp): | ||
from sklearn.metrics import accuracy_score as sklearn_accuracy | ||
y = convert_to_numpy(y) | ||
yp = convert_to_numpy(yp) | ||
if y.ndim + yp.ndim > 2: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is that code really useless? Does sklearn_accuracy work the same way? |
||
if 1 in (y.shape + yp.shape)[1:]: | ||
if y.ndim > 1: | ||
y = y[:, 0] | ||
if yp.ndim > 1: | ||
yp = yp[:, 0] | ||
else: | ||
return [score_func(y[i], yp[i]) for i in range(y.shape[1])] | ||
return score_func(y, yp) | ||
|
||
|
||
def accuracy_score(y, yp): | ||
return columnwise_score(y, yp, lambda y1, y2: np.mean(y1 == y2)) | ||
return sklearn_accuracy(y, yp) | ||
|
||
|
||
def log_loss(y, yp): | ||
from sklearn.metrics import log_loss as sklearn_log_loss | ||
y = convert_to_numpy(y) | ||
yp = convert_to_numpy(yp) | ||
return sklearn_log_loss(y, yp) | ||
try: | ||
res = sklearn_log_loss(y, yp) | ||
except Exception: | ||
res = None | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Handling exceptions like this is a bad practice: every mistake will be hidden by this code. |
||
return res | ||
|
||
|
||
def roc_auc_score(y, yp, multi_class='ovr'): | ||
from sklearn.metrics import roc_auc_score as sklearn_roc_auc | ||
y = convert_to_numpy(y) | ||
yp = convert_to_numpy(yp) | ||
try: | ||
res = sklearn_roc_auc(y, yp, multi_class=multi_class) | ||
except Exception: | ||
res = None | ||
return res | ||
|
||
|
||
def rmse_score(y, yp): | ||
return columnwise_score( | ||
y, yp, lambda y1, y2: float(np.sqrt(np.mean((y1 - y2)**2)))) | ||
from sklearn.metrics import mean_squared_error as sklearn_mse | ||
y = convert_to_numpy(y) | ||
yp = convert_to_numpy(yp) | ||
return sklearn_mse(y, yp) | ||
|
||
|
||
def r2_score(y, yp): | ||
from sklearn.metrics import r2_score as sklearn_r2_score | ||
y = convert_to_numpy(y) | ||
yp = convert_to_numpy(yp) | ||
return sklearn_r2_score(y, yp) | ||
|
||
|
||
def convert_data(data, dtype, data_order, data_format): | ||
|
@@ -367,14 +375,11 @@ def convert_data(data, dtype, data_order, data_format): | |
# Secondly, change format of data | ||
if data_format == 'numpy': | ||
return data | ||
elif data_format == 'pandas': | ||
if data_format == 'pandas': | ||
import pandas as pd | ||
|
||
if data.ndim == 1: | ||
return pd.Series(data) | ||
else: | ||
return pd.DataFrame(data) | ||
elif data_format == 'cudf': | ||
return pd.Series(data) if data.ndim == 1 else pd.DataFrame(data) | ||
if data_format == 'cudf': | ||
import cudf | ||
import pandas as pd | ||
|
||
|
@@ -497,7 +502,12 @@ def print_output(library, algorithm, stages, params, functions, | |
data[i], alg_instance, alg_params) | ||
result.update({'time[s]': times[i]}) | ||
if accuracy_type is not None: | ||
result.update({f'{accuracy_type}': accuracies[i]}) | ||
if isinstance(accuracy_type, str): | ||
result.update({f'{accuracy_type}': accuracies[i]}) | ||
elif isinstance(accuracy_type, list): | ||
for ind, val in enumerate(accuracy_type): | ||
if accuracies[ind][i] is not None: | ||
result.update({f'{val}': accuracies[ind][i]}) | ||
if hasattr(params, 'n_classes'): | ||
result['input_data'].update({'classes': params.n_classes}) | ||
if hasattr(params, 'n_clusters'): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -307,7 +307,7 @@ | |
} | ||
], | ||
"nu": [0.25], | ||
"kernel": ["sigmoid"] | ||
"kernel": ["poly"] | ||
}, | ||
{ | ||
"algorithm": "svr", | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -40,7 +40,10 @@ def main(): | |
labels = dbscan.labels_ | ||
|
||
params.n_clusters = len(set(labels)) - (1 if -1 in labels else 0) | ||
acc = davies_bouldin_score(X, labels) | ||
try: | ||
acc = davies_bouldin_score(X, labels) | ||
except Exception: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It would be better to have all scoring functions in the same place (bench.py). |
||
acc = -1 | ||
|
||
bench.print_output(library='sklearn', algorithm='dbscan', stages=['training'], | ||
params=params, functions=['DBSCAN'], times=[time], | ||
|
@@ -50,7 +53,7 @@ def main(): | |
|
||
if __name__ == "__main__": | ||
parser = argparse.ArgumentParser(description='scikit-learn DBSCAN benchmark') | ||
parser.add_argument('-e', '--eps', '--epsilon', type=float, default=10., | ||
parser.add_argument('-e', '--eps', '--epsilon', type=float, default=0.5, | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why did you change it? Will this not affect the measurements of the current configs? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It is the default value in sklearn. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Will this not affect the measurements of the current configs? |
||
help='Radius of neighborhood of a point') | ||
parser.add_argument('-m', '--min-samples', default=5, type=int, | ||
help='The minimum number of samples required in a ' | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment.
The reason will be displayed to describe this comment to others. Learn more.
Just raise the exception here directly, without the `else` branch.