-
Notifications
You must be signed in to change notification settings - Fork 229
[MRG+1] Threshold for pairs learners #168
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
676ab86
cc1c3e6
f95c456
9ffe8f7
3354fb1
12cb5f1
dd8113e
1c8cd29
d12729a
dc9e21d
402729f
aaac3de
e5b1e47
a0cb3ca
8d5fc50
0f14b25
a6458a2
fada5cc
32a4889
5cf71b9
c2bc693
e96ee00
3ed3430
69c6945
bc39392
facc546
f0ca65e
a6ec283
49fbbd7
960b174
c91acf7
a742186
9ec1ead
986fed3
3f5d6d1
7b5e4dd
a3ec02c
ccc66eb
6dff15b
719d018
551d161
594c485
14713c6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,8 @@ | ||
from numpy.linalg import cholesky | ||
from scipy.spatial.distance import euclidean | ||
from sklearn.base import BaseEstimator | ||
from sklearn.utils.validation import _is_arraylike | ||
from sklearn.metrics import roc_auc_score | ||
from sklearn.utils.validation import _is_arraylike, check_is_fitted | ||
from sklearn.metrics import roc_auc_score, accuracy_score | ||
import numpy as np | ||
from abc import ABCMeta, abstractmethod | ||
import six | ||
|
@@ -317,7 +317,8 @@ def predict(self, pairs): | |
y_predicted : `numpy.ndarray` of floats, shape=(n_constraints,) | ||
The predicted learned metric value between samples in every pair. | ||
""" | ||
return self.decision_function(pairs) | ||
check_is_fitted(self, ['threshold_', 'transformer_']) | ||
return - 2 * (self.decision_function(pairs) > self.threshold_) + 1 | ||
|
||
def decision_function(self, pairs): | ||
"""Returns the learned metric between input pairs. | ||
|
@@ -369,6 +370,13 @@ def score(self, pairs, y): | |
""" | ||
return roc_auc_score(y, self.decision_function(pairs)) | ||
|
||
def set_default_threshold(self, pairs, y): | ||
"""Returns a threshold that is the mean between the similar metrics | ||
mean, and the dissimilar metrics mean""" | ||
similar_threshold = np.mean(self.decision_function(pairs[y==1])) | ||
dissimilar_threshold = np.mean(self.decision_function(pairs[y==-1])) | ||
self.threshold_ = np.mean([similar_threshold, dissimilar_threshold]) | ||
|
||
|
||
class _QuadrupletsClassifierMixin(BaseMetricLearner): | ||
|
||
|
@@ -393,6 +401,7 @@ def predict(self, quadruplets): | |
prediction : `numpy.ndarray` of floats, shape=(n_constraints,) | ||
Predictions of the ordering of pairs, for each quadruplet. | ||
""" | ||
check_is_fitted(self, 'transformer_') | ||
quadruplets = check_input(quadruplets, type_of_inputs='tuples', | ||
preprocessor=self.preprocessor_, | ||
estimator=self, tuple_size=self._tuple_size) | ||
|
@@ -435,11 +444,22 @@ def score(self, quadruplets, y=None): | |
points, or 2D array of indices of quadruplets if the metric learner | ||
uses a preprocessor. | ||
|
||
y : Ignored, for scikit-learn compatibility. | ||
y : array-like, shape=(n_constraints,) or `None` | ||
Labels of constraints. y[i] should be 1 if | ||
d(X[i, 0], X[i, 1]) is wanted to be larger than | ||
d(X[i, 2], X[i, 3]), and -1 if it is wanted to be smaller. If None, | ||
`y` will be set to `np.ones(quadruplets.shape[0])`, i.e. we want all | ||
first two points to be closer than the last two points in each | ||
quadruplet. | ||
|
||
Returns | ||
------- | ||
score : float | ||
The quadruplets score. | ||
""" | ||
return -np.mean(self.predict(quadruplets)) | ||
quadruplets = check_input(quadruplets, y, type_of_inputs='tuples', | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Note that here quadruplets will be checked twice (once here, once in predict). This is because when I do |
||
preprocessor=self.preprocessor_, | ||
estimator=self, tuple_size=self._tuple_size) | ||
if y is None: | ||
y = np.ones(quadruplets.shape[0]) | ||
return accuracy_score(y, self.predict(quadruplets)) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -148,6 +148,11 @@ class ITML(_BaseITML, _PairsClassifierMixin): | |
transformer_ : `numpy.ndarray`, shape=(num_dims, n_features) | ||
The linear transformation ``L`` deduced from the learned Mahalanobis | ||
metric (See function `transformer_from_metric`.) | ||
|
||
threshold_ : `float` | ||
If the distance metric between two points is lower than this threshold, | ||
points will be classified as similar, otherwise they will be | ||
classified as dissimilar. | ||
""" | ||
|
||
def fit(self, pairs, y, bounds=None): | ||
|
@@ -176,7 +181,9 @@ def fit(self, pairs, y, bounds=None): | |
self : object | ||
Returns the instance. | ||
""" | ||
return self._fit(pairs, y, bounds=bounds) | ||
self._fit(pairs, y, bounds=bounds) | ||
self.threshold_ = np.mean(self.bounds_) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. To remove. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I guess if the above is yes you mean remove this and replace it by |
||
return self | ||
|
||
|
||
class ITML_Supervised(_BaseITML, TransformerMixin): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
import pytest | ||
from sklearn.exceptions import NotFittedError | ||
from sklearn.model_selection import train_test_split | ||
|
||
from test.test_utils import pairs_learners, ids_pairs_learners | ||
from sklearn.utils.testing import set_random_state | ||
from sklearn import clone | ||
import numpy as np | ||
|
||
|
||
@pytest.mark.parametrize('with_preprocessor', [True, False]) | ||
@pytest.mark.parametrize('estimator, build_dataset', pairs_learners, | ||
ids=ids_pairs_learners) | ||
def test_predict_only_one_or_minus_one(estimator, build_dataset, | ||
with_preprocessor): | ||
"""Test that all predicted values are either +1 or -1""" | ||
input_data, labels, preprocessor, _ = build_dataset(with_preprocessor) | ||
estimator = clone(estimator) | ||
estimator.set_params(preprocessor=preprocessor) | ||
set_random_state(estimator) | ||
pairs_train, pairs_test, y_train, y_test = train_test_split(input_data, | ||
labels) | ||
estimator.fit(pairs_train, y_train) | ||
predictions = estimator.predict(pairs_test) | ||
assert np.isin(predictions, [-1, 1]).all() | ||
|
||
|
||
@pytest.mark.parametrize('with_preprocessor', [True, False]) | ||
@pytest.mark.parametrize('estimator, build_dataset', pairs_learners, | ||
ids=ids_pairs_learners) | ||
def test_predict_monotonous(estimator, build_dataset, | ||
with_preprocessor): | ||
"""Test that there is a threshold distance separating points labeled as | ||
similar and points labeled as dissimilar """ | ||
input_data, labels, preprocessor, _ = build_dataset(with_preprocessor) | ||
estimator = clone(estimator) | ||
estimator.set_params(preprocessor=preprocessor) | ||
set_random_state(estimator) | ||
pairs_train, pairs_test, y_train, y_test = train_test_split(input_data, | ||
labels) | ||
estimator.fit(pairs_train, y_train) | ||
distances = estimator.score_pairs(pairs_test) | ||
predictions = estimator.predict(pairs_test) | ||
min_dissimilar = np.min(distances[predictions == -1]) | ||
max_similar = np.max(distances[predictions == 1]) | ||
assert max_similar <= min_dissimilar | ||
separator = np.mean([min_dissimilar, max_similar]) | ||
assert (predictions[distances > separator] == -1).all() | ||
assert (predictions[distances < separator] == 1).all() | ||
|
||
|
||
@pytest.mark.parametrize('with_preprocessor', [True, False]) | ||
@pytest.mark.parametrize('estimator, build_dataset', pairs_learners, | ||
ids=ids_pairs_learners) | ||
def test_raise_not_fitted_error_if_not_fitted(estimator, build_dataset, | ||
with_preprocessor): | ||
"""Test that a NotFittedError is raised if someone tries to predict and | ||
the metric learner has not been fitted.""" | ||
input_data, labels, preprocessor, _ = build_dataset(with_preprocessor) | ||
estimator = clone(estimator) | ||
estimator.set_params(preprocessor=preprocessor) | ||
set_random_state(estimator) | ||
with pytest.raises(NotFittedError): | ||
estimator.predict(input_data) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
import pytest | ||
from sklearn.exceptions import NotFittedError | ||
from sklearn.model_selection import train_test_split | ||
|
||
from test.test_utils import quadruplets_learners, ids_quadruplets_learners | ||
from sklearn.utils.testing import set_random_state | ||
from sklearn import clone | ||
import numpy as np | ||
|
||
|
||
@pytest.mark.parametrize('with_preprocessor', [True, False]) | ||
@pytest.mark.parametrize('estimator, build_dataset', quadruplets_learners, | ||
ids=ids_quadruplets_learners) | ||
def test_predict_only_one_or_minus_one(estimator, build_dataset, | ||
with_preprocessor): | ||
"""Test that all predicted values are either +1 or -1""" | ||
input_data, labels, preprocessor, _ = build_dataset(with_preprocessor) | ||
estimator = clone(estimator) | ||
estimator.set_params(preprocessor=preprocessor) | ||
set_random_state(estimator) | ||
(quadruplets_train, | ||
quadruplets_test, y_train, y_test) = train_test_split(input_data, labels) | ||
estimator.fit(quadruplets_train, y_train) | ||
predictions = estimator.predict(quadruplets_test) | ||
assert np.isin(predictions, [-1, 1]).all() | ||
|
||
|
||
@pytest.mark.parametrize('with_preprocessor', [True, False]) | ||
@pytest.mark.parametrize('estimator, build_dataset', quadruplets_learners, | ||
ids=ids_quadruplets_learners) | ||
def test_predict_monotonous(estimator, build_dataset, | ||
with_preprocessor): | ||
"""Test that there is a threshold distance separating points labeled as | ||
similar and points labeled as dissimilar """ | ||
input_data, labels, preprocessor, _ = build_dataset(with_preprocessor) | ||
estimator = clone(estimator) | ||
estimator.set_params(preprocessor=preprocessor) | ||
set_random_state(estimator) | ||
(quadruplets_train, | ||
quadruplets_test, y_train, y_test) = train_test_split(input_data, labels) | ||
estimator.fit(quadruplets_train, y_train) | ||
distances = estimator.score_quadruplets(quadruplets_test) | ||
predictions = estimator.predict(quadruplets_test) | ||
min_dissimilar = np.min(distances[predictions == -1]) | ||
max_similar = np.max(distances[predictions == 1]) | ||
assert max_similar <= min_dissimilar | ||
separator = np.mean([min_dissimilar, max_similar]) | ||
assert (predictions[distances > separator] == -1).all() | ||
assert (predictions[distances < separator] == 1).all() | ||
|
||
|
||
@pytest.mark.parametrize('with_preprocessor', [True, False]) | ||
@pytest.mark.parametrize('estimator, build_dataset', quadruplets_learners, | ||
ids=ids_quadruplets_learners) | ||
def test_raise_not_fitted_error_if_not_fitted(estimator, build_dataset, | ||
with_preprocessor): | ||
"""Test that a NotFittedError is raised if someone tries to predict and | ||
the metric learner has not been fitted.""" | ||
input_data, labels, preprocessor, _ = build_dataset(with_preprocessor) | ||
estimator = clone(estimator) | ||
estimator.set_params(preprocessor=preprocessor) | ||
set_random_state(estimator) | ||
with pytest.raises(NotFittedError): | ||
estimator.predict(input_data) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment.
The reason will be displayed to describe this comment to others. Learn more.
to remove if indeed we choose the accuracy-calibrated threshold as default