From 676ab8641d8e7fabf851848bd2265061fcf6d194 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Mon, 4 Feb 2019 14:39:24 +0100 Subject: [PATCH 01/41] add some tests for testing that different scores work using the scoring function --- test/test_sklearn_compat.py | 66 ++++++++++++++++++++++++++++--------- 1 file changed, 51 insertions(+), 15 deletions(-) diff --git a/test/test_sklearn_compat.py b/test/test_sklearn_compat.py index d9dce685..5dbabf98 100644 --- a/test/test_sklearn_compat.py +++ b/test/test_sklearn_compat.py @@ -15,9 +15,11 @@ import numpy as np from sklearn.model_selection import (cross_val_score, cross_val_predict, train_test_split, KFold) +from sklearn.metrics.scorer import make_scorer from sklearn.utils.testing import _get_args from test.test_utils import (metric_learners, ids_metric_learners, - mock_preprocessor) + mock_preprocessor, tuples_learners, + ids_tuples_learners) # Wrap the _Supervised methods with a deterministic wrapper for testing. @@ -88,22 +90,56 @@ def test_mmc(self): @pytest.mark.parametrize('with_preprocessor', [True, False]) -@pytest.mark.parametrize('estimator, build_dataset', metric_learners, - ids=ids_metric_learners) -def test_cross_validation_is_finite(estimator, build_dataset, - with_preprocessor): +@pytest.mark.parametrize('estimator, build_dataset', tuples_learners, + ids=ids_tuples_learners) +def test_various_scoring_on_tuples_learners(estimator, build_dataset, + with_preprocessor): + """Tests that metric-learn estimators' scoring returns something finite, + for other scoring than default scoring. (List of scikit-learn's scores can be + found in sklearn.metrics.scorer). For each type of output (predict, + predict_proba, decision_function), we test a bunch of scores. + """ + input_data, labels, preprocessor, _ = build_dataset(with_preprocessor) + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + set_random_state(estimator) + + # scores that need a predict function: every tuples learner should have a + # predict function (whether the pair is of positive samples or negative + # samples) + for scoring in ['accuracy', 'f1', 'precision', 'recall']: + check_score_is_finite(scoring, estimator, input_data, labels) + # scores that need a predict_proba: + if hasattr(estimator, "predict_proba"): + for scoring in ['neg_log_loss', 'brier_score']: + check_score_is_finite(scoring, estimator, input_data, labels) + # scores that need a decision_function: every tuples learner should have a + # decision function (the metric between points) + for scoring in ['roc_auc', 'average_precision', 'average_recall']: + check_score_is_finite(scoring, estimator, input_data, labels) + + +def check_score_is_finite(scoring, estimator, input_data, labels): + assert np.isfinite(cross_val_score(estimator, input_data, labels, + scoring=scoring)).all() + assert np.isfinite(cross_val_predict(estimator, + input_data, labels, + scoring=scoring)).all() + assert np.isfinite(make_scorer(scoring)(input_data, labels)) + + +@pytest.mark.parametrize('estimator, build_dataset', tuples_learners, + ids=ids_tuples_learners) +def test_cross_validation_is_finite(estimator, build_dataset): """Tests that validation on metric-learn estimators returns something finite """ - if any(hasattr(estimator, method) for method in ["predict", "score"]): - input_data, labels, preprocessor, _ = build_dataset(with_preprocessor) - estimator = clone(estimator) - estimator.set_params(preprocessor=preprocessor) - set_random_state(estimator) - if hasattr(estimator, "score"): - assert 
np.isfinite(cross_val_score(estimator, input_data, labels)).all() - if hasattr(estimator, "predict"): - assert np.isfinite(cross_val_predict(estimator, - input_data, labels)).all() + input_data, labels, preprocessor, _ = build_dataset() + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + set_random_state(estimator) + assert np.isfinite(cross_val_score(estimator, input_data, labels)).all() + assert np.isfinite(cross_val_predict(estimator, + input_data, labels)).all() @pytest.mark.parametrize('with_preprocessor', [True, False]) From cc1c3e63709819f154c147325122a50469347bc2 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 5 Feb 2019 11:12:26 +0100 Subject: [PATCH 02/41] ENH: Add tests and basic threshold implementation --- metric_learn/base_metric.py | 12 +++++-- metric_learn/itml.py | 9 ++++- metric_learn/mmc.py | 9 ++++- metric_learn/sdml.py | 9 ++++- test/test_pairs_classifiers.py | 65 ++++++++++++++++++++++++++++++++++ test/test_sklearn_compat.py | 13 ++++--- 6 files changed, 105 insertions(+), 12 deletions(-) create mode 100644 test/test_pairs_classifiers.py diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index 58b8cc5d..1cf1ec36 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -1,7 +1,7 @@ from numpy.linalg import cholesky from scipy.spatial.distance import euclidean from sklearn.base import BaseEstimator -from sklearn.utils.validation import _is_arraylike +from sklearn.utils.validation import _is_arraylike, check_is_fitted from sklearn.metrics import roc_auc_score import numpy as np from abc import ABCMeta, abstractmethod @@ -317,7 +317,8 @@ def predict(self, pairs): y_predicted : `numpy.ndarray` of floats, shape=(n_constraints,) The predicted learned metric value between samples in every pair. """ - return self.decision_function(pairs) + check_is_fitted(self, 'threshold_') + return - 2 * (self.decision_function(pairs) > self.threshold_) + 1 def decision_function(self, pairs): """Returns the learned metric between input pairs. @@ -369,6 +370,13 @@ def score(self, pairs, y): """ return roc_auc_score(y, self.decision_function(pairs)) + def set_default_threshold(self, pairs, y): + """Returns a threshold that is the mean between the similar metrics + mean, and the dissimilar metrics mean""" + similar_threshold = np.mean(self.decision_function(pairs[y==1])) + dissimilar_threshold = np.mean(self.decision_function(pairs[y==1])) + self.threshold_ = np.mean([similar_threshold, dissimilar_threshold]) + class _QuadrupletsClassifierMixin(BaseMetricLearner): diff --git a/metric_learn/itml.py b/metric_learn/itml.py index a0ff05f9..2a9e987a 100644 --- a/metric_learn/itml.py +++ b/metric_learn/itml.py @@ -148,6 +148,11 @@ class ITML(_BaseITML, _PairsClassifierMixin): transformer_ : `numpy.ndarray`, shape=(num_dims, n_features) The linear transformation ``L`` deduced from the learned Mahalanobis metric (See function `transformer_from_metric`.) + + threshold_ : `float` + If the distance metric between two points is lower than this threshold, + points will be classified as similar, otherwise they will be + classified as dissimilar. """ def fit(self, pairs, y, bounds=None): @@ -176,7 +181,9 @@ def fit(self, pairs, y, bounds=None): self : object Returns the instance. 
""" - return self._fit(pairs, y, bounds=bounds) + self._fit(pairs, y, bounds=bounds) + self.threshold_ = np.mean(self.bounds_) + return self class ITML_Supervised(_BaseITML, TransformerMixin): diff --git a/metric_learn/mmc.py b/metric_learn/mmc.py index f9d3690b..2ddcced2 100644 --- a/metric_learn/mmc.py +++ b/metric_learn/mmc.py @@ -359,6 +359,11 @@ class MMC(_BaseMMC, _PairsClassifierMixin): transformer_ : `numpy.ndarray`, shape=(num_dims, n_features) The linear transformation ``L`` deduced from the learned Mahalanobis metric (See function `transformer_from_metric`.) + + threshold_ : `float` + If the distance metric between two points is lower than this threshold, + points will be classified as similar, otherwise they will be + classified as dissimilar. """ def fit(self, pairs, y): @@ -379,7 +384,9 @@ def fit(self, pairs, y): self : object Returns the instance. """ - return self._fit(pairs, y) + self._fit(pairs, y) + self.set_default_threshold(pairs, y) + return self class MMC_Supervised(_BaseMMC, TransformerMixin): diff --git a/metric_learn/sdml.py b/metric_learn/sdml.py index 78fc4ebc..096dc0ed 100644 --- a/metric_learn/sdml.py +++ b/metric_learn/sdml.py @@ -81,6 +81,11 @@ class SDML(_BaseSDML, _PairsClassifierMixin): transformer_ : `numpy.ndarray`, shape=(num_dims, n_features) The linear transformation ``L`` deduced from the learned Mahalanobis metric (See function `transformer_from_metric`.) + + threshold_ : `float` + If the distance metric between two points is lower than this threshold, + points will be classified as similar, otherwise they will be + classified as dissimilar. """ def fit(self, pairs, y): @@ -101,7 +106,9 @@ def fit(self, pairs, y): self : object Returns the instance. """ - return self._fit(pairs, y) + self._fit(pairs, y) + self.set_default_threshold(pairs, y) + return self class SDML_Supervised(_BaseSDML, TransformerMixin): diff --git a/test/test_pairs_classifiers.py b/test/test_pairs_classifiers.py new file mode 100644 index 00000000..c497f64b --- /dev/null +++ b/test/test_pairs_classifiers.py @@ -0,0 +1,65 @@ +import pytest +from sklearn.exceptions import NotFittedError +from sklearn.model_selection import train_test_split + +from test.test_utils import pairs_learners, ids_pairs_learners +from sklearn.utils.testing import set_random_state +from sklearn import clone +import numpy as np + + +@pytest.mark.parametrize('with_preprocessor', [True, False]) +@pytest.mark.parametrize('estimator, build_dataset', pairs_learners, + ids=ids_pairs_learners) +def test_predict_monotonous(estimator, build_dataset, + with_preprocessor): + """Test that all predicted values are either +1 or -1""" + input_data, labels, preprocessor, _ = build_dataset(with_preprocessor) + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + set_random_state(estimator) + pairs_train, pairs_test, y_train, y_test = train_test_split(input_data, + labels) + estimator.fit(pairs_train, y_train) + predictions = estimator.predict(pairs_test, y_test) + assert np.isin(predictions, [-1, 1]).all() + + +@pytest.mark.parametrize('with_preprocessor', [True, False]) +@pytest.mark.parametrize('estimator, build_dataset', pairs_learners, + ids=ids_pairs_learners) +def test_predict_monotonous(estimator, build_dataset, + with_preprocessor): + """Test that there is a threshold distance separating points labeled as + similar and points labeled as dissimilar """ + input_data, labels, preprocessor, _ = build_dataset(with_preprocessor) + estimator = clone(estimator) + 
estimator.set_params(preprocessor=preprocessor) + set_random_state(estimator) + pairs_train, pairs_test, y_train, y_test = train_test_split(input_data, + labels) + estimator.fit(pairs_train, y_train) + distances = estimator.score_pairs(pairs_test) + predictions = estimator.predict(pairs_test) + min_dissimilar = np.min(distances[predictions == -1]) + max_similar = np.max(distances[predictions == 1]) + assert max_similar <= min_dissimilar + separator = np.mean([min_dissimilar, max_similar]) + assert (predictions[distances > separator] == -1).all() + assert (predictions[distances < separator] == 1).all() + + +@pytest.mark.parametrize('with_preprocessor', [True, False]) +@pytest.mark.parametrize('estimator, build_dataset', pairs_learners, + ids=ids_pairs_learners) +def test_raise_not_fitted_error_if_not_fitted(estimator, build_dataset, + with_preprocessor): + """Test that a NotFittedError is raised if someone tries to predict and + the metric learner has not been fitted.""" + input_data, labels, preprocessor, _ = build_dataset(with_preprocessor) + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + set_random_state(estimator) + with pytest.raises(NotFittedError): + estimator.predict(input_data) + diff --git a/test/test_sklearn_compat.py b/test/test_sklearn_compat.py index 5dbabf98..3e2b9113 100644 --- a/test/test_sklearn_compat.py +++ b/test/test_sklearn_compat.py @@ -15,7 +15,7 @@ import numpy as np from sklearn.model_selection import (cross_val_score, cross_val_predict, train_test_split, KFold) -from sklearn.metrics.scorer import make_scorer +from sklearn.metrics.scorer import get_scorer from sklearn.utils.testing import _get_args from test.test_utils import (metric_learners, ids_metric_learners, mock_preprocessor, tuples_learners, @@ -107,7 +107,7 @@ def test_various_scoring_on_tuples_learners(estimator, build_dataset, # scores that need a predict function: every tuples learner should have a # predict function (whether the pair is of positive samples or negative # samples) - for scoring in ['accuracy', 'f1', 'precision', 'recall']: + for scoring in ['accuracy', 'f1']: check_score_is_finite(scoring, estimator, input_data, labels) # scores that need a predict_proba: if hasattr(estimator, "predict_proba"): @@ -115,17 +115,16 @@ def test_various_scoring_on_tuples_learners(estimator, build_dataset, check_score_is_finite(scoring, estimator, input_data, labels) # scores that need a decision_function: every tuples learner should have a # decision function (the metric between points) - for scoring in ['roc_auc', 'average_precision', 'average_recall']: + for scoring in ['roc_auc', 'average_precision', 'precision', 'recall']: check_score_is_finite(scoring, estimator, input_data, labels) def check_score_is_finite(scoring, estimator, input_data, labels): + estimator = clone(estimator) assert np.isfinite(cross_val_score(estimator, input_data, labels, scoring=scoring)).all() - assert np.isfinite(cross_val_predict(estimator, - input_data, labels, - scoring=scoring)).all() - assert np.isfinite(make_scorer(scoring)(input_data, labels)) + estimator.fit(input_data, labels) + assert np.isfinite(get_scorer(scoring)(estimator, input_data, labels)) @pytest.mark.parametrize('estimator, build_dataset', tuples_learners, From f95c456a8cc1de441ff0fc21131c2472edbaae27 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Wed, 6 Feb 2019 09:34:24 +0100 Subject: [PATCH 03/41] Add support for LSML and more generally quadruplets --- metric_learn/base_metric.py | 20 +++++++-- 
metric_learn/lsml.py | 24 +++++++--- test/test_pairs_classifiers.py | 8 ++-- test/test_quadruplets_classifiers.py | 65 ++++++++++++++++++++++++++++ 4 files changed, 102 insertions(+), 15 deletions(-) create mode 100644 test/test_quadruplets_classifiers.py diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index 1cf1ec36..6711efb8 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -2,7 +2,7 @@ from scipy.spatial.distance import euclidean from sklearn.base import BaseEstimator from sklearn.utils.validation import _is_arraylike, check_is_fitted -from sklearn.metrics import roc_auc_score +from sklearn.metrics import roc_auc_score, accuracy_score import numpy as np from abc import ABCMeta, abstractmethod import six @@ -317,7 +317,7 @@ def predict(self, pairs): y_predicted : `numpy.ndarray` of floats, shape=(n_constraints,) The predicted learned metric value between samples in every pair. """ - check_is_fitted(self, 'threshold_') + check_is_fitted(self, ['threshold_', 'transformer_']) return - 2 * (self.decision_function(pairs) > self.threshold_) + 1 def decision_function(self, pairs): @@ -401,6 +401,7 @@ def predict(self, quadruplets): prediction : `numpy.ndarray` of floats, shape=(n_constraints,) Predictions of the ordering of pairs, for each quadruplet. """ + check_is_fitted(self, 'transformer_') quadruplets = check_input(quadruplets, type_of_inputs='tuples', preprocessor=self.preprocessor_, estimator=self, tuple_size=self._tuple_size) @@ -443,11 +444,22 @@ def score(self, quadruplets, y=None): points, or 2D array of indices of quadruplets if the metric learner uses a preprocessor. - y : Ignored, for scikit-learn compatibility. + y : array-like, shape=(n_constraints,) or `None` + Labels of constraints. y[i] should be 1 if + d(pairs[i, 0], X[i, 1]) is wanted to be larger than + d(X[i, 2], X[i, 3]), and -1 if it is wanted to be smaller. If None, + `y` will be set to `np.ones(quadruplets.shape[0])`, i.e. we want all + first two points to be closer than the last two points in each + quadruplet. Returns ------- score : float The quadruplets score. """ - return -np.mean(self.predict(quadruplets)) + quadruplets = check_input(quadruplets, y, type_of_inputs='tuples', + preprocessor=self.preprocessor_, + estimator=self, tuple_size=self._tuple_size) + if y is None: + y = np.ones(quadruplets.shape[0]) + return accuracy_score(y, self.predict(quadruplets)) diff --git a/metric_learn/lsml.py b/metric_learn/lsml.py index 312990ab..b1f65c48 100644 --- a/metric_learn/lsml.py +++ b/metric_learn/lsml.py @@ -46,9 +46,15 @@ def __init__(self, tol=1e-3, max_iter=1000, prior=None, verbose=False, super(_BaseLSML, self).__init__(preprocessor) def _fit(self, quadruplets, y=None, weights=None): - quadruplets = self._prepare_inputs(quadruplets, + quadruplets = self._prepare_inputs(quadruplets, y, type_of_inputs='tuples') - + if y is None: + y = np.ones(quadruplets.shape[0]) + # we swap the quadruplets where the label is -1 since they are not in + # the right order + quadruplets_to_swap = quadruplets[y == -1] + quadruplets[y == -1] = np.column_stack([quadruplets_to_swap[:, 2:], + quadruplets_to_swap[:, :2]]) # check to make sure that no two constrained vectors are identical vab = quadruplets[:, 0, :] - quadruplets[:, 1, :] vcd = quadruplets[:, 2, :] - quadruplets[:, 3, :] @@ -144,7 +150,7 @@ class LSML(_BaseLSML, _QuadrupletsClassifierMixin): metric (See function `transformer_from_metric`.) 
""" - def fit(self, quadruplets, weights=None): + def fit(self, quadruplets, y=None, weights=None): """Learn the LSML model. Parameters @@ -152,10 +158,14 @@ def fit(self, quadruplets, weights=None): quadruplets : array-like, shape=(n_constraints, 4, n_features) or (n_constraints, 4) 3D array-like of quadruplets of points or 2D array of quadruplets of - indicators. In order to supervise the algorithm in the right way, we - should have the four samples ordered in a way such that: - d(pairs[i, 0],X[i, 1]) < d(X[i, 2], X[i, 3]) for all 0 <= i < - n_constraints. + indicators. + y : array-like, shape=(n_constraints,) or `None` + Labels of constraints. y[i] should be 1 if + d(pairs[i, 0], X[i, 1]) is wanted to be larger than + d(X[i, 2], X[i, 3]), and -1 if it is wanted to be smaller. If None, + `y` will be set to `np.ones(quadruplets.shape[0])`, i.e. we want to + put all first two points closer than the last two points in each + quadruplet. weights : (n_constraints,) array of floats, optional scale factor for each constraint diff --git a/test/test_pairs_classifiers.py b/test/test_pairs_classifiers.py index c497f64b..34d107ea 100644 --- a/test/test_pairs_classifiers.py +++ b/test/test_pairs_classifiers.py @@ -11,8 +11,8 @@ @pytest.mark.parametrize('with_preprocessor', [True, False]) @pytest.mark.parametrize('estimator, build_dataset', pairs_learners, ids=ids_pairs_learners) -def test_predict_monotonous(estimator, build_dataset, - with_preprocessor): +def test_predict_only_one_or_minus_one(estimator, build_dataset, + with_preprocessor): """Test that all predicted values are either +1 or -1""" input_data, labels, preprocessor, _ = build_dataset(with_preprocessor) estimator = clone(estimator) @@ -21,7 +21,7 @@ def test_predict_monotonous(estimator, build_dataset, pairs_train, pairs_test, y_train, y_test = train_test_split(input_data, labels) estimator.fit(pairs_train, y_train) - predictions = estimator.predict(pairs_test, y_test) + predictions = estimator.predict(pairs_test) assert np.isin(predictions, [-1, 1]).all() @@ -29,7 +29,7 @@ def test_predict_monotonous(estimator, build_dataset, @pytest.mark.parametrize('estimator, build_dataset', pairs_learners, ids=ids_pairs_learners) def test_predict_monotonous(estimator, build_dataset, - with_preprocessor): + with_preprocessor): """Test that there is a threshold distance separating points labeled as similar and points labeled as dissimilar """ input_data, labels, preprocessor, _ = build_dataset(with_preprocessor) diff --git a/test/test_quadruplets_classifiers.py b/test/test_quadruplets_classifiers.py new file mode 100644 index 00000000..b272a52d --- /dev/null +++ b/test/test_quadruplets_classifiers.py @@ -0,0 +1,65 @@ +import pytest +from sklearn.exceptions import NotFittedError +from sklearn.model_selection import train_test_split + +from test.test_utils import quadruplets_learners, ids_quadruplets_learners +from sklearn.utils.testing import set_random_state +from sklearn import clone +import numpy as np + + +@pytest.mark.parametrize('with_preprocessor', [True, False]) +@pytest.mark.parametrize('estimator, build_dataset', quadruplets_learners, + ids=ids_quadruplets_learners) +def test_predict_only_one_or_minus_one(estimator, build_dataset, + with_preprocessor): + """Test that all predicted values are either +1 or -1""" + input_data, labels, preprocessor, _ = build_dataset(with_preprocessor) + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + set_random_state(estimator) + (quadruplets_train, + quadruplets_test, y_train, 
y_test) = train_test_split(input_data, labels) + estimator.fit(quadruplets_train, y_train) + predictions = estimator.predict(quadruplets_test) + assert np.isin(predictions, [-1, 1]).all() + + +@pytest.mark.parametrize('with_preprocessor', [True, False]) +@pytest.mark.parametrize('estimator, build_dataset', quadruplets_learners, + ids=ids_quadruplets_learners) +def test_predict_monotonous(estimator, build_dataset, + with_preprocessor): + """Test that there is a threshold distance separating points labeled as + similar and points labeled as dissimilar """ + input_data, labels, preprocessor, _ = build_dataset(with_preprocessor) + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + set_random_state(estimator) + (quadruplets_train, + quadruplets_test, y_train, y_test) = train_test_split(input_data, labels) + estimator.fit(quadruplets_train, y_train) + distances = estimator.score_quadruplets(quadruplets_test) + predictions = estimator.predict(quadruplets_test) + min_dissimilar = np.min(distances[predictions == -1]) + max_similar = np.max(distances[predictions == 1]) + assert max_similar <= min_dissimilar + separator = np.mean([min_dissimilar, max_similar]) + assert (predictions[distances > separator] == -1).all() + assert (predictions[distances < separator] == 1).all() + + +@pytest.mark.parametrize('with_preprocessor', [True, False]) +@pytest.mark.parametrize('estimator, build_dataset', quadruplets_learners, + ids=ids_quadruplets_learners) +def test_raise_not_fitted_error_if_not_fitted(estimator, build_dataset, + with_preprocessor): + """Test that a NotFittedError is raised if someone tries to predict and + the metric learner has not been fitted.""" + input_data, labels, preprocessor, _ = build_dataset(with_preprocessor) + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + set_random_state(estimator) + with pytest.raises(NotFittedError): + estimator.predict(input_data) + From 9ffe8f74e6cd8f926770a5aaf16d05fbc37a059a Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Wed, 6 Feb 2019 16:32:52 +0100 Subject: [PATCH 04/41] Make CalibratedClassifierCV work (for preprocessor case) thanks to classes_ --- metric_learn/base_metric.py | 1 + metric_learn/itml.py | 5 +++++ metric_learn/mmc.py | 5 +++++ metric_learn/sdml.py | 5 +++++ test/test_sklearn_compat.py | 29 ++++++++++++++++++++++++++++- 5 files changed, 44 insertions(+), 1 deletion(-) diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index 6711efb8..079968bb 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -296,6 +296,7 @@ def get_mahalanobis_matrix(self): class _PairsClassifierMixin(BaseMetricLearner): + classes_ = [-1, 1] _tuple_size = 2 # number of points in a tuple, 2 for pairs def predict(self, pairs): diff --git a/metric_learn/itml.py b/metric_learn/itml.py index 2a9e987a..aa75463e 100644 --- a/metric_learn/itml.py +++ b/metric_learn/itml.py @@ -153,6 +153,11 @@ class ITML(_BaseITML, _PairsClassifierMixin): If the distance metric between two points is lower than this threshold, points will be classified as similar, otherwise they will be classified as dissimilar. + + classes_ : `list` + The possible labels of the pairs `LSML` can fit on. `classes_ = [-1, 1]`, + where -1 means points in a pair are dissimilar (negative label), and 1 + means they are similar (positive label). 
""" def fit(self, pairs, y, bounds=None): diff --git a/metric_learn/mmc.py b/metric_learn/mmc.py index 2ddcced2..138b1d71 100644 --- a/metric_learn/mmc.py +++ b/metric_learn/mmc.py @@ -364,6 +364,11 @@ class MMC(_BaseMMC, _PairsClassifierMixin): If the distance metric between two points is lower than this threshold, points will be classified as similar, otherwise they will be classified as dissimilar. + + classes_ : `list` + The possible labels of the pairs `MMC` can fit on. `classes_ = [-1, 1]`, + where -1 means points in a pair are dissimilar (negative label), and 1 + means they are similar (positive label). """ def fit(self, pairs, y): diff --git a/metric_learn/sdml.py b/metric_learn/sdml.py index 096dc0ed..536bd28a 100644 --- a/metric_learn/sdml.py +++ b/metric_learn/sdml.py @@ -86,6 +86,11 @@ class SDML(_BaseSDML, _PairsClassifierMixin): If the distance metric between two points is lower than this threshold, points will be classified as similar, otherwise they will be classified as dissimilar. + + classes_ : `list` + The possible labels of the pairs `SDML` can fit on. `classes_ = [-1, 1]`, + where -1 means points in a pair are dissimilar (negative label), and 1 + means they are similar (positive label). """ def fit(self, pairs, y): diff --git a/test/test_sklearn_compat.py b/test/test_sklearn_compat.py index 3e2b9113..096fbf37 100644 --- a/test/test_sklearn_compat.py +++ b/test/test_sklearn_compat.py @@ -1,5 +1,6 @@ import pytest import unittest +from sklearn.calibration import CalibratedClassifierCV from sklearn.utils.estimator_checks import check_estimator from sklearn.base import TransformerMixin from sklearn.pipeline import make_pipeline @@ -19,7 +20,8 @@ from sklearn.utils.testing import _get_args from test.test_utils import (metric_learners, ids_metric_learners, mock_preprocessor, tuples_learners, - ids_tuples_learners) + ids_tuples_learners, pairs_learners, + ids_pairs_learners) # Wrap the _Supervised methods with a deterministic wrapper for testing. @@ -89,6 +91,31 @@ def test_mmc(self): # ---------------------- Test scikit-learn compatibility ---------------------- +@pytest.mark.parametrize('with_preprocessor', + [True, + # TODO: uncomment the below line as soon as + # https://github.com/scikit-learn/scikit-learn/ + # issues/13077 is solved: + # False, + ]) +@pytest.mark.parametrize('estimator, build_dataset', pairs_learners, + ids=ids_pairs_learners) +def test_calibrated_classifier_CV(estimator, build_dataset, + with_preprocessor): + """Tests that metric-learn tuples estimators' work with scikit-learn's + CalibratedClassifierCV. 
+ """ + input_data, labels, preprocessor, _ = build_dataset(with_preprocessor) + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + set_random_state(estimator) + calibrated_clf = CalibratedClassifierCV(estimator) + + # test fit and predict_proba + calibrated_clf.fit(input_data, labels) + calibrated_clf.predict_proba(input_data) + + @pytest.mark.parametrize('with_preprocessor', [True, False]) @pytest.mark.parametrize('estimator, build_dataset', tuples_learners, ids=ids_tuples_learners) From 3354fb1206f8e9a85051057ad6197048582f0c47 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Thu, 7 Feb 2019 13:23:57 +0100 Subject: [PATCH 05/41] Fix some tests and PEP8 errors --- metric_learn/base_metric.py | 17 ++++++++++++----- test/test_quadruplets_classifiers.py | 26 +------------------------- 2 files changed, 13 insertions(+), 30 deletions(-) diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index 079968bb..61582977 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -374,8 +374,10 @@ def score(self, pairs, y): def set_default_threshold(self, pairs, y): """Returns a threshold that is the mean between the similar metrics mean, and the dissimilar metrics mean""" - similar_threshold = np.mean(self.decision_function(pairs[y==1])) - dissimilar_threshold = np.mean(self.decision_function(pairs[y==1])) + similar_threshold = np.mean(self.decision_function( + pairs[(y == 1).ravel()])) + dissimilar_threshold = np.mean(self.decision_function( + pairs[(y == -1).ravel()])) self.threshold_ = np.mean([similar_threshold, dissimilar_threshold]) @@ -458,9 +460,14 @@ def score(self, quadruplets, y=None): score : float The quadruplets score. """ - quadruplets = check_input(quadruplets, y, type_of_inputs='tuples', - preprocessor=self.preprocessor_, - estimator=self, tuple_size=self._tuple_size) + checked_input = check_input(quadruplets, y, type_of_inputs='tuples', + preprocessor=self.preprocessor_, + estimator=self, tuple_size=self._tuple_size) + # checked_input will be of the form `(checked_quadruplets, checked_y)` if + # `y` is not None, or just `checked_quadruplets` if `y` is None + quadruplets = checked_input if y is None else checked_input[0] if y is None: y = np.ones(quadruplets.shape[0]) + else: + y = checked_input[1] return accuracy_score(y, self.predict(quadruplets)) diff --git a/test/test_quadruplets_classifiers.py b/test/test_quadruplets_classifiers.py index b272a52d..56680476 100644 --- a/test/test_quadruplets_classifiers.py +++ b/test/test_quadruplets_classifiers.py @@ -25,35 +25,11 @@ def test_predict_only_one_or_minus_one(estimator, build_dataset, assert np.isin(predictions, [-1, 1]).all() -@pytest.mark.parametrize('with_preprocessor', [True, False]) -@pytest.mark.parametrize('estimator, build_dataset', quadruplets_learners, - ids=ids_quadruplets_learners) -def test_predict_monotonous(estimator, build_dataset, - with_preprocessor): - """Test that there is a threshold distance separating points labeled as - similar and points labeled as dissimilar """ - input_data, labels, preprocessor, _ = build_dataset(with_preprocessor) - estimator = clone(estimator) - estimator.set_params(preprocessor=preprocessor) - set_random_state(estimator) - (quadruplets_train, - quadruplets_test, y_train, y_test) = train_test_split(input_data, labels) - estimator.fit(quadruplets_train, y_train) - distances = estimator.score_quadruplets(quadruplets_test) - predictions = estimator.predict(quadruplets_test) - min_dissimilar = 
np.min(distances[predictions == -1]) - max_similar = np.max(distances[predictions == 1]) - assert max_similar <= min_dissimilar - separator = np.mean([min_dissimilar, max_similar]) - assert (predictions[distances > separator] == -1).all() - assert (predictions[distances < separator] == 1).all() - - @pytest.mark.parametrize('with_preprocessor', [True, False]) @pytest.mark.parametrize('estimator, build_dataset', quadruplets_learners, ids=ids_quadruplets_learners) def test_raise_not_fitted_error_if_not_fitted(estimator, build_dataset, - with_preprocessor): + with_preprocessor): """Test that a NotFittedError is raised if someone tries to predict and the metric learner has not been fitted.""" input_data, labels, preprocessor, _ = build_dataset(with_preprocessor) From 12cb5f1c2e618aa1884c6f0ae46b60ea62164820 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 19 Feb 2019 12:20:07 +0100 Subject: [PATCH 06/41] change the sign in decision function --- metric_learn/base_metric.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index 61582977..1e087b53 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -342,7 +342,7 @@ def decision_function(self, pairs): pairs = check_input(pairs, type_of_inputs='tuples', preprocessor=self.preprocessor_, estimator=self, tuple_size=self._tuple_size) - return self.score_pairs(pairs) + return - self.score_pairs(pairs) def score(self, pairs, y): """Computes score of pairs similarity prediction. @@ -429,8 +429,8 @@ def decision_function(self, quadruplets): decision_function : `numpy.ndarray` of floats, shape=(n_constraints,) Metric differences. """ - return (self.score_pairs(quadruplets[:, :2]) - - self.score_pairs(quadruplets[:, 2:])) + return (self.score_pairs(quadruplets[:, 2:]) - + self.score_pairs(quadruplets[:, :2])) def score(self, quadruplets, y=None): """Computes score on input quadruplets From dd8113e39e72cfa706bba43e30f305a3e8283121 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 19 Feb 2019 14:07:02 +0100 Subject: [PATCH 07/41] Add docstring for threshold_ and classes_ in the base _PairsClassifier class --- metric_learn/base_metric.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index 1e087b53..da5b4598 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -295,6 +295,19 @@ def get_mahalanobis_matrix(self): class _PairsClassifierMixin(BaseMetricLearner): + """ + Attributes + ---------- + threshold_ : `float` + If the distance metric between two points is lower than this threshold, + points will be classified as similar, otherwise they will be + classified as dissimilar. + + classes_ : `list` + The possible labels of the pairs `MMC` can fit on. `classes_ = [-1, 1]`, + where -1 means points in a pair are dissimilar (negative label), and 1 + means they are similar (positive label). 
+ """ classes_ = [-1, 1] _tuple_size = 2 # number of points in a tuple, 2 for pairs From 1c8cd290c71134409ab2641bf170b05a080febf1 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 19 Feb 2019 17:30:18 +0100 Subject: [PATCH 08/41] remove quadruplets from the test with scikit learn custom scorings --- test/test_sklearn_compat.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_sklearn_compat.py b/test/test_sklearn_compat.py index 096fbf37..e9f4b546 100644 --- a/test/test_sklearn_compat.py +++ b/test/test_sklearn_compat.py @@ -117,8 +117,8 @@ def test_calibrated_classifier_CV(estimator, build_dataset, @pytest.mark.parametrize('with_preprocessor', [True, False]) -@pytest.mark.parametrize('estimator, build_dataset', tuples_learners, - ids=ids_tuples_learners) +@pytest.mark.parametrize('estimator, build_dataset', pairs_learners, + ids=ids_pairs_learners) def test_various_scoring_on_tuples_learners(estimator, build_dataset, with_preprocessor): """Tests that metric-learn estimators' scoring returns something finite, From d12729ab53372e03a30ffc0e4ef826d431f2422f Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Wed, 20 Feb 2019 11:14:55 +0100 Subject: [PATCH 09/41] Remove argument y in quadruplets learners and lsml --- metric_learn/base_metric.py | 22 ++-------------------- metric_learn/lsml.py | 26 ++++++++------------------ test/test_quadruplets_classifiers.py | 2 +- 3 files changed, 11 insertions(+), 39 deletions(-) diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index da5b4598..40a460d2 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -445,7 +445,7 @@ def decision_function(self, quadruplets): return (self.score_pairs(quadruplets[:, 2:]) - self.score_pairs(quadruplets[:, :2])) - def score(self, quadruplets, y=None): + def score(self, quadruplets): """Computes score on input quadruplets Returns the accuracy score of the following classification task: a record @@ -460,27 +460,9 @@ def score(self, quadruplets, y=None): points, or 2D array of indices of quadruplets if the metric learner uses a preprocessor. - y : array-like, shape=(n_constraints,) or `None` - Labels of constraints. y[i] should be 1 if - d(pairs[i, 0], X[i, 1]) is wanted to be larger than - d(X[i, 2], X[i, 3]), and -1 if it is wanted to be smaller. If None, - `y` will be set to `np.ones(quadruplets.shape[0])`, i.e. we want all - first two points to be closer than the last two points in each - quadruplet. - Returns ------- score : float The quadruplets score. 
""" - checked_input = check_input(quadruplets, y, type_of_inputs='tuples', - preprocessor=self.preprocessor_, - estimator=self, tuple_size=self._tuple_size) - # checked_input will be of the form `(checked_quadruplets, checked_y)` if - # `y` is not None, or just `checked_quadruplets` if `y` is None - quadruplets = checked_input if y is None else checked_input[0] - if y is None: - y = np.ones(quadruplets.shape[0]) - else: - y = checked_input[1] - return accuracy_score(y, self.predict(quadruplets)) + return - np.mean(self.predict(quadruplets)) diff --git a/metric_learn/lsml.py b/metric_learn/lsml.py index b1f65c48..536719ba 100644 --- a/metric_learn/lsml.py +++ b/metric_learn/lsml.py @@ -45,16 +45,10 @@ def __init__(self, tol=1e-3, max_iter=1000, prior=None, verbose=False, self.verbose = verbose super(_BaseLSML, self).__init__(preprocessor) - def _fit(self, quadruplets, y=None, weights=None): - quadruplets = self._prepare_inputs(quadruplets, y, + def _fit(self, quadruplets, weights=None): + quadruplets = self._prepare_inputs(quadruplets, type_of_inputs='tuples') - if y is None: - y = np.ones(quadruplets.shape[0]) - # we swap the quadruplets where the label is -1 since they are not in - # the right order - quadruplets_to_swap = quadruplets[y == -1] - quadruplets[y == -1] = np.column_stack([quadruplets_to_swap[:, 2:], - quadruplets_to_swap[:, :2]]) + # check to make sure that no two constrained vectors are identical vab = quadruplets[:, 0, :] - quadruplets[:, 1, :] vcd = quadruplets[:, 2, :] - quadruplets[:, 3, :] @@ -150,7 +144,7 @@ class LSML(_BaseLSML, _QuadrupletsClassifierMixin): metric (See function `transformer_from_metric`.) """ - def fit(self, quadruplets, y=None, weights=None): + def fit(self, quadruplets, weights=None): """Learn the LSML model. Parameters @@ -158,14 +152,10 @@ def fit(self, quadruplets, y=None, weights=None): quadruplets : array-like, shape=(n_constraints, 4, n_features) or (n_constraints, 4) 3D array-like of quadruplets of points or 2D array of quadruplets of - indicators. - y : array-like, shape=(n_constraints,) or `None` - Labels of constraints. y[i] should be 1 if - d(pairs[i, 0], X[i, 1]) is wanted to be larger than - d(X[i, 2], X[i, 3]), and -1 if it is wanted to be smaller. If None, - `y` will be set to `np.ones(quadruplets.shape[0])`, i.e. we want to - put all first two points closer than the last two points in each - quadruplet. + indicators. In order to supervise the algorithm in the right way, we + should have the four samples ordered in a way such that: + d(pairs[i, 0],X[i, 1]) < d(X[i, 2], X[i, 3]) for all 0 <= i < + n_constraints. 
weights : (n_constraints,) array of floats, optional scale factor for each constraint diff --git a/test/test_quadruplets_classifiers.py b/test/test_quadruplets_classifiers.py index 56680476..ee6ed7eb 100644 --- a/test/test_quadruplets_classifiers.py +++ b/test/test_quadruplets_classifiers.py @@ -20,7 +20,7 @@ def test_predict_only_one_or_minus_one(estimator, build_dataset, set_random_state(estimator) (quadruplets_train, quadruplets_test, y_train, y_test) = train_test_split(input_data, labels) - estimator.fit(quadruplets_train, y_train) + estimator.fit(quadruplets_train) predictions = estimator.predict(quadruplets_test) assert np.isin(predictions, [-1, 1]).all() From dc9e21d32d0c1ed3560052337b8a41e013e3a6f2 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Wed, 20 Feb 2019 11:32:45 +0100 Subject: [PATCH 10/41] FIX fix docstrings of decision functions --- metric_learn/base_metric.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index 40a460d2..9e9d0e7e 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -335,10 +335,12 @@ def predict(self, pairs): return - 2 * (self.decision_function(pairs) > self.threshold_) + 1 def decision_function(self, pairs): - """Returns the learned metric between input pairs. + """Returns the decision function used to classify the pairs. - Returns the learned metric value between samples in every pair. It should - ideally be low for similar samples and high for dissimilar samples. + Returns the opposite of the learned metric value between samples in every + pair. Hence it should ideally be low for dissimilar samples and high for + similar samples. This is the decision function that is used to classify + pairs as similar (+1), or dissimilar (-1). Parameters ---------- @@ -350,7 +352,7 @@ def decision_function(self, pairs): Returns ------- y_predicted : `numpy.ndarray` of floats, shape=(n_constraints,) - The predicted learned metric value between samples in every pair. + The predicted decision function value for each pair. """ pairs = check_input(pairs, type_of_inputs='tuples', preprocessor=self.preprocessor_, @@ -426,8 +428,12 @@ def predict(self, quadruplets): def decision_function(self, quadruplets): """Predicts differences between sample distances in input quadruplets. - For each quadruplet of samples, computes the difference between the learned - metric of the first pair minus the learned metric of the second pair. + For each quadruplet in the samples, computes the difference between the + learned metric of the second pair minus the learned metric of the first + pair. The higher it is, the more probable it is that the pairs in the + quadruplet are presented in the right order, i.e. that the label of the + quadruplet is 1. The lower it is, the more probable it is that the label of + the quadruplet is -1. 
Parameters ---------- From 402729fbc3cd24eadb002d3880ad7ff3af71bb1b Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Wed, 20 Feb 2019 15:18:11 +0100 Subject: [PATCH 11/41] FIX the threshold by taking the opposite (to be adapted to the decision function) --- metric_learn/base_metric.py | 12 ++++++------ metric_learn/itml.py | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index 9e9d0e7e..b7927f38 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -332,7 +332,7 @@ def predict(self, pairs): The predicted learned metric value between samples in every pair. """ check_is_fitted(self, ['threshold_', 'transformer_']) - return - 2 * (self.decision_function(pairs) > self.threshold_) + 1 + return 2 * (self.decision_function(pairs) > self.threshold_) - 1 def decision_function(self, pairs): """Returns the decision function used to classify the pairs. @@ -387,13 +387,13 @@ def score(self, pairs, y): return roc_auc_score(y, self.decision_function(pairs)) def set_default_threshold(self, pairs, y): - """Returns a threshold that is the mean between the similar metrics - mean, and the dissimilar metrics mean""" - similar_threshold = np.mean(self.decision_function( + """Returns a threshold that is the opposite of the mean between the similar + metrics mean and the dissimilar metrics mean""" + similar_threshold = np.mean(self.score_pairs( pairs[(y == 1).ravel()])) - dissimilar_threshold = np.mean(self.decision_function( + dissimilar_threshold = np.mean(self.score_pairs( pairs[(y == -1).ravel()])) - self.threshold_ = np.mean([similar_threshold, dissimilar_threshold]) + self.threshold_ = - np.mean([similar_threshold, dissimilar_threshold]) class _QuadrupletsClassifierMixin(BaseMetricLearner): diff --git a/metric_learn/itml.py b/metric_learn/itml.py index aa75463e..a40476c7 100644 --- a/metric_learn/itml.py +++ b/metric_learn/itml.py @@ -187,7 +187,7 @@ def fit(self, pairs, y, bounds=None): Returns the instance. 
""" self._fit(pairs, y, bounds=bounds) - self.threshold_ = np.mean(self.bounds_) + self.threshold_ = - np.mean(self.bounds_) return self From aaac3deb37e6dd0d5b3d4e67443ac6e318dcc874 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Thu, 21 Feb 2019 09:38:49 +0100 Subject: [PATCH 12/41] Fix tests to have no y for quadruplets' estimator fit --- test/test_mahalanobis_mixin.py | 38 +++++----- test/test_pairs_classifiers.py | 3 +- test/test_sklearn_compat.py | 131 +++++++++++++++++++-------------- test/test_utils.py | 62 ++++++++++------ 4 files changed, 140 insertions(+), 94 deletions(-) diff --git a/test/test_mahalanobis_mixin.py b/test/test_mahalanobis_mixin.py index 1e555e73..a85d9e8f 100644 --- a/test/test_mahalanobis_mixin.py +++ b/test/test_mahalanobis_mixin.py @@ -11,7 +11,8 @@ from metric_learn._util import make_context -from test.test_utils import ids_metric_learners, metric_learners +from test.test_utils import (ids_metric_learners, metric_learners, + make_args_inc_quadruplets) RNG = check_random_state(0) @@ -25,7 +26,7 @@ def test_score_pairs_pairwise(estimator, build_dataset): X = X[:n_samples] model = clone(estimator) set_random_state(model) - model.fit(input_data, labels) + model.fit(*make_args_inc_quadruplets(estimator, input_data, labels)) pairwise = model.score_pairs(np.array(list(product(X, X))))\ .reshape(n_samples, n_samples) @@ -49,7 +50,7 @@ def test_score_pairs_toy_example(estimator, build_dataset): X = X[:n_samples] model = clone(estimator) set_random_state(model) - model.fit(input_data, labels) + model.fit(*make_args_inc_quadruplets(estimator, input_data, labels)) pairs = np.stack([X[:10], X[10:20]], axis=1) embedded_pairs = pairs.dot(model.transformer_.T) distances = np.sqrt(np.sum((embedded_pairs[:, 1] - @@ -65,7 +66,7 @@ def test_score_pairs_finite(estimator, build_dataset): input_data, labels, _, X = build_dataset() model = clone(estimator) set_random_state(model) - model.fit(input_data, labels) + model.fit(*make_args_inc_quadruplets(estimator, input_data, labels)) pairs = np.array(list(product(X, X))) assert np.isfinite(model.score_pairs(pairs)).all() @@ -79,7 +80,7 @@ def test_score_pairs_dim(estimator, build_dataset): input_data, labels, _, X = build_dataset() model = clone(estimator) set_random_state(model) - model.fit(input_data, labels) + model.fit(*make_args_inc_quadruplets(estimator, input_data, labels)) tuples = np.array(list(product(X, X))) assert model.score_pairs(tuples).shape == (tuples.shape[0],) context = make_context(estimator) @@ -110,7 +111,7 @@ def test_embed_toy_example(estimator, build_dataset): X = X[:n_samples] model = clone(estimator) set_random_state(model) - model.fit(input_data, labels) + model.fit(*make_args_inc_quadruplets(estimator, input_data, labels)) embedded_points = X.dot(model.transformer_.T) assert_array_almost_equal(model.transform(X), embedded_points) @@ -122,7 +123,7 @@ def test_embed_dim(estimator, build_dataset): input_data, labels, _, X = build_dataset() model = clone(estimator) set_random_state(model) - model.fit(input_data, labels) + model.fit(*make_args_inc_quadruplets(estimator, input_data, labels)) assert model.transform(X).shape == X.shape # assert that ValueError is thrown if input shape is 1D @@ -135,8 +136,11 @@ def test_embed_dim(estimator, build_dataset): assert str(raised_error.value) == err_msg # we test that the shape is also OK when doing dimensionality reduction if type(model).__name__ in {'LFDA', 'MLKR', 'NCA', 'RCA'}: + # TODO: + # avoid this enumeration and rather test if hasattr n_components + # 
as soon as we have made the arguments names as such (issue #167) model.set_params(num_dims=2) - model.fit(input_data, labels) + model.fit(*make_args_inc_quadruplets(estimator, input_data, labels)) assert model.transform(X).shape == (X.shape[0], 2) # assert that ValueError is thrown if input shape is 1D with pytest.raises(ValueError) as raised_error: @@ -151,7 +155,7 @@ def test_embed_finite(estimator, build_dataset): input_data, labels, _, X = build_dataset() model = clone(estimator) set_random_state(model) - model.fit(input_data, labels) + model.fit(*make_args_inc_quadruplets(estimator, input_data, labels)) assert np.isfinite(model.transform(X)).all() @@ -162,7 +166,7 @@ def test_embed_is_linear(estimator, build_dataset): input_data, labels, _, X = build_dataset() model = clone(estimator) set_random_state(model) - model.fit(input_data, labels) + model.fit(*make_args_inc_quadruplets(estimator, input_data, labels)) assert_array_almost_equal(model.transform(X[:10] + X[10:20]), model.transform(X[:10]) + model.transform(X[10:20])) @@ -181,7 +185,7 @@ def test_get_metric_equivalent_to_explicit_mahalanobis(estimator, input_data, labels, _, X = build_dataset() model = clone(estimator) set_random_state(model) - model.fit(input_data, labels) + model.fit(*make_args_inc_quadruplets(estimator, input_data, labels)) metric = model.get_metric() n_features = X.shape[1] a, b = (rng.randn(n_features), rng.randn(n_features)) @@ -200,7 +204,7 @@ def test_get_metric_is_pseudo_metric(estimator, build_dataset): input_data, labels, _, X = build_dataset() model = clone(estimator) set_random_state(model) - model.fit(input_data, labels) + model.fit(*make_args_inc_quadruplets(estimator, input_data, labels)) metric = model.get_metric() n_features = X.shape[1] @@ -226,7 +230,7 @@ def test_metric_raises_deprecation_warning(estimator, build_dataset): input_data, labels, _, X = build_dataset() model = clone(estimator) set_random_state(model) - model.fit(input_data, labels) + model.fit(*make_args_inc_quadruplets(estimator, input_data, labels)) with pytest.warns(DeprecationWarning) as raised_warning: model.metric() @@ -243,7 +247,7 @@ def test_get_metric_compatible_with_scikit_learn(estimator, build_dataset): input_data, labels, _, X = build_dataset() model = clone(estimator) set_random_state(model) - model.fit(input_data, labels) + model.fit(*make_args_inc_quadruplets(estimator, input_data, labels)) clustering = DBSCAN(metric=model.get_metric()) clustering.fit(X) @@ -256,7 +260,7 @@ def test_get_squared_metric(estimator, build_dataset): input_data, labels, _, X = build_dataset() model = clone(estimator) set_random_state(model) - model.fit(input_data, labels) + model.fit(*make_args_inc_quadruplets(estimator, input_data, labels)) metric = model.get_metric() n_features = X.shape[1] @@ -276,10 +280,10 @@ def test_transformer_is_2D(estimator, build_dataset): model = clone(estimator) set_random_state(model) # test that it works for X.shape[1] features - model.fit(input_data, labels) + model.fit(*make_args_inc_quadruplets(estimator, input_data, labels)) assert model.transformer_.shape == (X.shape[1], X.shape[1]) # test that it works for 1 feature trunc_data = input_data[..., :1] - model.fit(trunc_data, labels) + model.fit(*make_args_inc_quadruplets(estimator, trunc_data, labels)) assert model.transformer_.shape == (1, 1) # the transformer must be 2D diff --git a/test/test_pairs_classifiers.py b/test/test_pairs_classifiers.py index 34d107ea..b67e7268 100644 --- a/test/test_pairs_classifiers.py +++ b/test/test_pairs_classifiers.py 
@@ -22,7 +22,8 @@ def test_predict_only_one_or_minus_one(estimator, build_dataset, labels) estimator.fit(pairs_train, y_train) predictions = estimator.predict(pairs_test) - assert np.isin(predictions, [-1, 1]).all() + not_valid = [e for e in predictions if e not in [-1, 1]] + assert len(not_valid) == 0 @pytest.mark.parametrize('with_preprocessor', [True, False]) diff --git a/test/test_sklearn_compat.py b/test/test_sklearn_compat.py index e9f4b546..e14e2cf9 100644 --- a/test/test_sklearn_compat.py +++ b/test/test_sklearn_compat.py @@ -21,7 +21,8 @@ from test.test_utils import (metric_learners, ids_metric_learners, mock_preprocessor, tuples_learners, ids_tuples_learners, pairs_learners, - ids_pairs_learners) + ids_pairs_learners, make_args_inc_quadruplets, + quadruplets_learners) # Wrap the _Supervised methods with a deterministic wrapper for testing. @@ -121,10 +122,11 @@ def test_calibrated_classifier_CV(estimator, build_dataset, ids=ids_pairs_learners) def test_various_scoring_on_tuples_learners(estimator, build_dataset, with_preprocessor): - """Tests that metric-learn estimators' scoring returns something finite, + """Tests that scikit-learn's scoring returns something finite, for other scoring than default scoring. (List of scikit-learn's scores can be found in sklearn.metrics.scorer). For each type of output (predict, predict_proba, decision_function), we test a bunch of scores. + We only test on pairs learners because quadruplets don't have a y argument. """ input_data, labels, preprocessor, _ = build_dataset(with_preprocessor) estimator = clone(estimator) @@ -147,11 +149,11 @@ def test_various_scoring_on_tuples_learners(estimator, build_dataset, def check_score_is_finite(scoring, estimator, input_data, labels): - estimator = clone(estimator) - assert np.isfinite(cross_val_score(estimator, input_data, labels, - scoring=scoring)).all() - estimator.fit(input_data, labels) - assert np.isfinite(get_scorer(scoring)(estimator, input_data, labels)) + estimator = clone(estimator) + assert np.isfinite(cross_val_score(estimator, input_data, labels, + scoring=scoring)).all() + estimator.fit(input_data, labels) + assert np.isfinite(get_scorer(scoring)(estimator, input_data, labels)) @pytest.mark.parametrize('estimator, build_dataset', tuples_learners, @@ -163,9 +165,15 @@ def test_cross_validation_is_finite(estimator, build_dataset): estimator = clone(estimator) estimator.set_params(preprocessor=preprocessor) set_random_state(estimator) - assert np.isfinite(cross_val_score(estimator, input_data, labels)).all() + assert np.isfinite(cross_val_score(estimator, + *make_args_inc_quadruplets(estimator, + input_data, + labels))).all() assert np.isfinite(cross_val_predict(estimator, - input_data, labels)).all() + *make_args_inc_quadruplets(estimator, + input_data, + labels) + )).all() @pytest.mark.parametrize('with_preprocessor', [True, False]) @@ -196,23 +204,25 @@ def test_cross_validation_manual_vs_scikit(estimator, build_dataset, train_mask = np.ones(input_data.shape[0], bool) train_mask[test_slice] = False y_train, y_test = labels[train_mask], labels[test_slice] - estimator.fit(input_data[train_mask], y_train) + estimator.fit(*make_args_inc_quadruplets(estimator, + input_data[train_mask], + y_train)) if hasattr(estimator, "score"): - scores.append(estimator.score(input_data[test_slice], y_test)) + scores.append(estimator.score(*make_args_inc_quadruplets(estimator, + input_data[test_slice], y_test))) if hasattr(estimator, "predict"): predictions[test_slice] = estimator.predict(input_data[test_slice]) 
if hasattr(estimator, "score"): - assert all(scores == cross_val_score(estimator, input_data, labels, - cv=kfold)) + assert all(scores == cross_val_score(estimator, + *make_args_inc_quadruplets(estimator, input_data, labels), cv=kfold)) if hasattr(estimator, "predict"): - assert all(predictions == cross_val_predict(estimator, input_data, - labels, - cv=kfold)) + assert all(predictions == cross_val_predict(estimator, + *make_args_inc_quadruplets(estimator, input_data, labels), cv=kfold)) def check_score(estimator, tuples, y): if hasattr(estimator, "score"): - score = estimator.score(tuples, y) + score = estimator.score(*make_args_inc_quadruplets(estimator, tuples, y)) assert np.isfinite(score) @@ -236,7 +246,7 @@ def test_simple_estimator(estimator, build_dataset, with_preprocessor): estimator.set_params(preprocessor=preprocessor) set_random_state(estimator) - estimator.fit(tuples_train, y_train) + estimator.fit(*make_args_inc_quadruplets(estimator, tuples_train, y_train)) check_score(estimator, tuples_test, y_test) check_predict(estimator, tuples_test) @@ -283,7 +293,9 @@ def test_estimators_fit_returns_self(estimator, build_dataset, input_data, labels, preprocessor, _ = build_dataset(with_preprocessor) estimator = clone(estimator) estimator.set_params(preprocessor=preprocessor) - assert estimator.fit(input_data, labels) is estimator + assert estimator.fit(*make_args_inc_quadruplets(estimator, + input_data, + labels)) is estimator @pytest.mark.parametrize('with_preprocessor', [True, False]) @@ -293,42 +305,53 @@ def test_pipeline_consistency(estimator, build_dataset, with_preprocessor): # Adapted from scikit learn # check that make_pipeline(est) gives same score as est - input_data, y, preprocessor, _ = build_dataset(with_preprocessor) - - def make_random_state(estimator, in_pipeline): - rs = {} - name_estimator = estimator.__class__.__name__ - if name_estimator[-11:] == '_Supervised': - name_param = 'random_state' - if in_pipeline: - name_param = name_estimator.lower() + '__' + name_param - rs[name_param] = check_random_state(0) - return rs + # we do this test on all except quadruplets (since they don't have a y + # in fit): + if estimator.__class__.__name__ not in [e.__class__.__name__ + for (e, _) in + quadruplets_learners]: + input_data, y, preprocessor, _ = build_dataset(with_preprocessor) + + def make_random_state(estimator, in_pipeline): + rs = {} + name_estimator = estimator.__class__.__name__ + if name_estimator[-11:] == '_Supervised': + name_param = 'random_state' + if in_pipeline: + name_param = name_estimator.lower() + '__' + name_param + rs[name_param] = check_random_state(0) + return rs - estimator = clone(estimator) - estimator.set_params(preprocessor=preprocessor) - pipeline = make_pipeline(estimator) - estimator.fit(input_data, y, **make_random_state(estimator, False)) - pipeline.fit(input_data, y, **make_random_state(estimator, True)) - - if hasattr(estimator, 'score'): - result = estimator.score(input_data, y) - result_pipe = pipeline.score(input_data, y) - assert_allclose_dense_sparse(result, result_pipe) - - if hasattr(estimator, 'predict'): - result = estimator.predict(input_data) - result_pipe = pipeline.predict(input_data) - assert_allclose_dense_sparse(result, result_pipe) - - if issubclass(estimator.__class__, TransformerMixin): - if hasattr(estimator, 'transform'): - result = estimator.transform(input_data) - result_pipe = pipeline.transform(input_data) + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + pipeline = 
make_pipeline(estimator) + estimator.fit(*make_args_inc_quadruplets(estimator, input_data, y), + **make_random_state(estimator, False)) + pipeline.fit(*make_args_inc_quadruplets(estimator, input_data, y), + **make_random_state(estimator, True)) + + if hasattr(estimator, 'score'): + result = estimator.score(*make_args_inc_quadruplets(estimator, + input_data, + y)) + result_pipe = pipeline.score(*make_args_inc_quadruplets(estimator, + input_data, + y)) assert_allclose_dense_sparse(result, result_pipe) + if hasattr(estimator, 'predict'): + result = estimator.predict(input_data) + result_pipe = pipeline.predict(input_data) + assert_allclose_dense_sparse(result, result_pipe) + + if issubclass(estimator.__class__, TransformerMixin): + if hasattr(estimator, 'transform'): + result = estimator.transform(input_data) + result_pipe = pipeline.transform(input_data) + assert_allclose_dense_sparse(result, result_pipe) -@pytest.mark.parametrize('with_preprocessor',[True, False]) + +@pytest.mark.parametrize('with_preprocessor', [True, False]) @pytest.mark.parametrize('estimator, build_dataset', metric_learners, ids=ids_metric_learners) def test_dict_unchanged(estimator, build_dataset, with_preprocessor): @@ -339,7 +362,7 @@ def test_dict_unchanged(estimator, build_dataset, with_preprocessor): estimator.set_params(preprocessor=preprocessor) if hasattr(estimator, "num_dims"): estimator.num_dims = 1 - estimator.fit(input_data, labels) + estimator.fit(*make_args_inc_quadruplets(estimator, input_data, labels)) def check_dict(): assert estimator.__dict__ == dict_before, ( @@ -356,7 +379,7 @@ def check_dict(): check_dict() -@pytest.mark.parametrize('with_preprocessor',[True, False]) +@pytest.mark.parametrize('with_preprocessor', [True, False]) @pytest.mark.parametrize('estimator, build_dataset', metric_learners, ids=ids_metric_learners) def test_dont_overwrite_parameters(estimator, build_dataset, @@ -370,7 +393,7 @@ def test_dont_overwrite_parameters(estimator, build_dataset, estimator.num_dims = 1 dict_before_fit = estimator.__dict__.copy() - estimator.fit(input_data, labels) + estimator.fit(*make_args_inc_quadruplets(estimator, input_data, labels)) dict_after_fit = estimator.__dict__ public_keys_after_fit = [key for key in dict_after_fit.keys() diff --git a/test/test_utils.py b/test/test_utils.py index 5e640dbc..38226fef 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -107,23 +107,23 @@ def build_quadruplets(with_preprocessor=False): (SDML(), build_pairs), ] ids_pairs_learners = list(map(lambda x: x.__class__.__name__, - [learner for (learner, _) in - pairs_learners])) - -classifiers = [(Covariance(), build_classification), - (LFDA(), build_classification), - (LMNN(), build_classification), - (NCA(), build_classification), - (RCA(), build_classification), - (ITML_Supervised(max_iter=5), build_classification), - (LSML_Supervised(), build_classification), - (MMC_Supervised(max_iter=5), build_classification), - (RCA_Supervised(num_chunks=10), build_classification), - (SDML_Supervised(), build_classification) - ] + [learner for (learner, _) in + pairs_learners])) + +classifiers = [(Covariance(), build_classification), + (LFDA(), build_classification), + (LMNN(), build_classification), + (NCA(), build_classification), + (RCA(), build_classification), + (ITML_Supervised(max_iter=5), build_classification), + (LSML_Supervised(), build_classification), + (MMC_Supervised(max_iter=5), build_classification), + (RCA_Supervised(num_chunks=10), build_classification), + (SDML_Supervised(), build_classification) + ] 
ids_classifiers = list(map(lambda x: x.__class__.__name__, - [learner for (learner, _) in - classifiers])) + [learner for (learner, _) in + classifiers])) regressors = [(MLKR(), build_regression)] ids_regressors = list(map(lambda x: x.__class__.__name__, @@ -142,6 +142,18 @@ def build_quadruplets(with_preprocessor=False): ids_metric_learners = ids_tuples_learners + ids_supervised_learners +def make_args_inc_quadruplets(estimator, X, y): + """Quadruplets learners have no y in fit, but to write test for all + estimators, it is convenient to have this function, that will return X and y + if the estimator needs a y to fit on, and just X otherwise.""" + if estimator.__class__.__name__ in [e.__class__.__name__ + for (e, _) in + quadruplets_learners]: + return (X,) + else: + return (X, y) + + def mock_preprocessor(indices): """A preprocessor for testing purposes that returns an all ones 3D array """ @@ -839,8 +851,8 @@ class MockMetricLearner(MahalanobisMixin): "or a callable.".format(type(preprocessor))) -@pytest.mark.parametrize('estimator', [ITML(), LSML(), MMC(), SDML()], - ids=['ITML', 'LSML', 'MMC', 'SDML']) +@pytest.mark.parametrize('estimator', [e for (e, _) in tuples_learners], + ids=ids_tuples_learners) def test_error_message_tuple_size(estimator): """Tests that if a tuples learner is not given the good number of points per tuple, it throws an error message""" @@ -850,7 +862,7 @@ def test_error_message_tuple_size(estimator): [[1.9, 5.3], [1., 7.8], [3.2, 1.2]]]) y = [1, 1] with pytest.raises(ValueError) as raised_err: - estimator.fit(invalid_pairs, y) + estimator.fit(*make_args_inc_quadruplets(estimator, invalid_pairs, y)) expected_msg = ("Tuples of {} element(s) expected{}. Got tuples of 3 " "element(s) instead (shape=(2, 3, 2)):\ninput={}.\n" .format(estimator._tuple_size, make_context(estimator), @@ -935,19 +947,25 @@ def make_random_state(estimator): estimator_with_preprocessor = clone(estimator) set_random_state(estimator_with_preprocessor) estimator_with_preprocessor.set_params(preprocessor=X) - estimator_with_preprocessor.fit(indices_train, y_train, + estimator_with_preprocessor.fit(*make_args_inc_quadruplets(estimator, + indices_train, + y_train), **make_random_state(estimator)) estimator_without_preprocessor = clone(estimator) set_random_state(estimator_without_preprocessor) estimator_without_preprocessor.set_params(preprocessor=None) - estimator_without_preprocessor.fit(formed_train, y_train, + estimator_without_preprocessor.fit(*make_args_inc_quadruplets(estimator, + formed_train, + y_train), **make_random_state(estimator)) estimator_with_prep_formed = clone(estimator) set_random_state(estimator_with_prep_formed) estimator_with_prep_formed.set_params(preprocessor=X) - estimator_with_prep_formed.fit(indices_train, y_train, + estimator_with_prep_formed.fit(*make_args_inc_quadruplets(estimator, + indices_train, + y_train), **make_random_state(estimator)) # test prediction methods From e5b1e47b3a35d5718a11fb2da4670dd01f3a1a10 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Thu, 21 Feb 2019 10:06:25 +0100 Subject: [PATCH 13/41] Remove isin to be compatible with old numpy versions --- test/test_quadruplets_classifiers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/test_quadruplets_classifiers.py b/test/test_quadruplets_classifiers.py index ee6ed7eb..2bf36b3f 100644 --- a/test/test_quadruplets_classifiers.py +++ b/test/test_quadruplets_classifiers.py @@ -22,7 +22,8 @@ def test_predict_only_one_or_minus_one(estimator, build_dataset, 
quadruplets_test, y_train, y_test) = train_test_split(input_data, labels) estimator.fit(quadruplets_train) predictions = estimator.predict(quadruplets_test) - assert np.isin(predictions, [-1, 1]).all() + not_valid = [e for e in predictions if e not in [-1, 1]] + assert len(not_valid) == 0 @pytest.mark.parametrize('with_preprocessor', [True, False]) From a0cb3cae896a07b5e73d566a181ed01abe89ed7e Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Thu, 21 Feb 2019 10:49:37 +0100 Subject: [PATCH 14/41] Fix threshold so that it has a positive value and add small test --- metric_learn/base_metric.py | 7 ++++--- metric_learn/mmc.py | 2 +- metric_learn/sdml.py | 2 +- test/test_pairs_classifiers.py | 23 ++++++++++++++++++++++- 4 files changed, 28 insertions(+), 6 deletions(-) diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index b7927f38..80fc7a7f 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -138,6 +138,7 @@ def get_metric(self): use the metric learner's preprocessor, and works on concatenated arrays. """ + class MetricTransformer(six.with_metaclass(ABCMeta)): @abstractmethod @@ -332,7 +333,7 @@ def predict(self, pairs): The predicted learned metric value between samples in every pair. """ check_is_fitted(self, ['threshold_', 'transformer_']) - return 2 * (self.decision_function(pairs) > self.threshold_) - 1 + return 2 * (self.decision_function(pairs) > - self.threshold_) - 1 def decision_function(self, pairs): """Returns the decision function used to classify the pairs. @@ -386,14 +387,14 @@ def score(self, pairs, y): """ return roc_auc_score(y, self.decision_function(pairs)) - def set_default_threshold(self, pairs, y): + def _set_default_threshold(self, pairs, y): """Returns a threshold that is the opposite of the mean between the similar metrics mean and the dissimilar metrics mean""" similar_threshold = np.mean(self.score_pairs( pairs[(y == 1).ravel()])) dissimilar_threshold = np.mean(self.score_pairs( pairs[(y == -1).ravel()])) - self.threshold_ = - np.mean([similar_threshold, dissimilar_threshold]) + self.threshold_ = np.mean([similar_threshold, dissimilar_threshold]) class _QuadrupletsClassifierMixin(BaseMetricLearner): diff --git a/metric_learn/mmc.py b/metric_learn/mmc.py index 138b1d71..3892a969 100644 --- a/metric_learn/mmc.py +++ b/metric_learn/mmc.py @@ -390,7 +390,7 @@ def fit(self, pairs, y): Returns the instance. """ self._fit(pairs, y) - self.set_default_threshold(pairs, y) + self._set_default_threshold(pairs, y) return self diff --git a/metric_learn/sdml.py b/metric_learn/sdml.py index 536bd28a..359e4fe1 100644 --- a/metric_learn/sdml.py +++ b/metric_learn/sdml.py @@ -112,7 +112,7 @@ def fit(self, pairs, y): Returns the instance. 
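# Reading the sign handling introduced above: decision_function(pairs)
# returns the opposite of the learned distance d(a, b), and threshold_ is now
# stored as a positive distance, so the prediction rule
#
#     2 * (decision_function(pairs) > - threshold_) - 1
#
# returns +1 exactly when d(a, b) < threshold_ (the pair is predicted
# similar), and -1 otherwise.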
""" self._fit(pairs, y) - self.set_default_threshold(pairs, y) + self._set_default_threshold(pairs, y) return self diff --git a/test/test_pairs_classifiers.py b/test/test_pairs_classifiers.py index b67e7268..3ff47c18 100644 --- a/test/test_pairs_classifiers.py +++ b/test/test_pairs_classifiers.py @@ -1,4 +1,5 @@ import pytest +from metric_learn.base_metric import _PairsClassifierMixin, MahalanobisMixin from sklearn.exceptions import NotFittedError from sklearn.model_selection import train_test_split @@ -54,7 +55,7 @@ def test_predict_monotonous(estimator, build_dataset, @pytest.mark.parametrize('estimator, build_dataset', pairs_learners, ids=ids_pairs_learners) def test_raise_not_fitted_error_if_not_fitted(estimator, build_dataset, - with_preprocessor): + with_preprocessor): """Test that a NotFittedError is raised if someone tries to predict and the metric learner has not been fitted.""" input_data, labels, preprocessor, _ = build_dataset(with_preprocessor) @@ -64,3 +65,23 @@ def test_raise_not_fitted_error_if_not_fitted(estimator, build_dataset, with pytest.raises(NotFittedError): estimator.predict(input_data) + +class IdentityPairsClassifier(MahalanobisMixin, _PairsClassifierMixin): + """A simple pairs classifier for testing purposes, that will just have + identity as transformer_. + """ + def fit(self, pairs, y): + pairs, y = self._prepare_inputs(pairs, y, + type_of_inputs='tuples') + self.transformer_ = np.atleast_2d(np.identity(pairs.shape[2])) + return self + + +def test_set_default_threshold_toy_example(): + # test that the default threshold has the right value on a toy example + identity_pairs_classifier = IdentityPairsClassifier() + pairs = np.array([[[0.], [1.]], [[1.], [3.]], [[2.], [5.]], [[3.], [7.]]]) + y = np.array([1, 1, -1, -1]) + identity_pairs_classifier.fit(pairs, y) + identity_pairs_classifier._set_default_threshold(pairs, y) + assert identity_pairs_classifier.threshold_ == 2.5 From 8d5fc501fc40daa53fb2ac83b55b65b27455d5a6 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Thu, 21 Feb 2019 11:01:18 +0100 Subject: [PATCH 15/41] Fix threshold for itml --- metric_learn/itml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metric_learn/itml.py b/metric_learn/itml.py index a40476c7..aa75463e 100644 --- a/metric_learn/itml.py +++ b/metric_learn/itml.py @@ -187,7 +187,7 @@ def fit(self, pairs, y, bounds=None): Returns the instance. 
""" self._fit(pairs, y, bounds=bounds) - self.threshold_ = - np.mean(self.bounds_) + self.threshold_ = np.mean(self.bounds_) return self From 0f14b251cd18d622ebff98ac10a85a49c0124528 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Mon, 4 Mar 2019 16:23:10 +0100 Subject: [PATCH 16/41] FEAT: Add calibrate_threshold and tests --- metric_learn/base_metric.py | 151 ++++++++++++++++++++- test/test_pairs_classifiers.py | 235 ++++++++++++++++++++++++++++++++- 2 files changed, 381 insertions(+), 5 deletions(-) diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index 80fc7a7f..60c6859f 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -1,8 +1,7 @@ -from numpy.linalg import cholesky -from scipy.spatial.distance import euclidean from sklearn.base import BaseEstimator +from sklearn.utils.extmath import stable_cumsum from sklearn.utils.validation import _is_arraylike, check_is_fitted -from sklearn.metrics import roc_auc_score, accuracy_score +from sklearn.metrics import roc_auc_score, precision_recall_curve, roc_curve import numpy as np from abc import ABCMeta, abstractmethod import six @@ -333,7 +332,7 @@ def predict(self, pairs): The predicted learned metric value between samples in every pair. """ check_is_fitted(self, ['threshold_', 'transformer_']) - return 2 * (self.decision_function(pairs) > - self.threshold_) - 1 + return 2 * (self.decision_function(pairs) >= - self.threshold_) - 1 def decision_function(self, pairs): """Returns the decision function used to classify the pairs. @@ -396,6 +395,150 @@ def _set_default_threshold(self, pairs, y): pairs[(y == -1).ravel()])) self.threshold_ = np.mean([similar_threshold, dissimilar_threshold]) + def set_threshold(self, threshold): + """Sets the threshold of the metric learner to the given value `threshold + + Parameters + ---------- + threshold : float + The threshold value we want to set. It's a distance metric with + respect to which the predicted distance metric for test pairs will be + compared to. If they are superior to the threshold they will be + classified as similar (+1), and dissimilar (-1) if not. + + Returns + ------- + self : `_PairsClassifier` + The pairs classifier with the new threshold set. + """ + self.threshold_ = threshold + return self + + def calibrate_threshold(self, pairs_valid, y_valid, strategy='accuracy', + threshold=None, beta=None): + """Decision threshold calibration for binary classification + + Method that calibrates the decision threshold (cutoff point) of the metric + learner. This threshold will then be used when calling the method + `predict`. The methods for picking cutoff points make use of traditional + binary classification evaluation statistics such as the true positive and + true negative rates and F-scores. The threshold will be found to maximize + the chosen score on the validation set `(pairs_valid, y_valid)`. 
+ + Parameters + ---------- + strategy : str, optional (default='roc') + The strategy to use for choosing the cutoff point + + 'accuracy' + selects a decision threshold that maximizes the accuracy + 'f_beta' + selects a decision threshold that maximizes the f_beta score + 'max_tpr' + selects the point that yields the highest true positive rate with + true negative rate at least equal to the value of the parameter + threshold + 'max_tnr' + selects the point that yields the highest true negative rate with + true positive rate at least equal to the value of the parameter + threshold + + beta : float in [0, 1], optional (default=None) + beta value to be used in case strategy == 'f_beta' + + threshold : float in [0, 1] or None, (default=None) + In case strategy is 'max_tpr' or 'max_tnr' this parameter must be set + to specify the threshold for the true negative rate or true positive + rate respectively that needs to be achieved + + pairs_valid : array-like, shape=(n_pairs_valid, 2, n_features) + The validation set of pairs to use to set the threshold. + + y_valid : array-like, shape=(n_pairs_valid,) + The labels of the pairs of the validation set to use to set the + threshold. + + References + ---------- + .. [1] Receiver-operating characteristic (ROC) plots: a fundamental + evaluation tool in clinical medicine, MH Zweig, G Campbell - + Clinical chemistry, 1993 + + .. [2] most of the code of this function is from scikit-learn's PR #10117 + + See Also + -------- + sklearn.calibration : scikit-learn's module for calibrating classifiers + """ + + if strategy not in ('accuracy', 'f_beta', 'max_tpr', + 'max_tnr'): + raise ValueError('Strategy can either be "accuracy", "f_beta" or ' + '"max_tpr" or "max_tnr". Got "{}" instead.' + .format(strategy)) + + if strategy == 'max_tpr' or strategy == 'max_tnr': + if (threshold is None or not isinstance(threshold, (int, float)) or + not threshold >= 0 or not threshold <= 1): + raise ValueError('Parameter threshold must be a number in' + '[0, 1]. ' + 'Got {} instead.'.format(threshold)) + + if strategy == 'f_beta': + if beta is None or not isinstance(beta, (int, float)): + raise ValueError('Parameter beta must be a real number. ' + 'Got {} instead.'.format(type(beta))) + + pairs_valid, y_valid = self._prepare_inputs(pairs_valid, y_valid, + type_of_inputs='tuples') + + n_samples = pairs_valid.shape[0] + if strategy == 'accuracy': + scores = self.decision_function(pairs_valid) + scores_sorted_idces = np.argsort(scores)[::-1] + scores_sorted = scores[scores_sorted_idces] + # true labels ordered by decision_function value: (higher first) + y_ordered = y_valid[scores_sorted_idces] + # finds the threshold that maximizes the accuracy: + cum_tp = stable_cumsum(y_ordered == 1) # cumulative number of true + # positives + cum_tn_inverted = stable_cumsum(y_ordered[::-1] == -1) + cum_tn = np.concatenate([[0], cum_tn_inverted[:-1]])[::-1] + cum_accuracy = (cum_tp + cum_tn) / n_samples + max_i = np.argmax(cum_accuracy) + # note: we want a positive threshold (distance), so we take - threshold + self.threshold_ = - scores_sorted[max_i] + return self + + if strategy == 'f_beta': + precision, recall, thresholds = precision_recall_curve( + y_valid, self.decision_function(pairs_valid), pos_label=1) + with np.errstate(divide='ignore', invalid='ignore'): + f_beta = ((1 + beta**2) * (precision * recall) / + (beta**2 * precision + recall)) + f_beta[np.isnan(f_beta)] = 0. 
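+      # the division can give 0/0 = NaN (e.g. at thresholds where precision
+      # and recall are both zero); setting those entries to 0 makes sure the
+      # argmax below never selects them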
+ imax = np.argmax(f_beta) + self.threshold_ = - thresholds[imax] + return self + + fpr, tpr, thresholds = roc_curve(y_valid, + self.decision_function(pairs_valid), + pos_label=1) + fpr, tpr, thresholds = fpr, tpr, thresholds + + if strategy == 'max_tpr': + indices = np.where(1 - fpr >= threshold)[0] + max_tpr_index = np.argmax(tpr[indices]) + # note: we want a positive threshold (distance), so we take - threshold + self.threshold_ = - thresholds[indices[max_tpr_index]] + + if strategy == 'max_tnr': + indices = np.where(tpr >= threshold)[0] + max_tnr_index = np.argmax(1 - fpr[indices]) + # note: we want a positive threshold (distance), so we take - threshold + self.threshold_ = - thresholds[indices[max_tnr_index]] + return self + class _QuadrupletsClassifierMixin(BaseMetricLearner): diff --git a/test/test_pairs_classifiers.py b/test/test_pairs_classifiers.py index 3ff47c18..adeeb3cb 100644 --- a/test/test_pairs_classifiers.py +++ b/test/test_pairs_classifiers.py @@ -1,12 +1,17 @@ +from functools import partial + import pytest from metric_learn.base_metric import _PairsClassifierMixin, MahalanobisMixin from sklearn.exceptions import NotFittedError +from sklearn.metrics import (f1_score, accuracy_score, fbeta_score, + precision_score) from sklearn.model_selection import train_test_split from test.test_utils import pairs_learners, ids_pairs_learners from sklearn.utils.testing import set_random_state from sklearn import clone import numpy as np +from itertools import product @pytest.mark.parametrize('with_preprocessor', [True, False]) @@ -66,17 +71,54 @@ def test_raise_not_fitted_error_if_not_fitted(estimator, build_dataset, estimator.predict(input_data) +@pytest.mark.parametrize('kwargs', + [{'strategy': 'accuracy'}, + *[{'strategy': strategy, 'threshold': threshold} + for (strategy, threshold) in product( + ['max_tpr', 'max_tnr'], + [0., 0.2, 0.8, 1.])], + *[{'strategy': 'f_beta', 'beta': beta} + for beta in [0., 0.1, 0.2, 1., 5.]] + ]) +@pytest.mark.parametrize('with_preprocessor', [True, False]) +@pytest.mark.parametrize('estimator, build_dataset', pairs_learners, + ids=ids_pairs_learners) +def test_threshold_different_scores_is_finite(estimator, build_dataset, + with_preprocessor, kwargs): + # test that the score returned is finite for every metric learner + input_data, labels, preprocessor, _ = build_dataset(with_preprocessor) + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + set_random_state(estimator) + estimator.fit(input_data, labels) + with pytest.warns(None) as record: + estimator.calibrate_threshold(input_data, labels, **kwargs) + assert len(record) == 0 + + class IdentityPairsClassifier(MahalanobisMixin, _PairsClassifierMixin): """A simple pairs classifier for testing purposes, that will just have - identity as transformer_. + identity as transformer_, and a string threshold so that it returns an + error if not explicitely set. """ def fit(self, pairs, y): pairs, y = self._prepare_inputs(pairs, y, type_of_inputs='tuples') self.transformer_ = np.atleast_2d(np.identity(pairs.shape[2])) + self.threshold_ = 'I am not set.' 
return self +def test_set_threshold(): + # test that set_threshold indeed sets the threshold + identity_pairs_classifier = IdentityPairsClassifier() + pairs = np.array([[[0.], [1.]], [[1.], [3.]], [[2.], [5.]], [[3.], [7.]]]) + y = np.array([1, 1, -1, -1]) + identity_pairs_classifier.fit(pairs, y) + identity_pairs_classifier.set_threshold(0.5) + assert identity_pairs_classifier.threshold_ == 0.5 + + def test_set_default_threshold_toy_example(): # test that the default threshold has the right value on a toy example identity_pairs_classifier = IdentityPairsClassifier() @@ -85,3 +127,194 @@ def test_set_default_threshold_toy_example(): identity_pairs_classifier.fit(pairs, y) identity_pairs_classifier._set_default_threshold(pairs, y) assert identity_pairs_classifier.threshold_ == 2.5 + + +def test_f_beta_1_is_f_1(): + # test that putting beta to 1 indeed finds the best threshold to optimize + # the f1_score + rng = np.random.RandomState(42) + n_samples = 100 + pairs, y = rng.randn(n_samples, 2, 5), rng.choice([-1, 1], size=n_samples) + pairs_learner = IdentityPairsClassifier() + pairs_learner.fit(pairs, y) + pairs_learner.calibrate_threshold(pairs, y, strategy='f_beta', beta=1) + best_f1_score = f1_score(y, pairs_learner.predict(pairs)) + for threshold in - pairs_learner.decision_function(pairs): + pairs_learner.set_threshold(threshold) + assert f1_score(y, pairs_learner.predict(pairs)) <= best_f1_score + + +def true_pos_true_neg_rates(y_true, y_pred): + """A function that returns the true positive rates and the true negatives + rate. For testing purposes (optimized for readability not performance).""" + assert y_pred.shape[0] == y_true.shape[0] + tp = np.sum((y_pred == 1) * (y_true == 1)) + tn = np.sum((y_pred == -1) * (y_true == -1)) + fn = np.sum((y_pred == -1) * (y_true == 1)) + fp = np.sum((y_pred == 1) * (y_true == -1)) + tpr = tp / (tp + fn) + tnr = tn / (tn + fp) + tpr = tpr if not np.isnan(tpr) else 0. + tnr = tnr if not np.isnan(tnr) else 0. + return tpr, tnr + + +def tpr_threshold(y_true, y_pred, tnr_threshold=0.): + """A function that returns the true positive rate if the true negative + rate is higher or equal than `threshold`, and -1 otherwise. For testing + purposes""" + tpr, tnr = true_pos_true_neg_rates(y_true, y_pred) + if tnr < tnr_threshold: + return -1 + else: + return tpr + + +def tnr_threshold(y_true, y_pred, tpr_threshold=0.): + """A function that returns the true negative rate if the true positive + rate is higher or equal than `threshold`, and -1 otherwise. 
For testing + purposes""" + tpr, tnr = true_pos_true_neg_rates(y_true, y_pred) + if tpr < tpr_threshold: + return -1 + else: + return tnr + + +@pytest.mark.parametrize('kwargs, scoring', + [({'strategy': 'accuracy'}, accuracy_score), + *[({'strategy': 'f_beta', 'beta': b}, + partial(fbeta_score, beta=b)) + for b in [0.1, 0.5, 1.]], + ({'strategy': 'f_beta', 'beta': 0}, precision_score), + *[({'strategy': 'max_tpr', 'threshold': t}, + partial(tpr_threshold, tnr_threshold=t)) + for t in [0., 0.1, 0.5, 0.8, 1.]], + *[({'strategy': 'max_tnr', 'threshold': t}, + partial(tnr_threshold, tpr_threshold=t)) + for t in [0., 0.1, 0.5, 0.8, 1.]], + ]) +def test_found_score_is_best_score(kwargs, scoring): + # test that when we use calibrate threshold, it will indeed be the + # threshold that have the best score + rng = np.random.RandomState(42) + n_samples = 50 + pairs, y = rng.randn(n_samples, 2, 5), rng.choice([-1, 1], size=n_samples) + pairs_learner = IdentityPairsClassifier() + pairs_learner.fit(pairs, y) + pairs_learner.calibrate_threshold(pairs, y, **kwargs) + best_score = scoring(y, pairs_learner.predict(pairs)) + scores = [] + i = 0 + predicted_scores = pairs_learner.decision_function(pairs) + predicted_scores = np.hstack([[np.min(predicted_scores) - 1], + predicted_scores, + [np.max(predicted_scores) + 1]]) + for threshold in - predicted_scores: + pairs_learner.set_threshold(threshold) + score = scoring(y, pairs_learner.predict(pairs)) + i += 1 + assert score <= best_score + scores.append(score) + assert len(set(scores)) > 1 # assert that we didn't always have the same + # value for the score (which could be a hint for some bug, but would still + # silently pass the test)) + + +@pytest.mark.parametrize('kwargs, scoring', + [({'strategy': 'accuracy'}, accuracy_score), + *[({'strategy': 'f_beta', 'beta': b}, + partial(fbeta_score, beta=b)) + for b in [0.1, 0.5, 1.]], + ({'strategy': 'f_beta', 'beta': 0}, precision_score), + *[({'strategy': 'max_tpr', 'threshold': t}, + partial(tpr_threshold, tnr_threshold=t)) + for t in [0., 0.1, 0.5, 0.8, 1.]], + *[({'strategy': 'max_tnr', 'threshold': t}, + partial(tnr_threshold, tpr_threshold=t)) + for t in [0., 0.1, 0.5, 0.8, 1.]], + ]) +def test_found_score_is_best_score_duplicates(kwargs, scoring): + # test that when we use calibrate threshold, it will indeed be the + # threshold that have the best score. It's the same as the previous test + # except this time we test that the scores are coherent even if there are + # duplicates (i.e. points that have the same score returned by + # `decision_function`). 
+ rng = np.random.RandomState(42) + n_samples = 50 + pairs, y = rng.randn(n_samples, 2, 5), rng.choice([-1, 1], size=n_samples) + # we create some duplicates points, which will also have the same score + # predicted + pairs[6:10] = pairs[10:14] + y[6:10] = y[10:14] + pairs_learner = IdentityPairsClassifier() + pairs_learner.fit(pairs, y) + pairs_learner.calibrate_threshold(pairs, y, **kwargs) + best_score = scoring(y, pairs_learner.predict(pairs)) + scores = [] + i = 0 + predicted_scores = pairs_learner.decision_function(pairs) + predicted_scores = np.hstack([[np.min(predicted_scores) - 1], + predicted_scores, + [np.max(predicted_scores) + 1]]) + for threshold in - predicted_scores: + pairs_learner.set_threshold(threshold) + score = scoring(y, pairs_learner.predict(pairs)) + i += 1 + assert score <= best_score + scores.append(score) + assert len(set(scores)) > 1 # assert that we didn't always have the same + # value for the score (which could be a hint for some bug, but would still + # silently pass the test)) + + +@pytest.mark.parametrize('invalid_args, expected_msg', + [({'strategy': 'weird'}, + ('Strategy can either be "accuracy", "f_beta" or ' + '"max_tpr" or "max_tnr". Got "weird" instead.')), + *[({'strategy': strategy, 'threshold': threshold}, + 'Parameter threshold must be a number in' + '[0, 1]. Got {} instead.'.format(threshold)) + for (strategy, threshold) in product( + ['max_tpr', 'max_tnr'], + [None, 'weird', -0.2, 1.2, 3 + 2j])], + *[({'strategy': 'f_beta', 'beta': beta}, + 'Parameter beta must be a real number. ' + 'Got {} instead.'.format(type(beta))) + for beta in [None, 'weird', 3 + 2j]] + ]) +def test_calibrate_threshold_invalid_parameters_right_error(invalid_args, + expected_msg): + # test that the right error message is returned if invalid arguments are + # given to calibrate_threshold + rng = np.random.RandomState(42) + pairs, y = rng.randn(20, 2, 5), rng.choice([-1, 1], size=20) + pairs_learner = IdentityPairsClassifier() + pairs_learner.fit(pairs, y) + with pytest.raises(ValueError) as raised_error: + pairs_learner.calibrate_threshold(pairs, y, **invalid_args) + assert str(raised_error.value) == expected_msg + + +@pytest.mark.parametrize('valid_args', + [{'strategy': 'accuracy'}, + *[{'strategy': strategy, 'threshold': threshold} + for (strategy, threshold) in product( + ['max_tpr', 'max_tnr'], + [0., 0.2, 0.8, 1.])], + *[{'strategy': 'f_beta', 'beta': beta} + for beta in [-5., -1., 0., 0.1, 0.2, 1., 5.]] + # Note that we authorize beta < 0 (even if + # in fact it will be squared, so it would be useless + # to do that) + ]) +def test_calibrate_threshold_valid_parameters(valid_args): + # test that no warning message is returned if valid arguments are given to + # calibrate threshold + rng = np.random.RandomState(42) + pairs, y = rng.randn(20, 2, 5), rng.choice([-1, 1], size=20) + pairs_learner = IdentityPairsClassifier() + pairs_learner.fit(pairs, y) + with pytest.warns(None) as record: + pairs_learner.calibrate_threshold(pairs, y, **valid_args) + assert len(record) == 0 From a6458a228089dca7f8c7c6c2a435e63ae2984edb Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 5 Mar 2019 09:02:14 +0100 Subject: [PATCH 17/41] MAINT: remove starred syntax for compatibility with older versions of python --- test/test_pairs_classifiers.py | 113 +++++++++++++++++---------------- 1 file changed, 57 insertions(+), 56 deletions(-) diff --git a/test/test_pairs_classifiers.py b/test/test_pairs_classifiers.py index adeeb3cb..a03759fd 100644 --- a/test/test_pairs_classifiers.py 
+++ b/test/test_pairs_classifiers.py @@ -72,14 +72,13 @@ def test_raise_not_fitted_error_if_not_fitted(estimator, build_dataset, @pytest.mark.parametrize('kwargs', - [{'strategy': 'accuracy'}, - *[{'strategy': strategy, 'threshold': threshold} - for (strategy, threshold) in product( - ['max_tpr', 'max_tnr'], - [0., 0.2, 0.8, 1.])], - *[{'strategy': 'f_beta', 'beta': beta} - for beta in [0., 0.1, 0.2, 1., 5.]] - ]) + [{'strategy': 'accuracy'}] + + [{'strategy': strategy, 'threshold': threshold} + for (strategy, threshold) in product( + ['max_tpr', 'max_tnr'], [0., 0.2, 0.8, 1.])] + + [{'strategy': 'f_beta', 'beta': beta} + for beta in [0., 0.1, 0.2, 1., 5.]] + ) @pytest.mark.parametrize('with_preprocessor', [True, False]) @pytest.mark.parametrize('estimator, build_dataset', pairs_learners, ids=ids_pairs_learners) @@ -182,18 +181,19 @@ def tnr_threshold(y_true, y_pred, tpr_threshold=0.): @pytest.mark.parametrize('kwargs, scoring', - [({'strategy': 'accuracy'}, accuracy_score), - *[({'strategy': 'f_beta', 'beta': b}, - partial(fbeta_score, beta=b)) - for b in [0.1, 0.5, 1.]], - ({'strategy': 'f_beta', 'beta': 0}, precision_score), - *[({'strategy': 'max_tpr', 'threshold': t}, - partial(tpr_threshold, tnr_threshold=t)) - for t in [0., 0.1, 0.5, 0.8, 1.]], - *[({'strategy': 'max_tnr', 'threshold': t}, - partial(tnr_threshold, tpr_threshold=t)) - for t in [0., 0.1, 0.5, 0.8, 1.]], - ]) + [({'strategy': 'accuracy'}, accuracy_score)] + + [({'strategy': 'f_beta', 'beta': b}, + partial(fbeta_score, beta=b)) + for b in [0.1, 0.5, 1.]] + + [({'strategy': 'f_beta', 'beta': 0}, + precision_score)] + + [({'strategy': 'max_tpr', 'threshold': t}, + partial(tpr_threshold, tnr_threshold=t)) + for t in [0., 0.1, 0.5, 0.8, 1.]] + + [({'strategy': 'max_tnr', 'threshold': t}, + partial(tnr_threshold, tpr_threshold=t)) + for t in [0., 0.1, 0.5, 0.8, 1.]], + ) def test_found_score_is_best_score(kwargs, scoring): # test that when we use calibrate threshold, it will indeed be the # threshold that have the best score @@ -222,18 +222,19 @@ def test_found_score_is_best_score(kwargs, scoring): @pytest.mark.parametrize('kwargs, scoring', - [({'strategy': 'accuracy'}, accuracy_score), - *[({'strategy': 'f_beta', 'beta': b}, - partial(fbeta_score, beta=b)) - for b in [0.1, 0.5, 1.]], - ({'strategy': 'f_beta', 'beta': 0}, precision_score), - *[({'strategy': 'max_tpr', 'threshold': t}, - partial(tpr_threshold, tnr_threshold=t)) - for t in [0., 0.1, 0.5, 0.8, 1.]], - *[({'strategy': 'max_tnr', 'threshold': t}, - partial(tnr_threshold, tpr_threshold=t)) - for t in [0., 0.1, 0.5, 0.8, 1.]], - ]) + [({'strategy': 'accuracy'}, accuracy_score)] + + [({'strategy': 'f_beta', 'beta': b}, + partial(fbeta_score, beta=b)) + for b in [0.1, 0.5, 1.]] + + [({'strategy': 'f_beta', 'beta': 0}, + precision_score)] + + [({'strategy': 'max_tpr', 'threshold': t}, + partial(tpr_threshold, tnr_threshold=t)) + for t in [0., 0.1, 0.5, 0.8, 1.]] + + [({'strategy': 'max_tnr', 'threshold': t}, + partial(tnr_threshold, tpr_threshold=t)) + for t in [0., 0.1, 0.5, 0.8, 1.]] + ) def test_found_score_is_best_score_duplicates(kwargs, scoring): # test that when we use calibrate threshold, it will indeed be the # threshold that have the best score. It's the same as the previous test @@ -270,19 +271,19 @@ def test_found_score_is_best_score_duplicates(kwargs, scoring): @pytest.mark.parametrize('invalid_args, expected_msg', [({'strategy': 'weird'}, - ('Strategy can either be "accuracy", "f_beta" or ' - '"max_tpr" or "max_tnr". 
Got "weird" instead.')), - *[({'strategy': strategy, 'threshold': threshold}, - 'Parameter threshold must be a number in' - '[0, 1]. Got {} instead.'.format(threshold)) - for (strategy, threshold) in product( - ['max_tpr', 'max_tnr'], - [None, 'weird', -0.2, 1.2, 3 + 2j])], - *[({'strategy': 'f_beta', 'beta': beta}, - 'Parameter beta must be a real number. ' - 'Got {} instead.'.format(type(beta))) - for beta in [None, 'weird', 3 + 2j]] - ]) + ('Strategy can either be "accuracy", "f_beta" or ' + '"max_tpr" or "max_tnr". Got "weird" instead.'))] + + [({'strategy': strategy, 'threshold': threshold}, + 'Parameter threshold must be a number in' + '[0, 1]. Got {} instead.'.format(threshold)) + for (strategy, threshold) in product( + ['max_tpr', 'max_tnr'], + [None, 'weird', -0.2, 1.2, 3 + 2j])] + + [({'strategy': 'f_beta', 'beta': beta}, + 'Parameter beta must be a real number. ' + 'Got {} instead.'.format(type(beta))) + for beta in [None, 'weird', 3 + 2j]] + ) def test_calibrate_threshold_invalid_parameters_right_error(invalid_args, expected_msg): # test that the right error message is returned if invalid arguments are @@ -297,17 +298,17 @@ def test_calibrate_threshold_invalid_parameters_right_error(invalid_args, @pytest.mark.parametrize('valid_args', - [{'strategy': 'accuracy'}, - *[{'strategy': strategy, 'threshold': threshold} - for (strategy, threshold) in product( - ['max_tpr', 'max_tnr'], - [0., 0.2, 0.8, 1.])], - *[{'strategy': 'f_beta', 'beta': beta} - for beta in [-5., -1., 0., 0.1, 0.2, 1., 5.]] - # Note that we authorize beta < 0 (even if - # in fact it will be squared, so it would be useless - # to do that) - ]) + [{'strategy': 'accuracy'}] + + [{'strategy': strategy, 'threshold': threshold} + for (strategy, threshold) in product( + ['max_tpr', 'max_tnr'], + [0., 0.2, 0.8, 1.])] + + [{'strategy': 'f_beta', 'beta': beta} + for beta in [-5., -1., 0., 0.1, 0.2, 1., 5.]] + # Note that we authorize beta < 0 (even if + # in fact it will be squared, so it would be useless + # to do that) + ) def test_calibrate_threshold_valid_parameters(valid_args): # test that no warning message is returned if valid arguments are given to # calibrate threshold From fada5cc42c0c62c76481d0344d23b86d9182dbcd Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 5 Mar 2019 13:29:14 +0100 Subject: [PATCH 18/41] Remove debugging prints and make tests for ITML pass, while waiting for #175 to be solved --- test/test_pairs_classifiers.py | 4 ---- test/test_utils.py | 7 +++++-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/test/test_pairs_classifiers.py b/test/test_pairs_classifiers.py index a03759fd..240b81cb 100644 --- a/test/test_pairs_classifiers.py +++ b/test/test_pairs_classifiers.py @@ -205,7 +205,6 @@ def test_found_score_is_best_score(kwargs, scoring): pairs_learner.calibrate_threshold(pairs, y, **kwargs) best_score = scoring(y, pairs_learner.predict(pairs)) scores = [] - i = 0 predicted_scores = pairs_learner.decision_function(pairs) predicted_scores = np.hstack([[np.min(predicted_scores) - 1], predicted_scores, @@ -213,7 +212,6 @@ def test_found_score_is_best_score(kwargs, scoring): for threshold in - predicted_scores: pairs_learner.set_threshold(threshold) score = scoring(y, pairs_learner.predict(pairs)) - i += 1 assert score <= best_score scores.append(score) assert len(set(scores)) > 1 # assert that we didn't always have the same @@ -253,7 +251,6 @@ def test_found_score_is_best_score_duplicates(kwargs, scoring): pairs_learner.calibrate_threshold(pairs, y, **kwargs) best_score = 
scoring(y, pairs_learner.predict(pairs)) scores = [] - i = 0 predicted_scores = pairs_learner.decision_function(pairs) predicted_scores = np.hstack([[np.min(predicted_scores) - 1], predicted_scores, @@ -261,7 +258,6 @@ def test_found_score_is_best_score_duplicates(kwargs, scoring): for threshold in - predicted_scores: pairs_learner.set_threshold(threshold) score = scoring(y, pairs_learner.predict(pairs)) - i += 1 assert score <= best_score scores.append(score) assert len(set(scores)) > 1 # assert that we didn't always have the same diff --git a/test/test_utils.py b/test/test_utils.py index 38226fef..afd54288 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -102,8 +102,11 @@ def build_quadruplets(with_preprocessor=False): [learner for (learner, _) in quadruplets_learners])) -pairs_learners = [(ITML(), build_pairs), - (MMC(max_iter=2), build_pairs), # max_iter=2 for faster +pairs_learners = [(ITML(max_iter=2), build_pairs), # max_iter=2 to be + # faster, also make tests pass while waiting for #175 to + # be solved + # TODO: remove this comment when #175 is solved + (MMC(max_iter=2), build_pairs), # max_iter=2 to be faster (SDML(), build_pairs), ] ids_pairs_learners = list(map(lambda x: x.__class__.__name__, From 32a48897d7e4bfaaea9cfbdceb4778c7e5e3115b Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 5 Mar 2019 13:52:49 +0100 Subject: [PATCH 19/41] FIX: from __future__ import division to pass tests for python 2.7 --- test/test_pairs_classifiers.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/test_pairs_classifiers.py b/test/test_pairs_classifiers.py index 240b81cb..298e0c5f 100644 --- a/test/test_pairs_classifiers.py +++ b/test/test_pairs_classifiers.py @@ -1,3 +1,5 @@ +from __future__ import division + from functools import partial import pytest From 5cf71b909a9a3f85daa0922252896e9d38a76634 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Mon, 11 Mar 2019 15:02:33 +0100 Subject: [PATCH 20/41] Add some documentation for calibration --- doc/conf.py | 13 +++- doc/weakly_supervised.rst | 112 +++++++++++++++++++++++---------- metric_learn/base_metric.py | 15 ++++- metric_learn/itml.py | 2 + metric_learn/mmc.py | 2 +- metric_learn/sdml.py | 2 +- test/test_pairs_classifiers.py | 2 +- 7 files changed, 108 insertions(+), 40 deletions(-) diff --git a/doc/conf.py b/doc/conf.py index f0faa2f8..8f5fdcaa 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +import sys extensions = [ 'sphinx.ext.autodoc', @@ -8,7 +9,8 @@ 'sphinx.ext.mathjax', 'numpydoc', 'sphinx_gallery.gen_gallery', - 'sphinx.ext.doctest' + 'sphinx.ext.doctest', + 'sphinx.ext.intersphinx' ] templates_path = ['_templates'] @@ -39,3 +41,12 @@ # Option to hide doctests comments in the documentation (like # doctest: # +NORMALIZE_WHITESPACE for instance) trim_doctest_flags = True + +# intersphinx configuration +intersphinx_mapping = { + 'python': ('https://docs.python.org/{.major}'.format( + sys.version_info), None), + 'numpy': ('https://docs.scipy.org/doc/numpy/', None), + 'scipy': ('https://docs.scipy.org/doc/scipy/reference', None), + 'scikit-learn': ('https://scikit-learn.org/stable/', None) +} \ No newline at end of file diff --git a/doc/weakly_supervised.rst b/doc/weakly_supervised.rst index deae9b40..8111238b 100644 --- a/doc/weakly_supervised.rst +++ b/doc/weakly_supervised.rst @@ -148,6 +148,42 @@ tuples you're working with (pairs, triplets...). See the docstring of the `score` method of the estimator you use. 
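For instance, the default ``score`` of a pairs metric learner (the ROC AUC of
its decision function) can be used directly with scikit-learn's
cross-validation. A minimal sketch, with placeholder random pairs and an MMC
learner::

    import numpy as np
    from sklearn.model_selection import cross_val_score
    from metric_learn import MMC

    rng = np.random.RandomState(42)
    pairs = rng.randn(40, 2, 5)
    y = rng.choice([-1, 1], size=40)

    # with no explicit scoring argument, cross_val_score falls back on the
    # estimator's own score method
    scores = cross_val_score(MMC(max_iter=2), pairs, y, cv=3)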
+Learning on pairs +================= + +Some metric learning algorithms learn on pairs of samples. In this case, one +should provide the algorithm with ``n_samples`` pairs of points, with a +corresponding target containing ``n_samples`` values being either +1 or -1. +These values indicate whether the given pairs are similar points or +dissimilar points. + + +.. _calibration: + +Thresholding +------------ +In order to predict whether a new pair represents similar or dissimilar +samples, we need to set a distance threshold, so that points closer (in the +learned space) than this threshold are predicted as similar, and points further +away are predicted as dissimilar. Several methods are possible for this +thresholding. + +- **default**: Unless explicitely stated in the `fit` method documentation + of the estimator, the threshold is set with the method + `set_default_threshold` on the trainset. + +- **manual**: calling `set_threshold`, the user can + manually set the threshold to a particular value. + +- **calibrated**: calling `calibrate_threshold`, the user can + calibrate the threshold to achieve a particular score on a validation set, + the score being among the classical scores for classification (accuracy, f1 + score...). + + +See also: `sklearn.calibration`. + + Algorithms ================== @@ -192,39 +228,6 @@ programming. .. [2] Adapted from Matlab code at http://www.cs.utexas.edu/users/pjain/ itml/ - -LSML ----- - -`LSML`: Metric Learning from Relative Comparisons by Minimizing Squared -Residual - -.. topic:: Example Code: - -:: - - from metric_learn import LSML - - quadruplets = [[[1.2, 7.5], [1.3, 1.5], [6.4, 2.6], [6.2, 9.7]], - [[1.3, 4.5], [3.2, 4.6], [6.2, 5.5], [5.4, 5.4]], - [[3.2, 7.5], [3.3, 1.5], [8.4, 2.6], [8.2, 9.7]], - [[3.3, 4.5], [5.2, 4.6], [8.2, 5.5], [7.4, 5.4]]] - - # we want to make closer points where the first feature is close, and - # further if the second feature is close - - lsml = LSML() - lsml.fit(quadruplets) - -.. topic:: References: - - .. [1] Liu et al. - "Metric Learning from Relative Comparisons by Minimizing Squared - Residual". ICDM 2012. http://www.cs.ucla.edu/~weiwang/paper/ICDM12.pdf - - .. [2] Adapted from https://gist.github.com/kcarnold/5439917 - - SDML ---- @@ -343,3 +346,46 @@ method. However, it is one of the earliest and a still often cited technique. -with-side-information.pdf>`_ Xing, Jordan, Russell, Ng. .. [2] Adapted from Matlab code `here `_. + +Learning on quadruplets +======================= + +A type of information even weaker than pairs is information about relative +comparisons between pairs. The user should provide the algorithm with a +quadruplet of points, where the two first points are closer than the two +last points. No target vector (``y``) is needed, since the supervision is +already in the order that points are given in the quadruplet. + +Algorithms +================== + +LSML +---- + +`LSML`: Metric Learning from Relative Comparisons by Minimizing Squared +Residual + +.. topic:: Example Code: + +:: + + from metric_learn import LSML + + quadruplets = [[[1.2, 7.5], [1.3, 1.5], [6.4, 2.6], [6.2, 9.7]], + [[1.3, 4.5], [3.2, 4.6], [6.2, 5.5], [5.4, 5.4]], + [[3.2, 7.5], [3.3, 1.5], [8.4, 2.6], [8.2, 9.7]], + [[3.3, 4.5], [5.2, 4.6], [8.2, 5.5], [7.4, 5.4]]] + + # we want to make closer points where the first feature is close, and + # further if the second feature is close + + lsml = LSML() + lsml.fit(quadruplets) + +.. topic:: References: + + .. [1] Liu et al. 
+ "Metric Learning from Relative Comparisons by Minimizing Squared + Residual". ICDM 2012. http://www.cs.ucla.edu/~weiwang/paper/ICDM12.pdf + + .. [2] Adapted from https://gist.github.com/kcarnold/5439917 diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index 60c6859f..c16c95d1 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -386,9 +386,14 @@ def score(self, pairs, y): """ return roc_auc_score(y, self.decision_function(pairs)) - def _set_default_threshold(self, pairs, y): - """Returns a threshold that is the opposite of the mean between the similar - metrics mean and the dissimilar metrics mean""" + def set_default_threshold(self, pairs, y): + """Sets the default threshold on the given dataset. + + Returns a threshold that is the mean between the similar + metrics mean and the dissimilar metrics mean. + + See more in the :ref:`User Guide `. + """ similar_threshold = np.mean(self.score_pairs( pairs[(y == 1).ravel()])) dissimilar_threshold = np.mean(self.score_pairs( @@ -398,6 +403,8 @@ def _set_default_threshold(self, pairs, y): def set_threshold(self, threshold): """Sets the threshold of the metric learner to the given value `threshold + See more in the :ref:`User Guide `. + Parameters ---------- threshold : float @@ -425,6 +432,8 @@ def calibrate_threshold(self, pairs_valid, y_valid, strategy='accuracy', true negative rates and F-scores. The threshold will be found to maximize the chosen score on the validation set `(pairs_valid, y_valid)`. + See more in the :ref:`User Guide `. + Parameters ---------- strategy : str, optional (default='roc') diff --git a/metric_learn/itml.py b/metric_learn/itml.py index aa75463e..7eeec13e 100644 --- a/metric_learn/itml.py +++ b/metric_learn/itml.py @@ -163,6 +163,8 @@ class ITML(_BaseITML, _PairsClassifierMixin): def fit(self, pairs, y, bounds=None): """Learn the ITML model. + The default threshold will be set to the mean of the bounds. + Parameters ---------- pairs: array-like, shape=(n_constraints, 2, n_features) or diff --git a/metric_learn/mmc.py b/metric_learn/mmc.py index 3892a969..138b1d71 100644 --- a/metric_learn/mmc.py +++ b/metric_learn/mmc.py @@ -390,7 +390,7 @@ def fit(self, pairs, y): Returns the instance. """ self._fit(pairs, y) - self._set_default_threshold(pairs, y) + self.set_default_threshold(pairs, y) return self diff --git a/metric_learn/sdml.py b/metric_learn/sdml.py index 359e4fe1..536bd28a 100644 --- a/metric_learn/sdml.py +++ b/metric_learn/sdml.py @@ -112,7 +112,7 @@ def fit(self, pairs, y): Returns the instance. 
""" self._fit(pairs, y) - self._set_default_threshold(pairs, y) + self.set_default_threshold(pairs, y) return self diff --git a/test/test_pairs_classifiers.py b/test/test_pairs_classifiers.py index 298e0c5f..8ee20d3a 100644 --- a/test/test_pairs_classifiers.py +++ b/test/test_pairs_classifiers.py @@ -126,7 +126,7 @@ def test_set_default_threshold_toy_example(): pairs = np.array([[[0.], [1.]], [[1.], [3.]], [[2.], [5.]], [[3.], [7.]]]) y = np.array([1, 1, -1, -1]) identity_pairs_classifier.fit(pairs, y) - identity_pairs_classifier._set_default_threshold(pairs, y) + identity_pairs_classifier.set_default_threshold(pairs, y) assert identity_pairs_classifier.threshold_ == 2.5 From c2bc693b568e2c294362703940f1f4c54f4dfd11 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Mon, 11 Mar 2019 15:27:27 +0100 Subject: [PATCH 21/41] DOC: fix style --- doc/weakly_supervised.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/weakly_supervised.rst b/doc/weakly_supervised.rst index 8111238b..87a0ac6b 100644 --- a/doc/weakly_supervised.rst +++ b/doc/weakly_supervised.rst @@ -185,7 +185,7 @@ See also: `sklearn.calibration`. Algorithms -================== +========== ITML ---- @@ -357,7 +357,7 @@ last points. No target vector (``y``) is needed, since the supervision is already in the order that points are given in the quadruplet. Algorithms -================== +========== LSML ---- From 3ed3430b1cbae48a566a12cfa736ac38943e2388 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Thu, 21 Mar 2019 10:32:53 +0100 Subject: [PATCH 22/41] Address most comments from aurelien's reviews --- doc/weakly_supervised.rst | 11 +++-- metric_learn/base_metric.py | 77 ++++++++++++++++++---------------- metric_learn/itml.py | 2 +- test/test_pairs_classifiers.py | 26 ++++++------ 4 files changed, 59 insertions(+), 57 deletions(-) diff --git a/doc/weakly_supervised.rst b/doc/weakly_supervised.rst index 87a0ac6b..2868f9e8 100644 --- a/doc/weakly_supervised.rst +++ b/doc/weakly_supervised.rst @@ -172,13 +172,12 @@ thresholding. of the estimator, the threshold is set with the method `set_default_threshold` on the trainset. -- **manual**: calling `set_threshold`, the user can - manually set the threshold to a particular value. +- **manual**: calling `set_threshold` will set the threshold to a + particular value. -- **calibrated**: calling `calibrate_threshold`, the user can - calibrate the threshold to achieve a particular score on a validation set, - the score being among the classical scores for classification (accuracy, f1 - score...). +- **calibrated**: calling `calibrate_threshold` will calibrate the threshold to + achieve a particular score on a validation set, the score being among the + classical scores for classification (accuracy, f1 score...). See also: `sklearn.calibration`. diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index c16c95d1..be2f5589 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -304,9 +304,9 @@ class _PairsClassifierMixin(BaseMetricLearner): classified as dissimilar. classes_ : `list` - The possible labels of the pairs `MMC` can fit on. `classes_ = [-1, 1]`, - where -1 means points in a pair are dissimilar (negative label), and 1 - means they are similar (positive label). + The possible labels of the pairs the metric learner can fit on. + `classes_ = [-1, 1]`, where -1 means points in a pair are dissimilar + (negative label), and 1 means they are similar (positive label). 
""" classes_ = [-1, 1] @@ -338,9 +338,10 @@ def decision_function(self, pairs): """Returns the decision function used to classify the pairs. Returns the opposite of the learned metric value between samples in every - pair. Hence it should ideally be low for dissimilar samples and high for - similar samples. This is the decision function that is used to classify - pairs as similar (+1), or dissimilar (-1). + pair, to be consistent with scikit-learn conventions. Hence it should + ideally be low for dissimilar samples and high for similar samples. + This is the decision function that is used to classify pairs as similar + (+1), or dissimilar (-1). Parameters ---------- @@ -401,17 +402,17 @@ def set_default_threshold(self, pairs, y): self.threshold_ = np.mean([similar_threshold, dissimilar_threshold]) def set_threshold(self, threshold): - """Sets the threshold of the metric learner to the given value `threshold + """Sets the threshold of the metric learner to the given value `threshold`. See more in the :ref:`User Guide `. Parameters ---------- threshold : float - The threshold value we want to set. It's a distance metric with - respect to which the predicted distance metric for test pairs will be - compared to. If they are superior to the threshold they will be - classified as similar (+1), and dissimilar (-1) if not. + The threshold value we want to set. It is the value to which the + predicted distance for test pairs will be compared. If they are superior + to the threshold they will be classified as similar (+1), + and dissimilar (-1) if not. Returns ------- @@ -422,50 +423,51 @@ def set_threshold(self, threshold): return self def calibrate_threshold(self, pairs_valid, y_valid, strategy='accuracy', - threshold=None, beta=None): - """Decision threshold calibration for binary classification + min_rate=None, beta=1.): + """Decision threshold calibration for pairwise binary classification Method that calibrates the decision threshold (cutoff point) of the metric learner. This threshold will then be used when calling the method `predict`. The methods for picking cutoff points make use of traditional binary classification evaluation statistics such as the true positive and true negative rates and F-scores. The threshold will be found to maximize - the chosen score on the validation set `(pairs_valid, y_valid)`. + the chosen score on the validation set ``(pairs_valid, y_valid)``. See more in the :ref:`User Guide `. Parameters ---------- - strategy : str, optional (default='roc') - The strategy to use for choosing the cutoff point + strategy : str, optional (default='accuracy') + The strategy to use for choosing the cutoff threshold. 'accuracy' - selects a decision threshold that maximizes the accuracy + Selects a decision threshold that maximizes the accuracy. 'f_beta' - selects a decision threshold that maximizes the f_beta score + Selects a decision threshold that maximizes the f_beta score, + with beta given by the parameter `beta`. 'max_tpr' - selects the point that yields the highest true positive rate with - true negative rate at least equal to the value of the parameter - threshold + Selects a decision threshold that yields the highest true positive + rate with true negative rate at least equal to the value of the + parameter `min_rate`. 
'max_tnr' - selects the point that yields the highest true negative rate with - true positive rate at least equal to the value of the parameter - threshold + Selects a decision threshold that yields the highest true negative + rate with true positive rate at least equal to the value of the + parameter `min_rate`. beta : float in [0, 1], optional (default=None) - beta value to be used in case strategy == 'f_beta' + Beta value to be used in case strategy == 'f_beta'. - threshold : float in [0, 1] or None, (default=None) + min_rate : float in [0, 1] or None, (default=None) In case strategy is 'max_tpr' or 'max_tnr' this parameter must be set - to specify the threshold for the true negative rate or true positive - rate respectively that needs to be achieved + to specify the minimal value for the true negative rate or true positive + rate respectively that needs to be achieved. pairs_valid : array-like, shape=(n_pairs_valid, 2, n_features) The validation set of pairs to use to set the threshold. y_valid : array-like, shape=(n_pairs_valid,) The labels of the pairs of the validation set to use to set the - threshold. + threshold. They must be +1 for positive pairs and -1 for negative pairs. References ---------- @@ -487,11 +489,11 @@ def calibrate_threshold(self, pairs_valid, y_valid, strategy='accuracy', .format(strategy)) if strategy == 'max_tpr' or strategy == 'max_tnr': - if (threshold is None or not isinstance(threshold, (int, float)) or - not threshold >= 0 or not threshold <= 1): - raise ValueError('Parameter threshold must be a number in' + if (min_rate is None or not isinstance(min_rate, (int, float)) or + not min_rate >= 0 or not min_rate <= 1): + raise ValueError('Parameter min_rate must be a number in' '[0, 1]. ' - 'Got {} instead.'.format(threshold)) + 'Got {} instead.'.format(min_rate)) if strategy == 'f_beta': if beta is None or not isinstance(beta, (int, float)): @@ -514,9 +516,9 @@ def calibrate_threshold(self, pairs_valid, y_valid, strategy='accuracy', cum_tn_inverted = stable_cumsum(y_ordered[::-1] == -1) cum_tn = np.concatenate([[0], cum_tn_inverted[:-1]])[::-1] cum_accuracy = (cum_tp + cum_tn) / n_samples - max_i = np.argmax(cum_accuracy) + imax = np.argmax(cum_accuracy) # note: we want a positive threshold (distance), so we take - threshold - self.threshold_ = - scores_sorted[max_i] + self.threshold_ = - scores_sorted[imax] return self if strategy == 'f_beta': @@ -527,6 +529,7 @@ def calibrate_threshold(self, pairs_valid, y_valid, strategy='accuracy', (beta**2 * precision + recall)) f_beta[np.isnan(f_beta)] = 0. 
imax = np.argmax(f_beta) + # note: we want a positive threshold (distance), so we take - threshold self.threshold_ = - thresholds[imax] return self @@ -536,13 +539,13 @@ def calibrate_threshold(self, pairs_valid, y_valid, strategy='accuracy', fpr, tpr, thresholds = fpr, tpr, thresholds if strategy == 'max_tpr': - indices = np.where(1 - fpr >= threshold)[0] + indices = np.where(1 - fpr >= min_rate)[0] max_tpr_index = np.argmax(tpr[indices]) # note: we want a positive threshold (distance), so we take - threshold self.threshold_ = - thresholds[indices[max_tpr_index]] if strategy == 'max_tnr': - indices = np.where(tpr >= threshold)[0] + indices = np.where(tpr >= min_rate)[0] max_tnr_index = np.argmax(1 - fpr[indices]) # note: we want a positive threshold (distance), so we take - threshold self.threshold_ = - thresholds[indices[max_tnr_index]] diff --git a/metric_learn/itml.py b/metric_learn/itml.py index 7eeec13e..e31368b0 100644 --- a/metric_learn/itml.py +++ b/metric_learn/itml.py @@ -155,7 +155,7 @@ class ITML(_BaseITML, _PairsClassifierMixin): classified as dissimilar. classes_ : `list` - The possible labels of the pairs `LSML` can fit on. `classes_ = [-1, 1]`, + The possible labels of the pairs `ITML` can fit on. `classes_ = [-1, 1]`, where -1 means points in a pair are dissimilar (negative label), and 1 means they are similar (positive label). """ diff --git a/test/test_pairs_classifiers.py b/test/test_pairs_classifiers.py index 8ee20d3a..d467e965 100644 --- a/test/test_pairs_classifiers.py +++ b/test/test_pairs_classifiers.py @@ -75,8 +75,8 @@ def test_raise_not_fitted_error_if_not_fitted(estimator, build_dataset, @pytest.mark.parametrize('kwargs', [{'strategy': 'accuracy'}] + - [{'strategy': strategy, 'threshold': threshold} - for (strategy, threshold) in product( + [{'strategy': strategy, 'min_rate': min_rate} + for (strategy, min_rate) in product( ['max_tpr', 'max_tnr'], [0., 0.2, 0.8, 1.])] + [{'strategy': 'f_beta', 'beta': beta} for beta in [0., 0.1, 0.2, 1., 5.]] @@ -84,8 +84,8 @@ def test_raise_not_fitted_error_if_not_fitted(estimator, build_dataset, @pytest.mark.parametrize('with_preprocessor', [True, False]) @pytest.mark.parametrize('estimator, build_dataset', pairs_learners, ids=ids_pairs_learners) -def test_threshold_different_scores_is_finite(estimator, build_dataset, - with_preprocessor, kwargs): +def test_min_rate_different_scores_is_finite(estimator, build_dataset, + with_preprocessor, kwargs): # test that the score returned is finite for every metric learner input_data, labels, preprocessor, _ = build_dataset(with_preprocessor) estimator = clone(estimator) @@ -189,10 +189,10 @@ def tnr_threshold(y_true, y_pred, tpr_threshold=0.): for b in [0.1, 0.5, 1.]] + [({'strategy': 'f_beta', 'beta': 0}, precision_score)] + - [({'strategy': 'max_tpr', 'threshold': t}, + [({'strategy': 'max_tpr', 'min_rate': t}, partial(tpr_threshold, tnr_threshold=t)) for t in [0., 0.1, 0.5, 0.8, 1.]] + - [({'strategy': 'max_tnr', 'threshold': t}, + [({'strategy': 'max_tnr', 'min_rate': t}, partial(tnr_threshold, tpr_threshold=t)) for t in [0., 0.1, 0.5, 0.8, 1.]], ) @@ -228,7 +228,7 @@ def test_found_score_is_best_score(kwargs, scoring): for b in [0.1, 0.5, 1.]] + [({'strategy': 'f_beta', 'beta': 0}, precision_score)] + - [({'strategy': 'max_tpr', 'threshold': t}, + [({'strategy': 'max_tpr', 'min_rate': t}, partial(tpr_threshold, tnr_threshold=t)) for t in [0., 0.1, 0.5, 0.8, 1.]] + [({'strategy': 'max_tnr', 'threshold': t}, @@ -271,10 +271,10 @@ def 
test_found_score_is_best_score_duplicates(kwargs, scoring): [({'strategy': 'weird'}, ('Strategy can either be "accuracy", "f_beta" or ' '"max_tpr" or "max_tnr". Got "weird" instead.'))] + - [({'strategy': strategy, 'threshold': threshold}, - 'Parameter threshold must be a number in' - '[0, 1]. Got {} instead.'.format(threshold)) - for (strategy, threshold) in product( + [({'strategy': strategy, 'min_rate': min_rate}, + 'Parameter min_rate must be a number in' + '[0, 1]. Got {} instead.'.format(min_rate)) + for (strategy, min_rate) in product( ['max_tpr', 'max_tnr'], [None, 'weird', -0.2, 1.2, 3 + 2j])] + [({'strategy': 'f_beta', 'beta': beta}, @@ -297,8 +297,8 @@ def test_calibrate_threshold_invalid_parameters_right_error(invalid_args, @pytest.mark.parametrize('valid_args', [{'strategy': 'accuracy'}] + - [{'strategy': strategy, 'threshold': threshold} - for (strategy, threshold) in product( + [{'strategy': strategy, 'min_rate': min_rate} + for (strategy, min_rate) in product( ['max_tpr', 'max_tnr'], [0., 0.2, 0.8, 1.])] + [{'strategy': 'f_beta', 'beta': beta} From 69c694528c0b8d7cda71dddbcbe7cbe42996bf13 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Thu, 21 Mar 2019 11:21:19 +0100 Subject: [PATCH 23/41] Remove classes_ attribute and test for CalibratedClassifierCV --- metric_learn/base_metric.py | 6 ------ metric_learn/itml.py | 5 ----- metric_learn/mmc.py | 5 ----- metric_learn/sdml.py | 5 ----- test/test_sklearn_compat.py | 26 -------------------------- 5 files changed, 47 deletions(-) diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index be2f5589..77672bba 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -302,14 +302,8 @@ class _PairsClassifierMixin(BaseMetricLearner): If the distance metric between two points is lower than this threshold, points will be classified as similar, otherwise they will be classified as dissimilar. - - classes_ : `list` - The possible labels of the pairs the metric learner can fit on. - `classes_ = [-1, 1]`, where -1 means points in a pair are dissimilar - (negative label), and 1 means they are similar (positive label). """ - classes_ = [-1, 1] _tuple_size = 2 # number of points in a tuple, 2 for pairs def predict(self, pairs): diff --git a/metric_learn/itml.py b/metric_learn/itml.py index e31368b0..57df1471 100644 --- a/metric_learn/itml.py +++ b/metric_learn/itml.py @@ -153,11 +153,6 @@ class ITML(_BaseITML, _PairsClassifierMixin): If the distance metric between two points is lower than this threshold, points will be classified as similar, otherwise they will be classified as dissimilar. - - classes_ : `list` - The possible labels of the pairs `ITML` can fit on. `classes_ = [-1, 1]`, - where -1 means points in a pair are dissimilar (negative label), and 1 - means they are similar (positive label). """ def fit(self, pairs, y, bounds=None): diff --git a/metric_learn/mmc.py b/metric_learn/mmc.py index 138b1d71..2ddcced2 100644 --- a/metric_learn/mmc.py +++ b/metric_learn/mmc.py @@ -364,11 +364,6 @@ class MMC(_BaseMMC, _PairsClassifierMixin): If the distance metric between two points is lower than this threshold, points will be classified as similar, otherwise they will be classified as dissimilar. - - classes_ : `list` - The possible labels of the pairs `MMC` can fit on. `classes_ = [-1, 1]`, - where -1 means points in a pair are dissimilar (negative label), and 1 - means they are similar (positive label). 
""" def fit(self, pairs, y): diff --git a/metric_learn/sdml.py b/metric_learn/sdml.py index 536bd28a..096dc0ed 100644 --- a/metric_learn/sdml.py +++ b/metric_learn/sdml.py @@ -86,11 +86,6 @@ class SDML(_BaseSDML, _PairsClassifierMixin): If the distance metric between two points is lower than this threshold, points will be classified as similar, otherwise they will be classified as dissimilar. - - classes_ : `list` - The possible labels of the pairs `SDML` can fit on. `classes_ = [-1, 1]`, - where -1 means points in a pair are dissimilar (negative label), and 1 - means they are similar (positive label). """ def fit(self, pairs, y): diff --git a/test/test_sklearn_compat.py b/test/test_sklearn_compat.py index e14e2cf9..5fc11aeb 100644 --- a/test/test_sklearn_compat.py +++ b/test/test_sklearn_compat.py @@ -1,6 +1,5 @@ import pytest import unittest -from sklearn.calibration import CalibratedClassifierCV from sklearn.utils.estimator_checks import check_estimator from sklearn.base import TransformerMixin from sklearn.pipeline import make_pipeline @@ -92,31 +91,6 @@ def test_mmc(self): # ---------------------- Test scikit-learn compatibility ---------------------- -@pytest.mark.parametrize('with_preprocessor', - [True, - # TODO: uncomment the below line as soon as - # https://github.com/scikit-learn/scikit-learn/ - # issues/13077 is solved: - # False, - ]) -@pytest.mark.parametrize('estimator, build_dataset', pairs_learners, - ids=ids_pairs_learners) -def test_calibrated_classifier_CV(estimator, build_dataset, - with_preprocessor): - """Tests that metric-learn tuples estimators' work with scikit-learn's - CalibratedClassifierCV. - """ - input_data, labels, preprocessor, _ = build_dataset(with_preprocessor) - estimator = clone(estimator) - estimator.set_params(preprocessor=preprocessor) - set_random_state(estimator) - calibrated_clf = CalibratedClassifierCV(estimator) - - # test fit and predict_proba - calibrated_clf.fit(input_data, labels) - calibrated_clf.predict_proba(input_data) - - @pytest.mark.parametrize('with_preprocessor', [True, False]) @pytest.mark.parametrize('estimator, build_dataset', pairs_learners, ids=ids_pairs_learners) From bc393927937564ca75f7d781dc58194e2a000820 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Thu, 21 Mar 2019 13:38:03 +0100 Subject: [PATCH 24/41] Rename make_args_inc_quadruplets into remove_y_quadruplets --- test/test_mahalanobis_mixin.py | 34 +++++++++--------- test/test_sklearn_compat.py | 65 ++++++++++++++++++---------------- test/test_utils.py | 22 ++++++------ 3 files changed, 62 insertions(+), 59 deletions(-) diff --git a/test/test_mahalanobis_mixin.py b/test/test_mahalanobis_mixin.py index a85d9e8f..c25c52b1 100644 --- a/test/test_mahalanobis_mixin.py +++ b/test/test_mahalanobis_mixin.py @@ -12,7 +12,7 @@ from metric_learn._util import make_context from test.test_utils import (ids_metric_learners, metric_learners, - make_args_inc_quadruplets) + remove_y_quadruplets) RNG = check_random_state(0) @@ -26,7 +26,7 @@ def test_score_pairs_pairwise(estimator, build_dataset): X = X[:n_samples] model = clone(estimator) set_random_state(model) - model.fit(*make_args_inc_quadruplets(estimator, input_data, labels)) + model.fit(*remove_y_quadruplets(estimator, input_data, labels)) pairwise = model.score_pairs(np.array(list(product(X, X))))\ .reshape(n_samples, n_samples) @@ -50,7 +50,7 @@ def test_score_pairs_toy_example(estimator, build_dataset): X = X[:n_samples] model = clone(estimator) set_random_state(model) - 
model.fit(*make_args_inc_quadruplets(estimator, input_data, labels)) + model.fit(*remove_y_quadruplets(estimator, input_data, labels)) pairs = np.stack([X[:10], X[10:20]], axis=1) embedded_pairs = pairs.dot(model.transformer_.T) distances = np.sqrt(np.sum((embedded_pairs[:, 1] - @@ -66,7 +66,7 @@ def test_score_pairs_finite(estimator, build_dataset): input_data, labels, _, X = build_dataset() model = clone(estimator) set_random_state(model) - model.fit(*make_args_inc_quadruplets(estimator, input_data, labels)) + model.fit(*remove_y_quadruplets(estimator, input_data, labels)) pairs = np.array(list(product(X, X))) assert np.isfinite(model.score_pairs(pairs)).all() @@ -80,7 +80,7 @@ def test_score_pairs_dim(estimator, build_dataset): input_data, labels, _, X = build_dataset() model = clone(estimator) set_random_state(model) - model.fit(*make_args_inc_quadruplets(estimator, input_data, labels)) + model.fit(*remove_y_quadruplets(estimator, input_data, labels)) tuples = np.array(list(product(X, X))) assert model.score_pairs(tuples).shape == (tuples.shape[0],) context = make_context(estimator) @@ -111,7 +111,7 @@ def test_embed_toy_example(estimator, build_dataset): X = X[:n_samples] model = clone(estimator) set_random_state(model) - model.fit(*make_args_inc_quadruplets(estimator, input_data, labels)) + model.fit(*remove_y_quadruplets(estimator, input_data, labels)) embedded_points = X.dot(model.transformer_.T) assert_array_almost_equal(model.transform(X), embedded_points) @@ -123,7 +123,7 @@ def test_embed_dim(estimator, build_dataset): input_data, labels, _, X = build_dataset() model = clone(estimator) set_random_state(model) - model.fit(*make_args_inc_quadruplets(estimator, input_data, labels)) + model.fit(*remove_y_quadruplets(estimator, input_data, labels)) assert model.transform(X).shape == X.shape # assert that ValueError is thrown if input shape is 1D @@ -140,7 +140,7 @@ def test_embed_dim(estimator, build_dataset): # avoid this enumeration and rather test if hasattr n_components # as soon as we have made the arguments names as such (issue #167) model.set_params(num_dims=2) - model.fit(*make_args_inc_quadruplets(estimator, input_data, labels)) + model.fit(*remove_y_quadruplets(estimator, input_data, labels)) assert model.transform(X).shape == (X.shape[0], 2) # assert that ValueError is thrown if input shape is 1D with pytest.raises(ValueError) as raised_error: @@ -155,7 +155,7 @@ def test_embed_finite(estimator, build_dataset): input_data, labels, _, X = build_dataset() model = clone(estimator) set_random_state(model) - model.fit(*make_args_inc_quadruplets(estimator, input_data, labels)) + model.fit(*remove_y_quadruplets(estimator, input_data, labels)) assert np.isfinite(model.transform(X)).all() @@ -166,7 +166,7 @@ def test_embed_is_linear(estimator, build_dataset): input_data, labels, _, X = build_dataset() model = clone(estimator) set_random_state(model) - model.fit(*make_args_inc_quadruplets(estimator, input_data, labels)) + model.fit(*remove_y_quadruplets(estimator, input_data, labels)) assert_array_almost_equal(model.transform(X[:10] + X[10:20]), model.transform(X[:10]) + model.transform(X[10:20])) @@ -185,7 +185,7 @@ def test_get_metric_equivalent_to_explicit_mahalanobis(estimator, input_data, labels, _, X = build_dataset() model = clone(estimator) set_random_state(model) - model.fit(*make_args_inc_quadruplets(estimator, input_data, labels)) + model.fit(*remove_y_quadruplets(estimator, input_data, labels)) metric = model.get_metric() n_features = X.shape[1] a, b = 
(rng.randn(n_features), rng.randn(n_features)) @@ -204,7 +204,7 @@ def test_get_metric_is_pseudo_metric(estimator, build_dataset): input_data, labels, _, X = build_dataset() model = clone(estimator) set_random_state(model) - model.fit(*make_args_inc_quadruplets(estimator, input_data, labels)) + model.fit(*remove_y_quadruplets(estimator, input_data, labels)) metric = model.get_metric() n_features = X.shape[1] @@ -230,7 +230,7 @@ def test_metric_raises_deprecation_warning(estimator, build_dataset): input_data, labels, _, X = build_dataset() model = clone(estimator) set_random_state(model) - model.fit(*make_args_inc_quadruplets(estimator, input_data, labels)) + model.fit(*remove_y_quadruplets(estimator, input_data, labels)) with pytest.warns(DeprecationWarning) as raised_warning: model.metric() @@ -247,7 +247,7 @@ def test_get_metric_compatible_with_scikit_learn(estimator, build_dataset): input_data, labels, _, X = build_dataset() model = clone(estimator) set_random_state(model) - model.fit(*make_args_inc_quadruplets(estimator, input_data, labels)) + model.fit(*remove_y_quadruplets(estimator, input_data, labels)) clustering = DBSCAN(metric=model.get_metric()) clustering.fit(X) @@ -260,7 +260,7 @@ def test_get_squared_metric(estimator, build_dataset): input_data, labels, _, X = build_dataset() model = clone(estimator) set_random_state(model) - model.fit(*make_args_inc_quadruplets(estimator, input_data, labels)) + model.fit(*remove_y_quadruplets(estimator, input_data, labels)) metric = model.get_metric() n_features = X.shape[1] @@ -280,10 +280,10 @@ def test_transformer_is_2D(estimator, build_dataset): model = clone(estimator) set_random_state(model) # test that it works for X.shape[1] features - model.fit(*make_args_inc_quadruplets(estimator, input_data, labels)) + model.fit(*remove_y_quadruplets(estimator, input_data, labels)) assert model.transformer_.shape == (X.shape[1], X.shape[1]) # test that it works for 1 feature trunc_data = input_data[..., :1] - model.fit(*make_args_inc_quadruplets(estimator, trunc_data, labels)) + model.fit(*remove_y_quadruplets(estimator, trunc_data, labels)) assert model.transformer_.shape == (1, 1) # the transformer must be 2D diff --git a/test/test_sklearn_compat.py b/test/test_sklearn_compat.py index 5fc11aeb..7a51ee68 100644 --- a/test/test_sklearn_compat.py +++ b/test/test_sklearn_compat.py @@ -20,7 +20,7 @@ from test.test_utils import (metric_learners, ids_metric_learners, mock_preprocessor, tuples_learners, ids_tuples_learners, pairs_learners, - ids_pairs_learners, make_args_inc_quadruplets, + ids_pairs_learners, remove_y_quadruplets, quadruplets_learners) @@ -140,13 +140,13 @@ def test_cross_validation_is_finite(estimator, build_dataset): estimator.set_params(preprocessor=preprocessor) set_random_state(estimator) assert np.isfinite(cross_val_score(estimator, - *make_args_inc_quadruplets(estimator, - input_data, - labels))).all() + *remove_y_quadruplets(estimator, + input_data, + labels))).all() assert np.isfinite(cross_val_predict(estimator, - *make_args_inc_quadruplets(estimator, - input_data, - labels) + *remove_y_quadruplets(estimator, + input_data, + labels) )).all() @@ -178,25 +178,28 @@ def test_cross_validation_manual_vs_scikit(estimator, build_dataset, train_mask = np.ones(input_data.shape[0], bool) train_mask[test_slice] = False y_train, y_test = labels[train_mask], labels[test_slice] - estimator.fit(*make_args_inc_quadruplets(estimator, - input_data[train_mask], - y_train)) + estimator.fit(*remove_y_quadruplets(estimator, + 
input_data[train_mask], + y_train)) if hasattr(estimator, "score"): - scores.append(estimator.score(*make_args_inc_quadruplets(estimator, - input_data[test_slice], y_test))) + scores.append(estimator.score(*remove_y_quadruplets( + estimator, input_data[test_slice], y_test))) if hasattr(estimator, "predict"): predictions[test_slice] = estimator.predict(input_data[test_slice]) if hasattr(estimator, "score"): - assert all(scores == cross_val_score(estimator, - *make_args_inc_quadruplets(estimator, input_data, labels), cv=kfold)) + assert all(scores == cross_val_score( + estimator, *remove_y_quadruplets(estimator, input_data, labels), + cv=kfold)) if hasattr(estimator, "predict"): - assert all(predictions == cross_val_predict(estimator, - *make_args_inc_quadruplets(estimator, input_data, labels), cv=kfold)) + assert all(predictions == cross_val_predict( + estimator, + *remove_y_quadruplets(estimator, input_data, labels), + cv=kfold)) def check_score(estimator, tuples, y): if hasattr(estimator, "score"): - score = estimator.score(*make_args_inc_quadruplets(estimator, tuples, y)) + score = estimator.score(*remove_y_quadruplets(estimator, tuples, y)) assert np.isfinite(score) @@ -220,7 +223,7 @@ def test_simple_estimator(estimator, build_dataset, with_preprocessor): estimator.set_params(preprocessor=preprocessor) set_random_state(estimator) - estimator.fit(*make_args_inc_quadruplets(estimator, tuples_train, y_train)) + estimator.fit(*remove_y_quadruplets(estimator, tuples_train, y_train)) check_score(estimator, tuples_test, y_test) check_predict(estimator, tuples_test) @@ -267,9 +270,9 @@ def test_estimators_fit_returns_self(estimator, build_dataset, input_data, labels, preprocessor, _ = build_dataset(with_preprocessor) estimator = clone(estimator) estimator.set_params(preprocessor=preprocessor) - assert estimator.fit(*make_args_inc_quadruplets(estimator, - input_data, - labels)) is estimator + assert estimator.fit(*remove_y_quadruplets(estimator, + input_data, + labels)) is estimator @pytest.mark.parametrize('with_preprocessor', [True, False]) @@ -299,18 +302,18 @@ def make_random_state(estimator, in_pipeline): estimator = clone(estimator) estimator.set_params(preprocessor=preprocessor) pipeline = make_pipeline(estimator) - estimator.fit(*make_args_inc_quadruplets(estimator, input_data, y), + estimator.fit(*remove_y_quadruplets(estimator, input_data, y), **make_random_state(estimator, False)) - pipeline.fit(*make_args_inc_quadruplets(estimator, input_data, y), + pipeline.fit(*remove_y_quadruplets(estimator, input_data, y), **make_random_state(estimator, True)) if hasattr(estimator, 'score'): - result = estimator.score(*make_args_inc_quadruplets(estimator, - input_data, - y)) - result_pipe = pipeline.score(*make_args_inc_quadruplets(estimator, - input_data, - y)) + result = estimator.score(*remove_y_quadruplets(estimator, + input_data, + y)) + result_pipe = pipeline.score(*remove_y_quadruplets(estimator, + input_data, + y)) assert_allclose_dense_sparse(result, result_pipe) if hasattr(estimator, 'predict'): @@ -336,7 +339,7 @@ def test_dict_unchanged(estimator, build_dataset, with_preprocessor): estimator.set_params(preprocessor=preprocessor) if hasattr(estimator, "num_dims"): estimator.num_dims = 1 - estimator.fit(*make_args_inc_quadruplets(estimator, input_data, labels)) + estimator.fit(*remove_y_quadruplets(estimator, input_data, labels)) def check_dict(): assert estimator.__dict__ == dict_before, ( @@ -367,7 +370,7 @@ def test_dont_overwrite_parameters(estimator, build_dataset, 
estimator.num_dims = 1 dict_before_fit = estimator.__dict__.copy() - estimator.fit(*make_args_inc_quadruplets(estimator, input_data, labels)) + estimator.fit(*remove_y_quadruplets(estimator, input_data, labels)) dict_after_fit = estimator.__dict__ public_keys_after_fit = [key for key in dict_after_fit.keys() diff --git a/test/test_utils.py b/test/test_utils.py index bc5ea59b..821091a4 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -143,7 +143,7 @@ def build_quadruplets(with_preprocessor=False): ids_metric_learners = ids_tuples_learners + ids_supervised_learners -def make_args_inc_quadruplets(estimator, X, y): +def remove_y_quadruplets(estimator, X, y): """Quadruplets learners have no y in fit, but to write test for all estimators, it is convenient to have this function, that will return X and y if the estimator needs a y to fit on, and just X otherwise.""" @@ -856,7 +856,7 @@ def test_error_message_tuple_size(estimator): [[1.9, 5.3], [1., 7.8], [3.2, 1.2]]]) y = [1, 1] with pytest.raises(ValueError) as raised_err: - estimator.fit(*make_args_inc_quadruplets(estimator, invalid_pairs, y)) + estimator.fit(*remove_y_quadruplets(estimator, invalid_pairs, y)) expected_msg = ("Tuples of {} element(s) expected{}. Got tuples of 3 " "element(s) instead (shape=(2, 3, 2)):\ninput={}.\n" .format(estimator._tuple_size, make_context(estimator), @@ -941,25 +941,25 @@ def make_random_state(estimator): estimator_with_preprocessor = clone(estimator) set_random_state(estimator_with_preprocessor) estimator_with_preprocessor.set_params(preprocessor=X) - estimator_with_preprocessor.fit(*make_args_inc_quadruplets(estimator, - indices_train, - y_train), + estimator_with_preprocessor.fit(*remove_y_quadruplets(estimator, + indices_train, + y_train), **make_random_state(estimator)) estimator_without_preprocessor = clone(estimator) set_random_state(estimator_without_preprocessor) estimator_without_preprocessor.set_params(preprocessor=None) - estimator_without_preprocessor.fit(*make_args_inc_quadruplets(estimator, - formed_train, - y_train), + estimator_without_preprocessor.fit(*remove_y_quadruplets(estimator, + formed_train, + y_train), **make_random_state(estimator)) estimator_with_prep_formed = clone(estimator) set_random_state(estimator_with_prep_formed) estimator_with_prep_formed.set_params(preprocessor=X) - estimator_with_prep_formed.fit(*make_args_inc_quadruplets(estimator, - indices_train, - y_train), + estimator_with_prep_formed.fit(*remove_y_quadruplets(estimator, + indices_train, + y_train), **make_random_state(estimator)) # test prediction methods From facc546c2f2485e5efa2f98c86b62b5cdc98dea7 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Thu, 21 Mar 2019 13:40:22 +0100 Subject: [PATCH 25/41] TST: Fix remaining threshold into min_rate --- test/test_pairs_classifiers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_pairs_classifiers.py b/test/test_pairs_classifiers.py index d467e965..f85e6125 100644 --- a/test/test_pairs_classifiers.py +++ b/test/test_pairs_classifiers.py @@ -231,7 +231,7 @@ def test_found_score_is_best_score(kwargs, scoring): [({'strategy': 'max_tpr', 'min_rate': t}, partial(tpr_threshold, tnr_threshold=t)) for t in [0., 0.1, 0.5, 0.8, 1.]] + - [({'strategy': 'max_tnr', 'threshold': t}, + [({'strategy': 'max_tnr', 'min_rate': t}, partial(tnr_threshold, tpr_threshold=t)) for t in [0., 0.1, 0.5, 0.8, 1.]] ) From f0ca65ec6ee340ba0309fe91a386d9186a19b5b5 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Thu, 21 Mar 2019 13:44:53 +0100 
Subject: [PATCH 26/41] Remove default_threshold and put calibrate_threshold instead --- metric_learn/base_metric.py | 14 -------------- metric_learn/mmc.py | 2 +- metric_learn/sdml.py | 2 +- test/test_pairs_classifiers.py | 10 ---------- 4 files changed, 2 insertions(+), 26 deletions(-) diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index 77672bba..78450941 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -381,20 +381,6 @@ def score(self, pairs, y): """ return roc_auc_score(y, self.decision_function(pairs)) - def set_default_threshold(self, pairs, y): - """Sets the default threshold on the given dataset. - - Returns a threshold that is the mean between the similar - metrics mean and the dissimilar metrics mean. - - See more in the :ref:`User Guide `. - """ - similar_threshold = np.mean(self.score_pairs( - pairs[(y == 1).ravel()])) - dissimilar_threshold = np.mean(self.score_pairs( - pairs[(y == -1).ravel()])) - self.threshold_ = np.mean([similar_threshold, dissimilar_threshold]) - def set_threshold(self, threshold): """Sets the threshold of the metric learner to the given value `threshold`. diff --git a/metric_learn/mmc.py b/metric_learn/mmc.py index 2ddcced2..0574f572 100644 --- a/metric_learn/mmc.py +++ b/metric_learn/mmc.py @@ -385,7 +385,7 @@ def fit(self, pairs, y): Returns the instance. """ self._fit(pairs, y) - self.set_default_threshold(pairs, y) + self.calibrate_threshold(pairs, y) return self diff --git a/metric_learn/sdml.py b/metric_learn/sdml.py index 096dc0ed..d11bf3cc 100644 --- a/metric_learn/sdml.py +++ b/metric_learn/sdml.py @@ -107,7 +107,7 @@ def fit(self, pairs, y): Returns the instance. """ self._fit(pairs, y) - self.set_default_threshold(pairs, y) + self.calibrate_threshold(pairs, y) return self diff --git a/test/test_pairs_classifiers.py b/test/test_pairs_classifiers.py index f85e6125..eb3e6b4b 100644 --- a/test/test_pairs_classifiers.py +++ b/test/test_pairs_classifiers.py @@ -120,16 +120,6 @@ def test_set_threshold(): assert identity_pairs_classifier.threshold_ == 0.5 -def test_set_default_threshold_toy_example(): - # test that the default threshold has the right value on a toy example - identity_pairs_classifier = IdentityPairsClassifier() - pairs = np.array([[[0.], [1.]], [[1.], [3.]], [[2.], [5.]], [[3.], [7.]]]) - y = np.array([1, 1, -1, -1]) - identity_pairs_classifier.fit(pairs, y) - identity_pairs_classifier.set_default_threshold(pairs, y) - assert identity_pairs_classifier.threshold_ == 2.5 - - def test_f_beta_1_is_f_1(): # test that putting beta to 1 indeed finds the best threshold to optimize # the f1_score From a6ec2831b22e9de3efe51f83f0ef5cc3af1659db Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Thu, 21 Mar 2019 13:54:31 +0100 Subject: [PATCH 27/41] Use calibrate_threshold for ITML, and remove description --- metric_learn/itml.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/metric_learn/itml.py b/metric_learn/itml.py index 57df1471..9ce736f9 100644 --- a/metric_learn/itml.py +++ b/metric_learn/itml.py @@ -158,8 +158,6 @@ class ITML(_BaseITML, _PairsClassifierMixin): def fit(self, pairs, y, bounds=None): """Learn the ITML model. - The default threshold will be set to the mean of the bounds. - Parameters ---------- pairs: array-like, shape=(n_constraints, 2, n_features) or @@ -184,7 +182,7 @@ def fit(self, pairs, y, bounds=None): Returns the instance. 
""" self._fit(pairs, y, bounds=bounds) - self.threshold_ = np.mean(self.bounds_) + self.threshold_ = self.calibrate_threshold(pairs, y) return self From 49fbbd783e7bbc29574584944c8cba5813a6c855 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Thu, 21 Mar 2019 14:14:13 +0100 Subject: [PATCH 28/41] ENH: use calibrate_threshold by default and display its parameters from the fit method --- metric_learn/itml.py | 13 +++++++++++-- metric_learn/mmc.py | 18 +++++++++++++----- metric_learn/sdml.py | 17 +++++++++++++---- 3 files changed, 37 insertions(+), 11 deletions(-) diff --git a/metric_learn/itml.py b/metric_learn/itml.py index 9ce736f9..09ee0bbe 100644 --- a/metric_learn/itml.py +++ b/metric_learn/itml.py @@ -155,9 +155,12 @@ class ITML(_BaseITML, _PairsClassifierMixin): classified as dissimilar. """ - def fit(self, pairs, y, bounds=None): + def fit(self, pairs, y, bounds=None, threshold_params=None): """Learn the ITML model. + The threshold will be calibrated on the trainset using the parameters + `threshold_params`. + Parameters ---------- pairs: array-like, shape=(n_constraints, 2, n_features) or @@ -175,6 +178,10 @@ def fit(self, pairs, y, bounds=None): If not provided at initialization, bounds_[0] and bounds_[1] will be set to the 5th and 95th percentile of the pairwise distances among all points present in the input `pairs`. + threshold_params : `dict` or `None` + Dictionary of parameters to give to `calibrate_threshold` for the + threshold calibration step done at the end of `fit`. If `None` is + given, `calibrate_threshold` will use the default parameters. Returns ------- @@ -182,7 +189,9 @@ def fit(self, pairs, y, bounds=None): Returns the instance. """ self._fit(pairs, y, bounds=bounds) - self.threshold_ = self.calibrate_threshold(pairs, y) + self.calibrate_threshold(pairs, y, **(threshold_params if + threshold_params is not None else + dict())) return self diff --git a/metric_learn/mmc.py b/metric_learn/mmc.py index 0574f572..30e935d2 100644 --- a/metric_learn/mmc.py +++ b/metric_learn/mmc.py @@ -366,26 +366,34 @@ class MMC(_BaseMMC, _PairsClassifierMixin): classified as dissimilar. """ - def fit(self, pairs, y): + def fit(self, pairs, y, threshold_params=None): """Learn the MMC model. + The threshold will be calibrated on the trainset using the parameters + `threshold_params`. + Parameters ---------- - pairs: array-like, shape=(n_constraints, 2, n_features) or + pairs : array-like, shape=(n_constraints, 2, n_features) or (n_constraints, 2) 3D Array of pairs with each row corresponding to two points, or 2D array of indices of pairs if the metric learner uses a preprocessor. - y: array-like, of shape (n_constraints,) + y : array-like, of shape (n_constraints,) Labels of constraints. Should be -1 for dissimilar pair, 1 for similar. - + threshold_params : `dict` or `None` + Dictionary of parameters to give to `calibrate_threshold` for the + threshold calibration step done at the end of `fit`. If `None` is + given, `calibrate_threshold` will use the default parameters. Returns ------- self : object Returns the instance. """ self._fit(pairs, y) - self.calibrate_threshold(pairs, y) + self.calibrate_threshold(pairs, y, **(threshold_params if + threshold_params is not None else + dict())) return self diff --git a/metric_learn/sdml.py b/metric_learn/sdml.py index d11bf3cc..ee268ad5 100644 --- a/metric_learn/sdml.py +++ b/metric_learn/sdml.py @@ -88,18 +88,25 @@ class SDML(_BaseSDML, _PairsClassifierMixin): classified as dissimilar. 
""" - def fit(self, pairs, y): + def fit(self, pairs, y, threshold_params=None): """Learn the SDML model. + The threshold will be calibrated on the trainset using the parameters + `threshold_params`. + Parameters ---------- - pairs: array-like, shape=(n_constraints, 2, n_features) or + pairs : array-like, shape=(n_constraints, 2, n_features) or (n_constraints, 2) 3D Array of pairs with each row corresponding to two points, or 2D array of indices of pairs if the metric learner uses a preprocessor. - y: array-like, of shape (n_constraints,) + y : array-like, of shape (n_constraints,) Labels of constraints. Should be -1 for dissimilar pair, 1 for similar. + threshold_params : `dict` or `None` + Dictionary of parameters to give to `calibrate_threshold` for the + threshold calibration step done at the end of `fit`. If `None` is + given, `calibrate_threshold` will use the default parameters. Returns ------- @@ -107,7 +114,9 @@ def fit(self, pairs, y): Returns the instance. """ self._fit(pairs, y) - self.calibrate_threshold(pairs, y) + self.calibrate_threshold(pairs, y, **(threshold_params if + threshold_params is not None else + dict())) return self From 960b1748fbe8c103a6ffebd50dfa78f9d5518afa Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Thu, 21 Mar 2019 14:30:13 +0100 Subject: [PATCH 29/41] Add a small test to test automatic calibration --- test/test_pairs_classifiers.py | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/test/test_pairs_classifiers.py b/test/test_pairs_classifiers.py index eb3e6b4b..ae37a1f0 100644 --- a/test/test_pairs_classifiers.py +++ b/test/test_pairs_classifiers.py @@ -73,6 +73,29 @@ def test_raise_not_fitted_error_if_not_fitted(estimator, build_dataset, estimator.predict(input_data) +@pytest.mark.parametrize('threshold_param', + [None, {}, dict(), {'strategy': 'accuracy'}] + + [{'strategy': strategy, 'min_rate': min_rate} + for (strategy, min_rate) in product( + ['max_tpr', 'max_tnr'], [0., 0.2, 0.8, 1.])] + + [{'strategy': 'f_beta', 'beta': beta} + for beta in [0., 0.1, 0.2, 1., 5.]] + ) +@pytest.mark.parametrize('with_preprocessor', [True, False]) +@pytest.mark.parametrize('estimator, build_dataset', pairs_learners, + ids=ids_pairs_learners) +def test_fit_with_valid_threshold_params(estimator, build_dataset, + with_preprocessor, threshold_param): + """Tests that fitting `threshold_params` with appropriate parameters works + as expected""" + pairs, y, preprocessor, _ = build_dataset(with_preprocessor) + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + set_random_state(estimator) + estimator.fit(pairs, y, threshold_params=threshold_param) + estimator.predict(pairs) + + @pytest.mark.parametrize('kwargs', [{'strategy': 'accuracy'}] + [{'strategy': strategy, 'min_rate': min_rate} @@ -84,9 +107,9 @@ def test_raise_not_fitted_error_if_not_fitted(estimator, build_dataset, @pytest.mark.parametrize('with_preprocessor', [True, False]) @pytest.mark.parametrize('estimator, build_dataset', pairs_learners, ids=ids_pairs_learners) -def test_min_rate_different_scores_is_finite(estimator, build_dataset, - with_preprocessor, kwargs): - # test that the score returned is finite for every metric learner +def test_threshold_different_scores_is_finite(estimator, build_dataset, + with_preprocessor, kwargs): + # test that calibrating the threshold works for every metric learner input_data, labels, preprocessor, _ = build_dataset(with_preprocessor) estimator = clone(estimator) 
estimator.set_params(preprocessor=preprocessor) From c91acf787bd5841f034c1de00991ef2a35c4936a Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Thu, 21 Mar 2019 14:44:10 +0100 Subject: [PATCH 30/41] Update documentation of the default threshold --- doc/weakly_supervised.rst | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/doc/weakly_supervised.rst b/doc/weakly_supervised.rst index 2868f9e8..6bf6f993 100644 --- a/doc/weakly_supervised.rst +++ b/doc/weakly_supervised.rst @@ -168,16 +168,20 @@ learned space) than this threshold are predicted as similar, and points further away are predicted as dissimilar. Several methods are possible for this thresholding. -- **default**: Unless explicitely stated in the `fit` method documentation - of the estimator, the threshold is set with the method - `set_default_threshold` on the trainset. - -- **manual**: calling `set_threshold` will set the threshold to a +- **At fit time**: The threshold is set with `calibrate_threshold` (see + below) on the trainset. You can specify the calibration parameters directly + in the `fit` method with the `threshold_params` parameter (see the + documentation of the `fit` method of any metric learner that learns on pairs + of points for more information). This method can cause a little bit of + overfitting. If you want to avoid that, calibrate the threshold after + fitting, on a validation set. + +- **Manual**: calling `set_threshold` will set the threshold to a particular value. -- **calibrated**: calling `calibrate_threshold` will calibrate the threshold to - achieve a particular score on a validation set, the score being among the - classical scores for classification (accuracy, f1 score...). +- **Calibration**: calling `calibrate_threshold` will calibrate the + threshold to achieve a particular score on a validation set, the score + being among the classical scores for classification (accuracy, f1 score...). See also: `sklearn.calibration`. From a742186bf564805436c6536cc85199f68d45d5b2 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Thu, 21 Mar 2019 15:51:58 +0100 Subject: [PATCH 31/41] Inverse sense for threshold comparison to be more intuitive --- metric_learn/base_metric.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index 78450941..f65c1f96 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -326,7 +326,7 @@ def predict(self, pairs): The predicted learned metric value between samples in every pair. """ check_is_fitted(self, ['threshold_', 'transformer_']) - return 2 * (self.decision_function(pairs) >= - self.threshold_) - 1 + return 2 * (- self.decision_function(pairs) <= self.threshold_) - 1 def decision_function(self, pairs): """Returns the decision function used to classify the pairs. 
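As a usage sketch of the three thresholding options described in the documentation hunk above (MMC is just one example of a pairs learner; `pairs_train`, `y_train`, `pairs_valid` and `y_valid` are placeholder arrays of pairs and +1/-1 labels, not defined in this patch series):

from metric_learn import MMC

mmc = MMC()

# 1) at fit time: calibrate on the training pairs (may slightly overfit the
#    threshold), with the parameters forwarded to calibrate_threshold
mmc.fit(pairs_train, y_train,
        threshold_params={'strategy': 'f_beta', 'beta': 2.})

# 2) manually: set the distance threshold to a chosen value
mmc.set_threshold(0.8)

# 3) on held-out pairs: recalibrate to optimize a classification score
mmc.calibrate_threshold(pairs_valid, y_valid, strategy='accuracy')

# with the comparison introduced above, pairs at a learned distance below
# threshold_ are predicted +1 (similar), the others -1 (dissimilar)
predictions = mmc.predict(pairs_valid)
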
From 9ec1ead7b0fa04ff08c563f42fdcf92082ecfc98 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Thu, 21 Mar 2019 15:58:48 +0100 Subject: [PATCH 32/41] Address remaining review comments --- metric_learn/base_metric.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index f65c1f96..ac850c4f 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -504,9 +504,15 @@ def calibrate_threshold(self, pairs_valid, y_valid, strategy='accuracy', if strategy == 'f_beta': precision, recall, thresholds = precision_recall_curve( y_valid, self.decision_function(pairs_valid), pos_label=1) + # We ignore the warnings here, in the same taste as + # https://github.com/scikit-learn/scikit-learn/blob/62d205980446a1abc1065 + # f4332fd74eee57fcf73/sklearn/metrics/classification.py#L1284 with np.errstate(divide='ignore', invalid='ignore'): f_beta = ((1 + beta**2) * (precision * recall) / (beta**2 * precision + recall)) + # We need to set nans to zero otherwise they will be considered higher + # than the others (also discussed in https://github.com/scikit-learn/ + # scikit-learn/pull/10117/files#r262115773) f_beta[np.isnan(f_beta)] = 0. imax = np.argmax(f_beta) # note: we want a positive threshold (distance), so we take - threshold From 986fed31148c28aa29587164bde7c955876ee3d1 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 26 Mar 2019 10:06:19 +0100 Subject: [PATCH 33/41] MAINT: Rename threshold_params into calibration_params --- metric_learn/itml.py | 10 +++++----- metric_learn/mmc.py | 10 +++++----- metric_learn/sdml.py | 10 +++++----- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/metric_learn/itml.py b/metric_learn/itml.py index 09ee0bbe..76b790f3 100644 --- a/metric_learn/itml.py +++ b/metric_learn/itml.py @@ -155,11 +155,11 @@ class ITML(_BaseITML, _PairsClassifierMixin): classified as dissimilar. """ - def fit(self, pairs, y, bounds=None, threshold_params=None): + def fit(self, pairs, y, bounds=None, calibration_params=None): """Learn the ITML model. The threshold will be calibrated on the trainset using the parameters - `threshold_params`. + `calibration_params`. Parameters ---------- @@ -178,7 +178,7 @@ def fit(self, pairs, y, bounds=None, threshold_params=None): If not provided at initialization, bounds_[0] and bounds_[1] will be set to the 5th and 95th percentile of the pairwise distances among all points present in the input `pairs`. - threshold_params : `dict` or `None` + calibration_params : `dict` or `None` Dictionary of parameters to give to `calibrate_threshold` for the threshold calibration step done at the end of `fit`. If `None` is given, `calibrate_threshold` will use the default parameters. @@ -189,8 +189,8 @@ def fit(self, pairs, y, bounds=None, threshold_params=None): Returns the instance. """ self._fit(pairs, y, bounds=bounds) - self.calibrate_threshold(pairs, y, **(threshold_params if - threshold_params is not None else + self.calibrate_threshold(pairs, y, **(calibration_params if + calibration_params is not None else dict())) return self diff --git a/metric_learn/mmc.py b/metric_learn/mmc.py index 30e935d2..1d4e8fa6 100644 --- a/metric_learn/mmc.py +++ b/metric_learn/mmc.py @@ -366,11 +366,11 @@ class MMC(_BaseMMC, _PairsClassifierMixin): classified as dissimilar. """ - def fit(self, pairs, y, threshold_params=None): + def fit(self, pairs, y, calibration_params=None): """Learn the MMC model. 
The threshold will be calibrated on the trainset using the parameters - `threshold_params`. + `calibration_params`. Parameters ---------- @@ -381,7 +381,7 @@ def fit(self, pairs, y, threshold_params=None): preprocessor. y : array-like, of shape (n_constraints,) Labels of constraints. Should be -1 for dissimilar pair, 1 for similar. - threshold_params : `dict` or `None` + calibration_params : `dict` or `None` Dictionary of parameters to give to `calibrate_threshold` for the threshold calibration step done at the end of `fit`. If `None` is given, `calibrate_threshold` will use the default parameters. @@ -391,8 +391,8 @@ def fit(self, pairs, y, threshold_params=None): Returns the instance. """ self._fit(pairs, y) - self.calibrate_threshold(pairs, y, **(threshold_params if - threshold_params is not None else + self.calibrate_threshold(pairs, y, **(calibration_params if + calibration_params is not None else dict())) return self diff --git a/metric_learn/sdml.py b/metric_learn/sdml.py index ee268ad5..0d24d096 100644 --- a/metric_learn/sdml.py +++ b/metric_learn/sdml.py @@ -88,11 +88,11 @@ class SDML(_BaseSDML, _PairsClassifierMixin): classified as dissimilar. """ - def fit(self, pairs, y, threshold_params=None): + def fit(self, pairs, y, calibration_params=None): """Learn the SDML model. The threshold will be calibrated on the trainset using the parameters - `threshold_params`. + `calibration_params`. Parameters ---------- @@ -103,7 +103,7 @@ def fit(self, pairs, y, threshold_params=None): preprocessor. y : array-like, of shape (n_constraints,) Labels of constraints. Should be -1 for dissimilar pair, 1 for similar. - threshold_params : `dict` or `None` + calibration_params : `dict` or `None` Dictionary of parameters to give to `calibrate_threshold` for the threshold calibration step done at the end of `fit`. If `None` is given, `calibrate_threshold` will use the default parameters. @@ -114,8 +114,8 @@ def fit(self, pairs, y, threshold_params=None): Returns the instance. """ self._fit(pairs, y) - self.calibrate_threshold(pairs, y, **(threshold_params if - threshold_params is not None else + self.calibrate_threshold(pairs, y, **(calibration_params if + calibration_params is not None else dict())) return self From 3f5d6d1447c1fc01ed51f2fc24daf7b52adcb1ba Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 26 Mar 2019 12:06:25 +0100 Subject: [PATCH 34/41] TST: Add test for extreme cases --- test/test_pairs_classifiers.py | 78 ++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) diff --git a/test/test_pairs_classifiers.py b/test/test_pairs_classifiers.py index ae37a1f0..d02a2078 100644 --- a/test/test_pairs_classifiers.py +++ b/test/test_pairs_classifiers.py @@ -330,3 +330,81 @@ def test_calibrate_threshold_valid_parameters(valid_args): with pytest.warns(None) as record: pairs_learner.calibrate_threshold(pairs, y, **valid_args) assert len(record) == 0 + + +def test_calibrate_threshold_extreme(): + """Test that in the (rare) case where we should accept all points or + reject all points, this is effectively what + is done""" + + class MockBadPairsClassifier(MahalanobisMixin, _PairsClassifierMixin): + """A pairs classifier that returns bad scores (i.e. 
in the inverse order + of what we would expect from a good pairs classifier + """ + + def fit(self, pairs, y, calibration_params=None): + self.transformer_ = 'not used' + self.calibrate_threshold(pairs, y, **(calibration_params if + calibration_params is not None else + dict())) + return self + + def decision_function(self, pairs): + return np.arange(7) + rng = np.random.RandomState(42) + pairs = rng.randn(7, 2, 5) # the info in X is not used, it's just for the + # API + + y = [1, 1, 1, -1, -1, -1, -1] + mock_clf = MockBadPairsClassifier() + # case of bad scoring with more negative than positives. In + # this case, when: + # optimizing for accuracy we should reject all points + mock_clf.fit(pairs, y, calibration_params={'strategy': 'accuracy'}) + assert (mock_clf.predict(pairs) == - np.ones(7)).all() + + # optimizing for max_tpr we should accept all points if min_rate == 0. ( + # because by convention then tnr=0/0=0) + mock_clf.fit(pairs, y, calibration_params={'strategy': 'max_tpr', + 'min_rate': 0.}) + assert (mock_clf.predict(pairs) == np.ones(7)).all() + # optimizing for max_tnr we should reject all points if min_rate = 0. ( + # because by convention then tpr=0/0=0) + mock_clf.fit(pairs, y, calibration_params={'strategy': 'max_tnr', + 'min_rate': 0.}) + assert (mock_clf.predict(pairs) == - np.ones(7)).all() + + y = [1, 1, 1, 1, -1, -1, -1] + # case of bad scoring with more positives than negatives. In + # this case, when: + # optimizing for accuracy we should accept all points + mock_clf.fit(pairs, y, calibration_params={'strategy': 'accuracy'}) + assert (mock_clf.predict(pairs) == np.ones(7)).all() + # optimizing for max_tpr we should accept all points if min_rate == 0. ( + # because by convention then tnr=0/0=0) + mock_clf.fit(pairs, y, calibration_params={'strategy': 'max_tpr', + 'min_rate': 0.}) + assert (mock_clf.predict(pairs) == np.ones(7)).all() + # optimizing for max_tnr we should reject all points if min_rate = 0. 
( + # because by convention then tpr=0/0=0) + mock_clf.fit(pairs, y, calibration_params={'strategy': 'max_tnr', + 'min_rate': 0.}) + assert (mock_clf.predict(pairs) == - np.ones(7)).all() + + # Note: we'll never find a case where we would reject all points for + # maximizing tpr (we can always accept more points), and accept all + # points for maximizing tnr (we can always reject more points) + + # case of alternated scores: for optimizing the f_1 score we should accept + # all points (because this way we have max recall (1) and max precision ( + # here: 0.5)) + y = [1, -1, 1, -1, 1, -1] + mock_clf.fit(pairs, y, calibration_params={'strategy': 'f_beta', + 'beta': 1.}) + assert (mock_clf.predict(pairs) == - np.ones(7)).all() + + # Note: for optimizing f_1 score, we will never find an optimal case where we + # reject all points because in this case we would have 0 precision (by + # convention, because it's 0/0), and 0 recall (and we could always decrease + # the threshold to increase the recall, and we couldn't do worse for + # precision so it would be better) From 7b5e4ddd0cbf2489cd7a001e66121226cf32ed60 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 26 Mar 2019 17:03:58 +0100 Subject: [PATCH 35/41] MAINT: rename threshold_params into calibration_params --- test/test_pairs_classifiers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_pairs_classifiers.py b/test/test_pairs_classifiers.py index d02a2078..be780dc6 100644 --- a/test/test_pairs_classifiers.py +++ b/test/test_pairs_classifiers.py @@ -92,7 +92,7 @@ def test_fit_with_valid_threshold_params(estimator, build_dataset, estimator = clone(estimator) estimator.set_params(preprocessor=preprocessor) set_random_state(estimator) - estimator.fit(pairs, y, threshold_params=threshold_param) + estimator.fit(pairs, y, calibration_params=threshold_param) estimator.predict(pairs) From a3ec02c364145691bdaff8c7beb723d4a2138870 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 26 Mar 2019 17:07:15 +0100 Subject: [PATCH 36/41] MAINT: rename threshold_params into calibration_params --- test/test_pairs_classifiers.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/test/test_pairs_classifiers.py b/test/test_pairs_classifiers.py index be780dc6..70fb6c11 100644 --- a/test/test_pairs_classifiers.py +++ b/test/test_pairs_classifiers.py @@ -73,7 +73,7 @@ def test_raise_not_fitted_error_if_not_fitted(estimator, build_dataset, estimator.predict(input_data) -@pytest.mark.parametrize('threshold_param', +@pytest.mark.parametrize('calibration_params', [None, {}, dict(), {'strategy': 'accuracy'}] + [{'strategy': strategy, 'min_rate': min_rate} for (strategy, min_rate) in product( @@ -85,14 +85,15 @@ def test_raise_not_fitted_error_if_not_fitted(estimator, build_dataset, @pytest.mark.parametrize('estimator, build_dataset', pairs_learners, ids=ids_pairs_learners) def test_fit_with_valid_threshold_params(estimator, build_dataset, - with_preprocessor, threshold_param): - """Tests that fitting `threshold_params` with appropriate parameters works + with_preprocessor, + calibration_params): + """Tests that fitting `calibration_params` with appropriate parameters works as expected""" pairs, y, preprocessor, _ = build_dataset(with_preprocessor) estimator = clone(estimator) estimator.set_params(preprocessor=preprocessor) set_random_state(estimator) - estimator.fit(pairs, y, calibration_params=threshold_param) + estimator.fit(pairs, y, calibration_params=calibration_params) 
estimator.predict(pairs) From ccc66eba9df2c639a86dcda1d725875b7840944b Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Wed, 27 Mar 2019 11:30:35 +0100 Subject: [PATCH 37/41] FIX: Make tests work, and add the right threshold (mean between lowest accepted value and highest rejected value), and max + 1 or min - 1 for extreme points --- metric_learn/base_metric.py | 71 ++++++++++++++++++++++++++-------- test/test_pairs_classifiers.py | 29 +++++++------- 2 files changed, 71 insertions(+), 29 deletions(-) diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index ac850c4f..27c294df 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -1,7 +1,8 @@ from sklearn.base import BaseEstimator +from sklearn.metrics.ranking import _binary_clf_curve from sklearn.utils.extmath import stable_cumsum from sklearn.utils.validation import _is_arraylike, check_is_fitted -from sklearn.metrics import roc_auc_score, precision_recall_curve, roc_curve +from sklearn.metrics import roc_auc_score, roc_curve import numpy as np from abc import ABCMeta, abstractmethod import six @@ -490,20 +491,39 @@ def calibrate_threshold(self, pairs_valid, y_valid, strategy='accuracy', scores_sorted = scores[scores_sorted_idces] # true labels ordered by decision_function value: (higher first) y_ordered = y_valid[scores_sorted_idces] + # we need to add a threshold that will reject all points + scores_sorted = np.concatenate([[scores_sorted[0] + 1], scores_sorted]) + # finds the threshold that maximizes the accuracy: cum_tp = stable_cumsum(y_ordered == 1) # cumulative number of true # positives + # we need to add the point where all samples are rejected: + cum_tp = np.concatenate([[0.], cum_tp]) cum_tn_inverted = stable_cumsum(y_ordered[::-1] == -1) - cum_tn = np.concatenate([[0], cum_tn_inverted[:-1]])[::-1] + cum_tn = np.concatenate([[0.], cum_tn_inverted])[::-1] cum_accuracy = (cum_tp + cum_tn) / n_samples imax = np.argmax(cum_accuracy) # note: we want a positive threshold (distance), so we take - threshold - self.threshold_ = - scores_sorted[imax] + if imax == len(scores_sorted): # if the best is to accept all points + # we set the threshold to (minus) [the lowest score - 1] + self.threshold_ = - (scores_sorted[imax] - 1) + else: + # otherwise, we set the threshold to the mean between the lowest + # accepted score and the highest accepted score + self.threshold_ = - np.mean(scores_sorted[imax: imax + 2]) + # note: if the best is to reject all points it's already one of the + # thresholds (scores_sorted[0] + 1) return self if strategy == 'f_beta': - precision, recall, thresholds = precision_recall_curve( + fps, tps, thresholds = _binary_clf_curve( y_valid, self.decision_function(pairs_valid), pos_label=1) + + precision = tps / (tps + fps) + precision[np.isnan(precision)] = 0 + recall = tps / tps[-1] + + # here the thresholds are decreasing # We ignore the warnings here, in the same taste as # https://github.com/scikit-learn/scikit-learn/blob/62d205980446a1abc1065 # f4332fd74eee57fcf73/sklearn/metrics/classification.py#L1284 @@ -516,26 +536,45 @@ def calibrate_threshold(self, pairs_valid, y_valid, strategy='accuracy', f_beta[np.isnan(f_beta)] = 0. 
imax = np.argmax(f_beta) # note: we want a positive threshold (distance), so we take - threshold - self.threshold_ = - thresholds[imax] + if imax == len(thresholds): # the best is to accept all points + # we set the threshold to (minus) [the lowest score - 1] + self.threshold_ = - (thresholds[imax] - 1) + else: + # otherwise, we set the threshold to the mean between the lowest + # accepted score and the highest rejected score + self.threshold_ = - np.mean(thresholds[imax: imax + 2]) + # Note: we don't need to deal with rejecting all points (i.e. threshold = + # max_scores + 1), since this can never happen to be optimal + # (see a more detailed discussion in test_calibrate_threshold_extreme) return self fpr, tpr, thresholds = roc_curve(y_valid, self.decision_function(pairs_valid), - pos_label=1) + pos_label=1, drop_intermediate=False) + # here the thresholds are decreasing fpr, tpr, thresholds = fpr, tpr, thresholds - if strategy == 'max_tpr': - indices = np.where(1 - fpr >= min_rate)[0] - max_tpr_index = np.argmax(tpr[indices]) - # note: we want a positive threshold (distance), so we take - threshold - self.threshold_ = - thresholds[indices[max_tpr_index]] + if strategy in ['max_tpr', 'max_tnr']: + if strategy == 'max_tpr': + indices = np.where(1 - fpr >= min_rate)[0] + imax = np.argmax(tpr[indices]) - if strategy == 'max_tnr': - indices = np.where(tpr >= min_rate)[0] - max_tnr_index = np.argmax(1 - fpr[indices]) + if strategy == 'max_tnr': + indices = np.where(tpr >= min_rate)[0] + imax = np.argmax(1 - fpr[indices]) + + imax_valid = indices[imax] # note: we want a positive threshold (distance), so we take - threshold - self.threshold_ = - thresholds[indices[max_tnr_index]] - return self + if indices[imax] == len(thresholds): # we want to accept everything + self.threshold_ = - (thresholds[imax_valid] - 1) + elif indices[imax] == 0: # we want to reject everything + # thanks to roc_curve, the first point should be always max_threshold + # + 1 (we should always go through the "if" statement in roc_curve), + # see: https://github.com/scikit-learn/scikit-learn/pull/13523 + self.threshold_ = - (thresholds[imax_valid]) + else: + self.threshold_ = - np.mean(thresholds[imax_valid: imax_valid + 2]) + return self class _QuadrupletsClassifierMixin(BaseMetricLearner): diff --git a/test/test_pairs_classifiers.py b/test/test_pairs_classifiers.py index 70fb6c11..6c1d584e 100644 --- a/test/test_pairs_classifiers.py +++ b/test/test_pairs_classifiers.py @@ -3,6 +3,8 @@ from functools import partial import pytest +from numpy.testing import assert_array_equal + from metric_learn.base_metric import _PairsClassifierMixin, MahalanobisMixin from sklearn.exceptions import NotFittedError from sklearn.metrics import (f1_score, accuracy_score, fbeta_score, @@ -351,46 +353,47 @@ def fit(self, pairs, y, calibration_params=None): return self def decision_function(self, pairs): - return np.arange(7) + return np.arange(pairs.shape[0], dtype=float) + rng = np.random.RandomState(42) pairs = rng.randn(7, 2, 5) # the info in X is not used, it's just for the # API - y = [1, 1, 1, -1, -1, -1, -1] + y = [1., 1., 1., -1., -1., -1., -1.] mock_clf = MockBadPairsClassifier() # case of bad scoring with more negative than positives. 
In # this case, when: # optimizing for accuracy we should reject all points mock_clf.fit(pairs, y, calibration_params={'strategy': 'accuracy'}) - assert (mock_clf.predict(pairs) == - np.ones(7)).all() + assert_array_equal(mock_clf.predict(pairs), - np.ones(7)) # optimizing for max_tpr we should accept all points if min_rate == 0. ( # because by convention then tnr=0/0=0) mock_clf.fit(pairs, y, calibration_params={'strategy': 'max_tpr', 'min_rate': 0.}) - assert (mock_clf.predict(pairs) == np.ones(7)).all() + assert_array_equal(mock_clf.predict(pairs), np.ones(7)) # optimizing for max_tnr we should reject all points if min_rate = 0. ( # because by convention then tpr=0/0=0) mock_clf.fit(pairs, y, calibration_params={'strategy': 'max_tnr', 'min_rate': 0.}) - assert (mock_clf.predict(pairs) == - np.ones(7)).all() + assert_array_equal(mock_clf.predict(pairs), - np.ones(7)) - y = [1, 1, 1, 1, -1, -1, -1] + y = [1., 1., 1., 1., -1., -1., -1.] # case of bad scoring with more positives than negatives. In # this case, when: # optimizing for accuracy we should accept all points mock_clf.fit(pairs, y, calibration_params={'strategy': 'accuracy'}) - assert (mock_clf.predict(pairs) == np.ones(7)).all() + assert_array_equal(mock_clf.predict(pairs), np.ones(7)) # optimizing for max_tpr we should accept all points if min_rate == 0. ( # because by convention then tnr=0/0=0) mock_clf.fit(pairs, y, calibration_params={'strategy': 'max_tpr', 'min_rate': 0.}) - assert (mock_clf.predict(pairs) == np.ones(7)).all() + assert_array_equal(mock_clf.predict(pairs), np.ones(7)) # optimizing for max_tnr we should reject all points if min_rate = 0. ( # because by convention then tpr=0/0=0) mock_clf.fit(pairs, y, calibration_params={'strategy': 'max_tnr', 'min_rate': 0.}) - assert (mock_clf.predict(pairs) == - np.ones(7)).all() + assert_array_equal(mock_clf.predict(pairs), - np.ones(7)) # Note: we'll never find a case where we would reject all points for # maximizing tpr (we can always accept more points), and accept all @@ -399,10 +402,10 @@ def decision_function(self, pairs): # case of alternated scores: for optimizing the f_1 score we should accept # all points (because this way we have max recall (1) and max precision ( # here: 0.5)) - y = [1, -1, 1, -1, 1, -1] - mock_clf.fit(pairs, y, calibration_params={'strategy': 'f_beta', - 'beta': 1.}) - assert (mock_clf.predict(pairs) == - np.ones(7)).all() + y = [1., -1., 1., -1., 1., -1.] 
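  # (checking the arithmetic: accepting all six pairs gives precision 0.5 and
  # recall 1, hence f_1 = 2 * 0.5 * 1 / (0.5 + 1) = 2/3 ~ 0.67, whereas e.g.
  # accepting only the four highest-scoring pairs gives precision 0.5 and
  # recall 2/3, hence f_1 = 4/7 ~ 0.57)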
+ mock_clf.fit(pairs[:6], y, calibration_params={'strategy': 'f_beta', + 'beta': 1.}) + assert_array_equal(mock_clf.predict(pairs[:6]), np.ones(6)) # Note: for optimizing f_1 score, we will never find an optimal case where we # reject all points because in this case we would have 0 precision (by From 719d0183e477fbf5cb841fe85b3c07155600e273 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 2 Apr 2019 13:54:37 +0200 Subject: [PATCH 38/41] Go back to previous version of finding the threshold --- metric_learn/base_metric.py | 40 ++++++++++--------------------------- 1 file changed, 11 insertions(+), 29 deletions(-) diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index 27c294df..83cefe85 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -1,8 +1,7 @@ from sklearn.base import BaseEstimator -from sklearn.metrics.ranking import _binary_clf_curve from sklearn.utils.extmath import stable_cumsum from sklearn.utils.validation import _is_arraylike, check_is_fitted -from sklearn.metrics import roc_auc_score, roc_curve +from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve import numpy as np from abc import ABCMeta, abstractmethod import six @@ -503,26 +502,17 @@ def calibrate_threshold(self, pairs_valid, y_valid, strategy='accuracy', cum_tn = np.concatenate([[0.], cum_tn_inverted])[::-1] cum_accuracy = (cum_tp + cum_tn) / n_samples imax = np.argmax(cum_accuracy) + # we set the threshold to the lowest accepted score # note: we want a positive threshold (distance), so we take - threshold - if imax == len(scores_sorted): # if the best is to accept all points - # we set the threshold to (minus) [the lowest score - 1] - self.threshold_ = - (scores_sorted[imax] - 1) - else: - # otherwise, we set the threshold to the mean between the lowest - # accepted score and the highest accepted score - self.threshold_ = - np.mean(scores_sorted[imax: imax + 2]) + self.threshold_ = - scores_sorted[imax] # note: if the best is to reject all points it's already one of the - # thresholds (scores_sorted[0] + 1) + # thresholds (scores_sorted[0]) return self if strategy == 'f_beta': - fps, tps, thresholds = _binary_clf_curve( + precision, recall, thresholds = precision_recall_curve( y_valid, self.decision_function(pairs_valid), pos_label=1) - precision = tps / (tps + fps) - precision[np.isnan(precision)] = 0 - recall = tps / tps[-1] - # here the thresholds are decreasing # We ignore the warnings here, in the same taste as # https://github.com/scikit-learn/scikit-learn/blob/62d205980446a1abc1065 @@ -535,14 +525,9 @@ def calibrate_threshold(self, pairs_valid, y_valid, strategy='accuracy', # scikit-learn/pull/10117/files#r262115773) f_beta[np.isnan(f_beta)] = 0. imax = np.argmax(f_beta) + # we set the threshold to the lowest accepted score # note: we want a positive threshold (distance), so we take - threshold - if imax == len(thresholds): # the best is to accept all points - # we set the threshold to (minus) [the lowest score - 1] - self.threshold_ = - (thresholds[imax] - 1) - else: - # otherwise, we set the threshold to the mean between the lowest - # accepted score and the highest rejected score - self.threshold_ = - np.mean(thresholds[imax: imax + 2]) + self.threshold_ = - thresholds[imax] # Note: we don't need to deal with rejecting all points (i.e. 
From 551d1619fb430e6bf8a916a65069cebb4e147063 Mon Sep 17 00:00:00 2001
From: William de Vazelhes
Date: Tue, 2 Apr 2019 13:59:20 +0200
Subject: [PATCH 39/41] Extract method for validating calibration parameters

---
 metric_learn/base_metric.py | 36 +++++++++++++++++++-----------------
 1 file changed, 19 insertions(+), 17 deletions(-)

diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py
index 83cefe85..56e667af 100644
--- a/metric_learn/base_metric.py
+++ b/metric_learn/base_metric.py
@@ -462,23 +462,7 @@ def calibrate_threshold(self, pairs_valid, y_valid, strategy='accuracy',
 
     sklearn.calibration : scikit-learn's module for calibrating classifiers
     """
-    if strategy not in ('accuracy', 'f_beta', 'max_tpr',
-                        'max_tnr'):
-      raise ValueError('Strategy can either be "accuracy", "f_beta" or '
-                       '"max_tpr" or "max_tnr". Got "{}" instead.'
-                       .format(strategy))
-
-    if strategy == 'max_tpr' or strategy == 'max_tnr':
-      if (min_rate is None or not isinstance(min_rate, (int, float)) or
-              not min_rate >= 0 or not min_rate <= 1):
-        raise ValueError('Parameter min_rate must be a number in'
-                         '[0, 1]. '
-                         'Got {} instead.'.format(min_rate))
-
-    if strategy == 'f_beta':
-      if beta is None or not isinstance(beta, (int, float)):
-        raise ValueError('Parameter beta must be a real number. '
-                         'Got {} instead.'.format(type(beta)))
+    self._validate_calibration_parameters(beta, min_rate, strategy)
 
     pairs_valid, y_valid = self._prepare_inputs(pairs_valid, y_valid,
                                                 type_of_inputs='tuples')
@@ -558,6 +542,24 @@ def calibrate_threshold(self, pairs_valid, y_valid, strategy='accuracy',
       self.threshold_ = - thresholds[imax_valid]
     return self
 
+  def _validate_calibration_parameters(self, beta, min_rate, strategy):
+    """Ensure that calibration parameters have allowed values"""
+    if strategy not in ('accuracy', 'f_beta', 'max_tpr',
+                        'max_tnr'):
+      raise ValueError('Strategy can either be "accuracy", "f_beta" or '
+                       '"max_tpr" or "max_tnr". Got "{}" instead.'
+                       .format(strategy))
+    if strategy == 'max_tpr' or strategy == 'max_tnr':
+      if (min_rate is None or not isinstance(min_rate, (int, float)) or
+              not min_rate >= 0 or not min_rate <= 1):
+        raise ValueError('Parameter min_rate must be a number in'
+                         '[0, 1]. '
+                         'Got {} instead.'.format(min_rate))
+    if strategy == 'f_beta':
+      if beta is None or not isinstance(beta, (int, float)):
+        raise ValueError('Parameter beta must be a real number. '
+                         'Got {} instead.'.format(type(beta)))
+
+
 class _QuadrupletsClassifierMixin(BaseMetricLearner):
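
For readers skimming the diff, the extracted helper is pure parameter checking with no fitted state involved. Below is a standalone restatement of the same rules (a module-level function with a made-up name, not the library method), followed by a few calls showing which inputs pass and which raise:

    def validate_calibration_params(strategy='accuracy', min_rate=None,
                                    beta=1.):
        if strategy not in ('accuracy', 'f_beta', 'max_tpr', 'max_tnr'):
            raise ValueError('Strategy can either be "accuracy", "f_beta" or '
                             '"max_tpr" or "max_tnr". Got "{}" instead.'
                             .format(strategy))
        if strategy in ('max_tpr', 'max_tnr'):
            # min_rate must be a real number within [0, 1]
            if (min_rate is None or not isinstance(min_rate, (int, float)) or
                    not 0 <= min_rate <= 1):
                raise ValueError('Parameter min_rate must be a number in '
                                 '[0, 1]. Got {} instead.'.format(min_rate))
        if strategy == 'f_beta':
            # beta only needs to be a real number (negative values pass)
            if beta is None or not isinstance(beta, (int, float)):
                raise ValueError('Parameter beta must be a real number. '
                                 'Got {} instead.'.format(type(beta)))

    validate_calibration_params(strategy='max_tpr', min_rate=0.2)   # passes
    for bad in ({'strategy': 'weird'},
                {'strategy': 'max_tnr', 'min_rate': 1.2},
                {'strategy': 'f_beta', 'beta': None}):
        try:
            validate_calibration_params(**bad)
        except ValueError as error:
            print(error)
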
From 594c485fe01dd4242ba0986059103f17c8d31071 Mon Sep 17 00:00:00 2001
From: William de Vazelhes
Date: Tue, 2 Apr 2019 15:27:23 +0200
Subject: [PATCH 40/41] Validate calibration params before fit

---
 metric_learn/base_metric.py    |  6 ++-
 metric_learn/itml.py           |  9 ++--
 metric_learn/mmc.py            |  7 ++--
 metric_learn/sdml.py           |  7 ++--
 test/test_pairs_classifiers.py | 77 ++++++++++++++++++++++++++++++++++
 5 files changed, 94 insertions(+), 12 deletions(-)

diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py
index 56e667af..d4f00b13 100644
--- a/metric_learn/base_metric.py
+++ b/metric_learn/base_metric.py
@@ -462,7 +462,7 @@ def calibrate_threshold(self, pairs_valid, y_valid, strategy='accuracy',
 
     sklearn.calibration : scikit-learn's module for calibrating classifiers
     """
-    self._validate_calibration_parameters(beta, min_rate, strategy)
+    self._validate_calibration_params(strategy, min_rate, beta)
 
     pairs_valid, y_valid = self._prepare_inputs(pairs_valid, y_valid,
                                                 type_of_inputs='tuples')
@@ -542,7 +542,9 @@ def calibrate_threshold(self, pairs_valid, y_valid, strategy='accuracy',
       self.threshold_ = - thresholds[imax_valid]
     return self
 
-  def _validate_calibration_parameters(self, beta, min_rate, strategy):
+  @staticmethod
+  def _validate_calibration_params(strategy='accuracy', min_rate=None,
+                                   beta=1.):
     """Ensure that calibration parameters have allowed values"""
     if strategy not in ('accuracy', 'f_beta', 'max_tpr',
                         'max_tnr'):
       raise ValueError('Strategy can either be "accuracy", "f_beta" or '
                        '"max_tpr" or "max_tnr". Got "{}" instead.'
                        .format(strategy))
diff --git a/metric_learn/itml.py b/metric_learn/itml.py
index 76b790f3..9b6dccb2 100644
--- a/metric_learn/itml.py
+++ b/metric_learn/itml.py
@@ -188,10 +188,11 @@ def fit(self, pairs, y, bounds=None, calibration_params=None):
     self : object
       Returns the instance.
     """
-    self._fit(pairs, y, bounds=bounds)
-    self.calibrate_threshold(pairs, y, **(calibration_params if
-                                          calibration_params is not None else
-                                          dict()))
+    calibration_params = (calibration_params if calibration_params is not
+                          None else dict())
+    self._validate_calibration_params(**calibration_params)
+    self._fit(pairs, y)
+    self.calibrate_threshold(pairs, y, **calibration_params)
     return self
diff --git a/metric_learn/mmc.py b/metric_learn/mmc.py
index 1d4e8fa6..346db2f8 100644
--- a/metric_learn/mmc.py
+++ b/metric_learn/mmc.py
@@ -390,10 +390,11 @@ def fit(self, pairs, y, calibration_params=None):
     self : object
       Returns the instance.
     """
+    calibration_params = (calibration_params if calibration_params is not
+                          None else dict())
+    self._validate_calibration_params(**calibration_params)
     self._fit(pairs, y)
-    self.calibrate_threshold(pairs, y, **(calibration_params if
-                                          calibration_params is not None else
-                                          dict()))
+    self.calibrate_threshold(pairs, y, **calibration_params)
     return self
diff --git a/metric_learn/sdml.py b/metric_learn/sdml.py
index a07a8b49..e9828d07 100644
--- a/metric_learn/sdml.py
+++ b/metric_learn/sdml.py
@@ -170,10 +170,11 @@ def fit(self, pairs, y, calibration_params=None):
     self : object
       Returns the instance.
     """
+    calibration_params = (calibration_params if calibration_params is not
+                          None else dict())
+    self._validate_calibration_params(**calibration_params)
     self._fit(pairs, y)
-    self.calibrate_threshold(pairs, y, **(calibration_params if
-                                          calibration_params is not None else
-                                          dict()))
+    self.calibrate_threshold(pairs, y, **calibration_params)
     return self
""" + calibration_params = (calibration_params if calibration_params is not + None else dict()) + self._validate_calibration_params(**calibration_params) self._fit(pairs, y) - self.calibrate_threshold(pairs, y, **(calibration_params if - calibration_params is not None else - dict())) + self.calibrate_threshold(pairs, y, **calibration_params) return self diff --git a/test/test_pairs_classifiers.py b/test/test_pairs_classifiers.py index 6c1d584e..828181cb 100644 --- a/test/test_pairs_classifiers.py +++ b/test/test_pairs_classifiers.py @@ -412,3 +412,80 @@ def decision_function(self, pairs): # convention, because it's 0/0), and 0 recall (and we could always decrease # the threshold to increase the recall, and we couldn't do worse for # precision so it would be better) + + +@pytest.mark.parametrize('estimator, _', + pairs_learners + [(IdentityPairsClassifier(), None), + (_PairsClassifierMixin, None)], + ids=ids_pairs_learners + ['mock', 'class']) +@pytest.mark.parametrize('invalid_args, expected_msg', + [({'strategy': 'weird'}, + ('Strategy can either be "accuracy", "f_beta" or ' + '"max_tpr" or "max_tnr". Got "weird" instead.'))] + + [({'strategy': strategy, 'min_rate': min_rate}, + 'Parameter min_rate must be a number in' + '[0, 1]. Got {} instead.'.format(min_rate)) + for (strategy, min_rate) in product( + ['max_tpr', 'max_tnr'], + [None, 'weird', -0.2, 1.2, 3 + 2j])] + + [({'strategy': 'f_beta', 'beta': beta}, + 'Parameter beta must be a real number. ' + 'Got {} instead.'.format(type(beta))) + for beta in [None, 'weird', 3 + 2j]] + ) +def test_validate_calibration_params_invalid_parameters_right_error( + estimator, _, invalid_args, expected_msg): + # test that the right error message is returned if invalid arguments are + # given to _validate_calibration_params, for all pairs metric learners as + # well as a mocking general identity pairs classifier and the class itself + with pytest.raises(ValueError) as raised_error: + estimator._validate_calibration_params(**invalid_args) + assert str(raised_error.value) == expected_msg + + +@pytest.mark.parametrize('estimator, _', + pairs_learners + [(IdentityPairsClassifier(), None), + (_PairsClassifierMixin, None)], + ids=ids_pairs_learners + ['mock', 'class']) +@pytest.mark.parametrize('valid_args', + [{}, {'strategy': 'accuracy'}] + + [{'strategy': strategy, 'min_rate': min_rate} + for (strategy, min_rate) in product( + ['max_tpr', 'max_tnr'], + [0., 0.2, 0.8, 1.])] + + [{'strategy': 'f_beta', 'beta': beta} + for beta in [-5., -1., 0., 0.1, 0.2, 1., 5.]] + # Note that we authorize beta < 0 (even if + # in fact it will be squared, so it would be useless + # to do that) + ) +def test_validate_calibration_params_valid_parameters( + estimator, _, valid_args): + # test that no warning message is returned if valid arguments are given to + # _validate_calibration_params for all pairs metric learners, as well as + # a mocking example, and the class itself + with pytest.warns(None) as record: + estimator._validate_calibration_params(**valid_args) + assert len(record) == 0 + + +@pytest.mark.parametrize('estimator, build_dataset', + pairs_learners, + ids=ids_pairs_learners) +def test_validate_calibration_params_invalid_parameters_error_before__fit( + estimator, build_dataset): + """For all pairs metric learners (which currently all have a _fit method), + make sure that calibration parameters are validated before fitting""" + estimator = clone(estimator) + input_data, labels, _, _ = build_dataset() + + def breaking_fun(**args): # a function that fails so that 
From 14713c638646f40af7b42538ad799bb12f70f3c3 Mon Sep 17 00:00:00 2001
From: William de Vazelhes
Date: Tue, 2 Apr 2019 15:48:09 +0200
Subject: [PATCH 41/41] Address
 https://github.com/metric-learn/metric-learn/pull/168#discussion_r268109180

---
 metric_learn/base_metric.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py
index d4f00b13..9f127f58 100644
--- a/metric_learn/base_metric.py
+++ b/metric_learn/base_metric.py
@@ -487,7 +487,8 @@ def calibrate_threshold(self, pairs_valid, y_valid, strategy='accuracy',
       cum_accuracy = (cum_tp + cum_tn) / n_samples
       imax = np.argmax(cum_accuracy)
       # we set the threshold to the lowest accepted score
-      # note: we want a positive threshold (distance), so we take - threshold
+      # note: we are working with negative distances but we want the threshold
+      # to be with respect to the actual distances so we take minus sign
       self.threshold_ = - scores_sorted[imax]
       # note: if the best is to reject all points it's already one of the
       # thresholds (scores_sorted[0])
@@ -510,7 +511,8 @@ def calibrate_threshold(self, pairs_valid, y_valid, strategy='accuracy',
       f_beta[np.isnan(f_beta)] = 0.
       imax = np.argmax(f_beta)
       # we set the threshold to the lowest accepted score
-      # note: we want a positive threshold (distance), so we take - threshold
+      # note: we are working with negative distances but we want the threshold
+      # to be with respect to the actual distances so we take minus sign
       self.threshold_ = - thresholds[imax]
       # Note: we don't need to deal with rejecting all points (i.e. threshold =
       # max_scores + 1), since this can never happen to be optimal
@@ -533,7 +535,8 @@ def calibrate_threshold(self, pairs_valid, y_valid, strategy='accuracy',
       imax = np.argmax(1 - fpr[indices])
       imax_valid = indices[imax]
 
-      # note: we want a positive threshold (distance), so we take - threshold
+      # note: we are working with negative distances but we want the threshold
+      # to be with respect to the actual distances so we take minus sign
       if indices[imax] == len(thresholds):  # we want to accept everything
         self.threshold_ = - (thresholds[imax_valid] - 1)
       else:
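
All three reworded comments describe the same sign convention, which is easy to miss when reading the diff alone. A toy illustration of it (the array names and the comparison below are assumptions for the sketch, not the library's exact predict code):

    import numpy as np

    distances = np.array([0.2, 0.8, 1.5])   # actual distances between pairs
    scores = - distances                    # decision_function-style scores
    threshold_ = 1.0                        # stored as a positive distance

    # a pair is predicted similar when its score clears minus the threshold,
    # which is the same as its distance being at most threshold_
    predictions = np.where(scores >= - threshold_, 1, -1)
    print(predictions)                      # [ 1  1 -1]
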