ENH: Add check_tuples

William de Vazelhes · William de Vazelhes · commit 585b5d2fda05 · 2018-07-24T16:08:07.000+02:00
diff --git a/metric_learn/_util.py b/metric_learn/_util.py
@@ -9,4 +9,42 @@ def vector_norm(X):
     return np.apply_along_axis(np.linalg.norm, 1, X)
 else:
   def vector_norm(X):
-    return np.linalg.norm(X, axis=1)
+    return np.linalg.norm(X, axis=1)
+
+
+def check_tuples(tuples):
+  """Check that the input is a valid 3D array representing a dataset of tuples.
+
+  Equivalent of `check_array` in scikit-learn.
+
+  Parameters
+  ----------
+  tuples : object
+    The tuples to check. Must expose a ``shape`` attribute (e.g. an ndarray).
+
+  Returns
+  -------
+  tuples_valid : object
+    The validated input, returned as is.
+
+  Raises
+  ------
+  ValueError
+    If `tuples` has 0 (scalar), 1 or 2 dimensions.
+  """
+  if len(tuples.shape) == 0:
+    raise ValueError("Expected 3D array, got scalar instead. Cannot apply "
+                     "this function on scalars.")
+  if len(tuples.shape) == 1:
+    raise ValueError(
+      "Expected 3D array, got 1D array instead:\ntuples={}.\nReshape your "
+      "data using tuples.reshape(1, -1, 1) if it contains a single tuple "
+      "and the points in the tuple have a single feature.".format(tuples))
+  if len(tuples.shape) == 2:
+    raise ValueError(
+      "Expected 3D array, got 2D array instead:\ntuples={}.\n"
+      "Reshape your data either using tuples.reshape(-1, {}, 1) if "
+      "your data has a single feature or tuples.reshape(1, {}, -1) "
+      "if it contains a single tuple.".format(tuples, tuples.shape[1],
+                                              tuples.shape[0]))
+  return tuples
diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py
@@ -5,6 +5,7 @@
 import numpy as np
 from abc import ABCMeta, abstractmethod
 import six
+from ._util import check_tuples
 
 
 class BaseMetricLearner(BaseEstimator):
@@ -86,7 +87,8 @@ def score_pairs(self, pairs):
     scores: `numpy.ndarray` of shape=(n_pairs,)
       The learned Mahalanobis distance for every pair.
     """
-    pairwise_diffs = self.transform(pairs[..., 1, :] - pairs[..., 0, :])
+    pairs = check_tuples(pairs)
+    pairwise_diffs = self.transform(pairs[:, 1, :] - pairs[:, 0, :])
     # (for MahalanobisMixin, the embedding is linear so we can just embed the
     # difference)
     return np.sqrt(np.sum(pairwise_diffs**2, axis=-1))
@@ -108,7 +110,7 @@ def transform(self, X):
     X_embedded : `numpy.ndarray`, shape=(n_samples, num_dims)
       The embedded data points.
     """
-    X_checked = check_array(X, accept_sparse=True, ensure_2d=False)
+    X_checked = check_array(X, accept_sparse=True)
     return X_checked.dot(self.transformer_.T)
 
   def metric(self):
@@ -159,9 +161,11 @@ def predict(self, pairs):
     y_predicted : `numpy.ndarray` of floats, shape=(n_constraints,)
       The predicted learned metric value between samples in every pair.
     """
+    pairs = check_tuples(pairs)
     return self.score_pairs(pairs)
 
   def decision_function(self, pairs):
+    pairs = check_tuples(pairs)
     return self.predict(pairs)
 
   def score(self, pairs, y):
@@ -187,6 +191,7 @@ def score(self, pairs, y):
     score : float
       The ``roc_auc`` score.
     """
+    pairs = check_tuples(pairs)
     return roc_auc_score(y, self.decision_function(pairs))
 
 
@@ -208,6 +213,7 @@ def predict(self, quadruplets):
     prediction : `numpy.ndarray` of floats, shape=(n_constraints,)
       Predictions of the ordering of pairs, for each quadruplet.
     """
+    quadruplets = check_tuples(quadruplets)
     return np.sign(self.decision_function(quadruplets))
 
   def decision_function(self, quadruplets):
@@ -226,8 +232,9 @@ def decision_function(self, quadruplets):
     decision_function : `numpy.ndarray` of floats, shape=(n_constraints,)
       Metric differences.
     """
-    return (self.score_pairs(quadruplets[..., :2, :]) -
-            self.score_pairs(quadruplets[..., 2:, :]))
+    quadruplets = check_tuples(quadruplets)
+    return (self.score_pairs(quadruplets[:, :2, :]) -
+            self.score_pairs(quadruplets[:, 2:, :]))
 
   def score(self, quadruplets, y=None):
     """Computes score on input quadruplets
@@ -248,4 +255,5 @@ def score(self, quadruplets, y=None):
     score : float
       The quadruplets score.
     """
+    quadruplets = check_tuples(quadruplets)
     return - np.mean(self.predict(quadruplets))
diff --git a/metric_learn/itml.py b/metric_learn/itml.py
@@ -21,7 +21,7 @@
 from sklearn.base import TransformerMixin
 from .base_metric import _PairsClassifierMixin, MahalanobisMixin
 from .constraints import Constraints, wrap_pairs
-from ._util import vector_norm
+from ._util import vector_norm, check_tuples
 
 
 class _BaseITML(MahalanobisMixin):
@@ -52,8 +52,11 @@ def __init__(self, gamma=1., max_iter=1000, convergence_threshold=1e-3,
     self.verbose = verbose
 
   def _process_pairs(self, pairs, y, bounds):
+    # for now we check_X_y and check_tuples but we should only
+    # check_tuples_y in the future
     pairs, y = check_X_y(pairs, y, accept_sparse=False,
                          ensure_2d=False, allow_nd=True)
+    pairs = check_tuples(pairs)
 
     # check to make sure that no two constrained vectors are identical
     pos_pairs, neg_pairs = pairs[y == 1], pairs[y == -1]
diff --git a/metric_learn/lsml.py b/metric_learn/lsml.py
@@ -13,6 +13,7 @@
 from six.moves import xrange
 from sklearn.base import TransformerMixin
 from sklearn.utils.validation import check_array, check_X_y
+from ._util import check_tuples
 
 from .base_metric import _QuadrupletsClassifierMixin, MahalanobisMixin
 from .constraints import Constraints
@@ -37,8 +38,11 @@ def __init__(self, tol=1e-3, max_iter=1000, prior=None, verbose=False):
     self.verbose = verbose
 
   def _prepare_quadruplets(self, quadruplets, weights):
-    pairs = check_array(quadruplets, accept_sparse=False,
-                                      ensure_2d=False, allow_nd=True)
+    # for now we check_array and check_tuples but we should only
+    # check_tuples in the future (with enhanced check_tuples)
+    quadruplets = check_array(quadruplets, accept_sparse=False,
+                              ensure_2d=False, allow_nd=True)
+    quadruplets = check_tuples(quadruplets)
 
     # check to make sure that no two constrained vectors are identical
     self.vab_ = quadruplets[:, 0, :] - quadruplets[:, 1, :]
@@ -51,7 +55,8 @@ def _prepare_quadruplets(self, quadruplets, weights):
       self.w_ = weights
     self.w_ /= self.w_.sum()  # weights must sum to 1
     if self.prior is None:
-      X = np.vstack({tuple(row) for row in pairs.reshape(-1, pairs.shape[2])})
+      X = np.vstack({tuple(row) for row in
+                     quadruplets.reshape(-1, quadruplets.shape[2])})
       self.prior_inv_ = np.atleast_2d(np.cov(X, rowvar=False))
       self.M_ = np.linalg.inv(self.prior_inv_)
     else:
diff --git a/metric_learn/mmc.py b/metric_learn/mmc.py
@@ -24,7 +24,7 @@
 
 from .base_metric import _PairsClassifierMixin, MahalanobisMixin
 from .constraints import Constraints, wrap_pairs
-from ._util import vector_norm
+from ._util import vector_norm, check_tuples
 
 
 class _BaseMMC(MahalanobisMixin):
@@ -65,8 +65,11 @@ def _fit(self, pairs, y):
       return self._fit_full(pairs, y)
 
   def _process_pairs(self, pairs, y):
+    # for now we check_X_y and check_tuples but we should only
+    # check_tuples_y in the future
     pairs, y = check_X_y(pairs, y, accept_sparse=False,
-                                      ensure_2d=False, allow_nd=True)
+                         ensure_2d=False, allow_nd=True)
+    pairs = check_tuples(pairs)
 
     # check to make sure that no two constrained vectors are identical
     pos_pairs, neg_pairs = pairs[y == 1], pairs[y == -1]
diff --git a/metric_learn/sdml.py b/metric_learn/sdml.py
@@ -17,6 +17,7 @@
 
 from .base_metric import MahalanobisMixin, _PairsClassifierMixin
 from .constraints import Constraints, wrap_pairs
+from ._util import check_tuples
 
 
 class _BaseSDML(MahalanobisMixin):
@@ -43,8 +44,12 @@ def __init__(self, balance_param=0.5, sparsity_param=0.01, use_cov=True,
     self.verbose = verbose
 
   def _prepare_pairs(self, pairs, y):
+    # for now we check_X_y and check_tuples but we should only
+    # check_tuples_y in the future
     pairs, y = check_X_y(pairs, y, accept_sparse=False,
-                                      ensure_2d=False, allow_nd=True)
+                         ensure_2d=False, allow_nd=True)
+    pairs = check_tuples(pairs)
+
     # set up prior M
     if self.use_cov:
       X = np.vstack({tuple(row) for row in pairs.reshape(-1, pairs.shape[2])})
diff --git a/test/test_mahalanobis_mixin.py b/test/test_mahalanobis_mixin.py
@@ -128,14 +128,21 @@ def test_score_pairs_finite(estimator, build_dataset):
                          ids=ids_estimators)
 def test_score_pairs_dim(estimator, build_dataset):
   # scoring of 3D arrays should return 1D array (several tuples),
-  # and scoring of 2D arrays (one tuple) should return a scalar (0D array).
+  # and scoring of 2D arrays (one tuple) should return an error (like
+  # scikit-learn's error when scoring 1D arrays)
   inputs, labels = build_dataset()
   model = clone(estimator)
   model.fit(inputs, labels)
   X, _ = load_iris(return_X_y=True)
   tuples = np.array(list(product(X, X)))
   assert model.score_pairs(tuples).shape == (tuples.shape[0],)
-  assert np.isscalar(model.score_pairs(tuples[1]))
+  msg = ("Expected 3D array, got 2D array instead:\ntuples={}.\n"
+         "Reshape your data either using tuples.reshape(-1, {}, 1) if "
+         "your data has a single feature or tuples.reshape(1, {}, -1) "
+         "if it contains a single tuple.".format(tuples, tuples.shape[1],
+                                                 tuples.shape[0]))
+  with pytest.raises(ValueError, message=msg):
+    model.score_pairs(tuples[1])
 
 
 def check_is_distance_matrix(pairwise):
@@ -174,13 +181,22 @@ def test_embed_dim(estimator, build_dataset):
   model.fit(inputs, labels)
   X, _ = load_iris(return_X_y=True)
   assert model.transform(X).shape == X.shape
-  assert model.transform(X[0, :]).shape == (len(X[0]),)
+
+  # assert that ValueError is thrown if input shape is 1D
+  err_msg = ("Expected 2D array, got 1D array instead:\narray={}.\n"
+             "Reshape your data either using array.reshape(-1, 1) if "
+             "your data has a single feature or array.reshape(1, -1) "
+             "if it contains a single sample.".format(X))
+  with pytest.raises(ValueError, message=err_msg):
+    model.score_pairs(model.transform(X[0, :]))
   # we test that the shape is also OK when doing dimensionality reduction
   if type(model).__name__ in {'LFDA', 'MLKR', 'NCA', 'RCA'}:
     model.set_params(num_dims=2)
     model.fit(inputs, labels)
     assert model.transform(X).shape == (X.shape[0], 2)
-    assert model.transform(X[0, :]).shape == (2,)
+    # assert that ValueError is thrown if input shape is 1D
+    with pytest.raises(ValueError, message=err_msg):
+        model.transform(model.transform(X[0, :]))
 
 
 @pytest.mark.parametrize('estimator, build_dataset', list_estimators,
diff --git a/test/test_utils.py b/test/test_utils.py
@@ -0,0 +1,29 @@
+import numpy as np
+import pytest
+from metric_learn._util import check_tuples
+
+
+def test_check_tuples():
+  """check_tuples returns 3D input as is and rejects 0D, 1D and 2D input."""
+  import re  # stdlib; used to match the expected messages literally
+  X = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])
+  assert check_tuples(X) is X
+
+  # `match` is a regex searched in str(exception): escape the literal text
+  msg = ("Expected 3D array, got scalar instead. Cannot apply this function "
+         "on scalars.")
+  with pytest.raises(ValueError, match=re.escape(msg)):
+    check_tuples(np.array(5))
+
+  msg = ("Expected 3D array, got 1D array instead:\ntuples=[1 2 3].\n"
+         "Reshape your data using tuples.reshape(1, -1, 1) if it contains a "
+         "single tuple and the points in the tuple have a single feature.")
+  with pytest.raises(ValueError, match=re.escape(msg)):
+    check_tuples(np.array([1, 2, 3]))
+
+  msg = ("Expected 3D array, got 2D array instead:\ntuples=[[1 2 3]\n "
+         "[2 3 5]].\nReshape your data either using "
+         "tuples.reshape(-1, 3, 1) if your data has a single feature or "
+         "tuples.reshape(1, 2, -1) if it contains a single tuple.")
+  with pytest.raises(ValueError, match=re.escape(msg)):
+    check_tuples(np.array([[1, 2, 3], [2, 3, 5]]))
diff --git a/test/test_weakly_supervised.py b/test/test_weakly_supervised.py
@@ -169,17 +169,24 @@ def test_dict_unchanged(estimator, build_dataset):
   (tuples, y, tuples_train, tuples_test,
    y_train, y_test) = build_dataset()
   estimator = clone(estimator)
-  if hasattr(estimator, "n_components"):
-    estimator.n_components = 1
+  if hasattr(estimator, "num_dims"):
+    estimator.num_dims = 1
   estimator.fit(tuples, y)
-  for method in ["predict", "transform", "decision_function",
-                 "predict_proba"]:
+  for method in ["predict", "decision_function", "predict_proba"]:
     if hasattr(estimator, method):
       dict_before = estimator.__dict__.copy()
       getattr(estimator, method)(tuples)
       assert estimator.__dict__ == dict_before, \
           ("Estimator changes __dict__ during %s"
            % method)
+    for method in ["transform"]:
+        if hasattr(estimator, method):
+            dict_before = estimator.__dict__.copy()
+            # we transform only 2D arrays (dataset of points)
+            getattr(estimator, method)(tuples[:, 0, :])
+            assert estimator.__dict__ == dict_before, \
+                ("Estimator changes __dict__ during %s"
+                 % method)
 
 
 @pytest.mark.parametrize('estimator, build_dataset', list_estimators,
@@ -190,8 +197,8 @@ def test_dont_overwrite_parameters(estimator, build_dataset):
   (tuples, y, tuples_train, tuples_test,
    y_train, y_test) = build_dataset()
   estimator = clone(estimator)
-  if hasattr(estimator, "n_components"):
-    estimator.n_components = 1
+  if hasattr(estimator, "num_dims"):
+    estimator.num_dims = 1
   dict_before_fit = estimator.__dict__.copy()
 
   estimator.fit(tuples, y)