scikit-learn-contrib · wdevazelhes · Feb 26, 2018 · Feb 26, 2018 · Feb 27, 2018 · Feb 28, 2018
diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py
@@ -49,3 +49,37 @@ def transform(self, X=None):
       X = check_array(X, accept_sparse=True)
     L = self.transformer()
     return X.dot(L.T)
+
+
+class SupervisedMetricLearner(BaseMetricLearner):
+  """Base class for metric learners fit on data ``X`` and targets ``y``."""
+  def fit(self, X, y):
+    raise NotImplementedError  # must raise: subclasses have to override fit
+
+
+class WeaklySupervisedMetricLearner(BaseMetricLearner):
+  """Base class for learners fit on a constrained dataset and labels ``y``."""
+  def fit(self, constrained_dataset, y):
+    raise NotImplementedError  # must raise: subclasses have to override fit
+
+
+class PairsMetricLearner(WeaklySupervisedMetricLearner):
+  """Abstract base for learners whose constraints are pairs of points."""
+  # TODO: introduce specific scoring functions etc
+  def __init__(self):
+    raise NotImplementedError('PairsMetricLearner should not be instantiated')
+
+
+class TripletsMetricLearner(WeaklySupervisedMetricLearner):
+  """Abstract base for learners whose constraints are triplets of points."""
+  # TODO: introduce specific scoring functions etc
+  def __init__(self):
+    reason = 'TripletsMetricLearner should not be instantiated'
+    raise NotImplementedError(reason)
+
+class QuadrupletsMetricLearner(WeaklySupervisedMetricLearner):
+  """Abstract base for learners whose constraints are quadruplets of points."""
+  # TODO: introduce specific scoring functions etc
+  def __init__(self):
+    reason = 'QuadrupletsMetricLearner should not be instantiated'
+    raise NotImplementedError(reason)
diff --git a/metric_learn/constraints.py b/metric_learn/constraints.py
@@ -6,8 +6,9 @@
 import warnings
 from six.moves import xrange
 from scipy.sparse import coo_matrix
+from sklearn.utils import check_array
 
-__all__ = ['Constraints']
+__all__ = ['Constraints', 'ConstrainedDataset']
 
 
 class Constraints(object):
@@ -18,17 +19,6 @@ def __init__(self, partial_labels):
     self.known_label_idx, = np.where(partial_labels >= 0)
     self.known_labels = partial_labels[self.known_label_idx]
 
-  def adjacency_matrix(self, num_constraints, random_state=np.random):
-    a, b, c, d = self.positive_negative_pairs(num_constraints,
-                                              random_state=random_state)
-    row = np.concatenate((a, c))
-    col = np.concatenate((b, d))
-    data = np.ones_like(row, dtype=int)
-    data[len(a):] = -1
-    adj = coo_matrix((data, (row, col)), shape=(self.num_points,)*2)
-    # symmetrize
-    return adj + adj.T
-
   def positive_negative_pairs(self, num_constraints, same_length=False,
                               random_state=np.random):
     a, b = self._pairs(num_constraints, same_label=True,
@@ -100,3 +90,88 @@ def random_subset(all_labels, num_preserved=np.inf, random_state=np.random):
     partial_labels = np.array(all_labels, copy=True)
     partial_labels[idx] = -1
     return Constraints(partial_labels)
+
+
+class ConstrainedDataset(object):
+  """Points ``X`` plus an integer array ``c`` whose entries index rows of ``X``
+  (one constraint per row of ``c``); items/slices share the same ``X``."""
+  def __init__(self, X, c):
+    # we convert the data to a suitable format
+    self.X = check_array(X, accept_sparse=True, dtype=None, warn_on_dtype=True)
+    self.c = check_array(c, dtype=['int'] + np.sctypes['int']
+                                  + np.sctypes['uint'],
+                         # we add 'int' at the beginning to tell it is the
+                         # default format we want in case of conversion
+                         ensure_2d=False, ensure_min_samples=False,
+                         ensure_min_features=False, warn_on_dtype=True)
+    self._check_index(self.X.shape[0], self.c)
+    self.shape = (len(c) if hasattr(c, '__len__') else 0, self.X.shape[1])
+
+  def __getitem__(self, item):
+    return ConstrainedDataset(self.X, self.c[item])
+
+  def __len__(self):
+    # len() must return an int: the number of constraints, not the shape tuple
+    return self.shape[0]
+
+  def __str__(self):
+    return self.toarray().__str__()
+
+  def __repr__(self):
+    return self.toarray().__repr__()
+
+  def toarray(self):
+    """Return the points of every constraint, i.e. ``X`` indexed by ``c``."""
+    return self.X[self.c]
+
+  @staticmethod
+  def _check_index(length, indices):
+    """Raise IndexError unless every index lies in [-length, length)."""
+    if np.size(indices) == 0:
+      return  # nothing to validate; np.max/np.min reject empty arrays
+    for bound in (np.max(indices), np.min(indices)):
+      if not -length <= bound < length:
+        raise IndexError("ConstrainedDataset cannot be created: the length of "
+                         "the dataset is {}, so index {} is out of range."
+                         .format(length, bound))
+
+  @staticmethod
+  def pairs_from_labels(y):
+    # TODO: to be implemented
+    raise NotImplementedError
+
+  @staticmethod
+  def triplets_from_labels(y):
+    # TODO: to be implemented
+    raise NotImplementedError
+
+
+def unwrap_pairs(constrained_dataset, y):
+  """Return (X, [a, b, c, d]): pairs (a, b) have y == 0, (c, d) have y == 1."""
+  y = np.asarray(y)  # accept plain lists, not only ndarrays
+  y = y[:, 0] if y.ndim > 1 else y  # (n, 1) column or flat (n,) labels
+  pairs0, pairs1 = constrained_dataset.c[y == 0], constrained_dataset.c[y == 1]
+  X = constrained_dataset.X
+  return X, [pairs0[:, 0], pairs0[:, 1], pairs1[:, 0], pairs1[:, 1]]
+
+def wrap_pairs(X, constraints):
+  """Bundle (a, b, c, d) index arrays into a ConstrainedDataset and labels.
+
+  Pairs (a, b) get label 0 and pairs (c, d) get label 1; ``y`` is a column
+  of shape (len(a) + len(c), 1)."""
+  a, b, c, d = (np.array(constraints[i]) for i in range(4))
+  rows_ab = np.hstack([a[:, None], b[:, None]])
+  rows_cd = np.hstack([c[:, None], d[:, None]])
+  y = np.vstack([np.zeros((len(a), 1)), np.ones((len(c), 1))])
+  return ConstrainedDataset(X, np.vstack([rows_ab, rows_cd])), y
+
+def unwrap_to_graph(constrained_dataset, y):
+  """Return (X, adj): the points and a symmetrized signed pair graph.
+
+  Pairs with y == 0 add +1 entries, pairs with y == 1 add -1 entries."""
+  X, (a, b, c, d) = unwrap_pairs(constrained_dataset, y)
+  signs = np.ones(len(a) + len(c), dtype=int)
+  signs[len(a):] = -1  # the (c, d) pairs carry the negative weight
+  edges = (np.concatenate((a, c)), np.concatenate((b, d)))
+  adj = coo_matrix((signs, edges), shape=(X.shape[0],) * 2)
+  return X, adj + adj.T
diff --git a/metric_learn/covariance.py b/metric_learn/covariance.py
@@ -12,10 +12,10 @@
 import numpy as np
 from sklearn.utils.validation import check_array
 
-from .base_metric import BaseMetricLearner
+from .base_metric import SupervisedMetricLearner
 
 
-class Covariance(BaseMetricLearner):
+class Covariance(SupervisedMetricLearner):
   def __init__(self):
     pass
 

diff --git a/metric_learn/itml.py b/metric_learn/itml.py
@@ -19,12 +19,12 @@
 from sklearn.metrics import pairwise_distances
 from sklearn.utils.validation import check_array, check_X_y
 
-from .base_metric import BaseMetricLearner
-from .constraints import Constraints
+from .base_metric import PairsMetricLearner, SupervisedMetricLearner
+from .constraints import Constraints, unwrap_pairs, wrap_pairs
 from ._util import vector_norm
 
 
-class ITML(BaseMetricLearner):
+class ITML(PairsMetricLearner):
   """Information Theoretic Metric Learning (ITML)"""
   def __init__(self, gamma=1., max_iter=1000, convergence_threshold=1e-3,
                A0=None, verbose=False):
@@ -73,19 +73,19 @@ def _process_inputs(self, X, constraints, bounds):
       self.A_ = check_array(self.A0)
     return a,b,c,d
 
-  def fit(self, X, constraints, bounds=None):
+  def fit(self, constrained_dataset, y, bounds=None):
     """Learn the ITML model.
 
     Parameters
     ----------
-    X : (n x d) data matrix
-        each row corresponds to a single instance
-    constraints : 4-tuple of arrays
-        (a,b,c,d) indices into X, with (a,b) specifying positive and (c,d)
-        negative pairs
+    constrained_dataset : ConstrainedDataset
+        with constraints being an array of shape [n_constraints, 2]
+    y : array-like, shape (n x 1)
+        labels of the constraints
     bounds : list (pos,neg) pairs, optional
         bounds on similarity, s.t. d(X[a],X[b]) < pos and d(X[c],X[d]) > neg
     """
+    X, constraints = unwrap_pairs(constrained_dataset, y)
     a,b,c,d = self._process_inputs(X, constraints, bounds)
     gamma = self.gamma
     num_pos = len(a)
@@ -140,7 +140,7 @@ def metric(self):
     return self.A_
 
 
-class ITML_Supervised(ITML):
+class ITML_Supervised(ITML, SupervisedMetricLearner):
   """Information Theoretic Metric Learning (ITML)"""
   def __init__(self, gamma=1., max_iter=1000, convergence_threshold=1e-3,
                num_labeled=np.inf, num_constraints=None, bounds=None, A0=None,
@@ -195,4 +195,5 @@ def fit(self, X, y, random_state=np.random):
                                   random_state=random_state)
     pos_neg = c.positive_negative_pairs(num_constraints,
                                         random_state=random_state)
-    return ITML.fit(self, X, pos_neg, bounds=self.bounds)
+    constrained_dataset, y = wrap_pairs(X, pos_neg)
+    return ITML.fit(self, constrained_dataset, y, bounds=self.bounds)
diff --git a/metric_learn/lfda.py b/metric_learn/lfda.py
@@ -18,10 +18,10 @@
 from sklearn.metrics import pairwise_distances
 from sklearn.utils.validation import check_X_y
 
-from .base_metric import BaseMetricLearner
+from .base_metric import SupervisedMetricLearner
 
 
-class LFDA(BaseMetricLearner):
+class LFDA(SupervisedMetricLearner):
   '''
   Local Fisher Discriminant Analysis for Supervised Dimensionality Reduction
   Sugiyama, ICML 2006

diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py
@@ -17,11 +17,11 @@
 from sklearn.utils.validation import check_X_y, check_array
 from sklearn.metrics import euclidean_distances
 
-from .base_metric import BaseMetricLearner
+from .base_metric import SupervisedMetricLearner
 
 
 # commonality between LMNN implementations
-class _base_LMNN(BaseMetricLearner):
+class _base_LMNN(SupervisedMetricLearner):
   def __init__(self, k=3, min_iter=50, max_iter=1000, learn_rate=1e-7,
                regularization=0.5, convergence_tol=0.001, use_pca=True,
                verbose=False):

diff --git a/metric_learn/lsml.py b/metric_learn/lsml.py
@@ -13,11 +13,11 @@
 from six.moves import xrange
 from sklearn.utils.validation import check_array, check_X_y
 
-from .base_metric import BaseMetricLearner
-from .constraints import Constraints
+from .base_metric import SupervisedMetricLearner, QuadrupletsMetricLearner
+from .constraints import Constraints, ConstrainedDataset
 
 
-class LSML(BaseMetricLearner):
+class LSML(QuadrupletsMetricLearner):
   def __init__(self, tol=1e-3, max_iter=1000, prior=None, verbose=False):
     """Initialize LSML.
 
@@ -57,18 +57,23 @@ def _prepare_inputs(self, X, constraints, weights):
   def metric(self):
     return self.M_
 
-  def fit(self, X, constraints, weights=None):
+  def fit(self, constrained_dataset, y=None, weights=None):
     """Learn the LSML model.
 
     Parameters
     ----------
-    X : (n x d) data matrix
-        each row corresponds to a single instance
-    constraints : 4-tuple of arrays
-        (a,b,c,d) indices into X, such that d(X[a],X[b]) < d(X[c],X[d])
+    constrained_dataset : ConstrainedDataset
+        with constraints being an array of shape [n_constraints, 4]. It
+        should be the concatenation of 4 column vectors a, b, c and d,
+        such that: ``d(X[a[i]],X[b[i]]) < d(X[c[i]],X[d[i]])`` for every
+        constraint index ``i``.
+    y : object
+        Not used, for scikit-learn compatibility
     weights : (m,) array of floats, optional
         scale factor for each constraint
     """
+    X = constrained_dataset.X
+    constraints = [constrained_dataset.c[:, i].ravel() for i in range(4)]
     self._prepare_inputs(X, constraints, weights)
     step_sizes = np.logspace(-10, 0, 10)
     # Keep track of the best step size and the loss at that step.
@@ -131,7 +136,7 @@ def _gradient(self, metric):
     return dMetric
 
 
-class LSML_Supervised(LSML):
+class LSML_Supervised(LSML, SupervisedMetricLearner):
   def __init__(self, tol=1e-3, max_iter=1000, prior=None, num_labeled=np.inf,
                num_constraints=None, weights=None, verbose=False):
     """Initialize the learner.
@@ -181,4 +186,7 @@ def fit(self, X, y, random_state=np.random):
                                   random_state=random_state)
     pairs = c.positive_negative_pairs(num_constraints, same_length=True,
                                       random_state=random_state)
-    return LSML.fit(self, X, pairs, weights=self.weights)
+    constrained_dataset = ConstrainedDataset(X, np.hstack([pairs[i][:, None]
+                                                           for i in
+                                                           range(4)]))
+    return LSML.fit(self, constrained_dataset, weights=self.weights)
diff --git a/metric_learn/mlkr.py b/metric_learn/mlkr.py
@@ -13,12 +13,12 @@
 from sklearn.decomposition import PCA
 from sklearn.utils.validation import check_X_y
 
-from .base_metric import BaseMetricLearner
+from .base_metric import SupervisedMetricLearner
 
 EPS = np.finfo(float).eps
 
 
-class MLKR(BaseMetricLearner):
+class MLKR(SupervisedMetricLearner):
   """Metric Learning for Kernel Regression (MLKR)"""
   def __init__(self, num_dims=None, A0=None, epsilon=0.01, alpha=0.0001,
                max_iter=1000):

diff --git a/metric_learn/mmc.py b/metric_learn/mmc.py
@@ -22,13 +22,14 @@
 from sklearn.metrics import pairwise_distances
 from sklearn.utils.validation import check_array, check_X_y
 
-from .base_metric import BaseMetricLearner
-from .constraints import Constraints
+from .base_metric import PairsMetricLearner, SupervisedMetricLearner
+from .constraints import Constraints, ConstrainedDataset, unwrap_pairs, \
+  wrap_pairs
 from ._util import vector_norm
 
 
 
-class MMC(BaseMetricLearner):
+class MMC(PairsMetricLearner):
   """Mahalanobis Metric for Clustering (MMC)"""
   def __init__(self, max_iter=100, max_proj=10000, convergence_threshold=1e-3,
                A0=None, diagonal=False, diagonal_c=1.0, verbose=False):
@@ -58,17 +59,17 @@ def __init__(self, max_iter=100, max_proj=10000, convergence_threshold=1e-3,
     self.diagonal_c = diagonal_c
     self.verbose = verbose
 
-  def fit(self, X, constraints):
+  def fit(self, constrained_dataset, y):
     """Learn the MMC model.
 
     Parameters
     ----------
-    X : (n x d) data matrix
-        each row corresponds to a single instance
-    constraints : 4-tuple of arrays
-        (a,b,c,d) indices into X, with (a,b) specifying similar and (c,d)
-        dissimilar pairs
+    constrained_dataset : ConstrainedDataset
+        with constraints being an array of shape [n_constraints, 2]
+    y : array-like, shape (n x 1)
+        labels of the constraints
     """
+    X, constraints = unwrap_pairs(constrained_dataset, y)
     constraints = self._process_inputs(X, constraints)
     if self.diagonal:
       return self._fit_diag(X, constraints)
@@ -380,7 +381,7 @@ def transformer(self):
       return V.T * np.sqrt(np.maximum(0, w[:,None]))
 
 
-class MMC_Supervised(MMC):
+class MMC_Supervised(MMC, SupervisedMetricLearner):
   """Mahalanobis Metric for Clustering (MMC)"""
   def __init__(self, max_iter=100, max_proj=10000, convergence_threshold=1e-6,
                num_labeled=np.inf, num_constraints=None,
@@ -437,4 +438,5 @@ def fit(self, X, y, random_state=np.random):
                                   random_state=random_state)
     pos_neg = c.positive_negative_pairs(num_constraints,
                                         random_state=random_state)
-    return MMC.fit(self, X, pos_neg)
+    constrained_dataset, y = wrap_pairs(X, pos_neg)
+    return MMC.fit(self, constrained_dataset, y)