[WIP] New API proposal #85


Closed · wants to merge 35 commits into from

Changes shown from 15 commits

Commits (35)
3e5fbc3  Add new class structure (Feb 26, 2018)
0cbf1ae  Put back TransformerMixin in BaseEstimator to inherit Transformer beh… (Feb 26, 2018)
300dada  add ConstrainedDataset object (Feb 27, 2018)
8615634  simplify constraints to always keep a view on X (Feb 28, 2018)
a478baa  add check for input formats (Mar 2, 2018)
3744bec  add basic testing to ConstrainedDataset (Mar 2, 2018)
214d991  correct asterisk bug (Mar 2, 2018)
4f4ce8b  begin work to dissociate classes (Mar 5, 2018)
ac00b8b  update MMC with constrained_dataset (Mar 5, 2018)
33561ab  Fixes according to review https://github.com/metric-learn/metric-lear… (Mar 6, 2018)
7f40c56  make mixins rather than classes hierarchy for inheriting special methods (Mar 6, 2018)
402f397  Merge branch 'new_api' into feat/class_dissociation (Mar 6, 2018)
47a9372  Make changes according to review https://github.com/metric-learn/metr… (Mar 13, 2018)
41dc123  Finalize class dissociation into mixins (Mar 6, 2018)
5f63f24  Merge branch 'feat/class_dissociation' into new_api (Mar 19, 2018)
fb0d118  separate valid and invalid input testing (Mar 20, 2018)
df8a340  correct too long line syntax (Mar 20, 2018)
e3e7e0c  clarify definition of variables in tests (Mar 20, 2018)
5a9c2e5  simplify unwrap pairs and make it more robust to y dimension (Mar 20, 2018)
cf94740  fix bug due to bad refactoring of c_shape (Mar 20, 2018)
52f4516  simplify wrap pairs (Mar 20, 2018)
079bb13  make QuadrupletsMixin inherit from WeaklySupervisedMixin (Mar 21, 2018)
da7c8e7  add NotImplementedError for abstract mixins (Mar 21, 2018)
8192d11  put TransformerMixin inline (Mar 21, 2018)
2d0f1ca  put random state at top of file (Mar 21, 2018)
6c59a1a  add transform, predict, decision_function, and scoring for weakly sup… (Mar 6, 2018)
b70163a  Add tests (Mar 19, 2018)
a12eb9a  Add documentation (Mar 23, 2018)
b1f6c23  fix typo or/of (Mar 30, 2018)
b0ec33b  Add tests for sparse matrices, dataframes and lists (Apr 12, 2018)
64f5762  Fix Transformer interface (cf. review https://github.com/metric-learn… (Apr 12, 2018)
2cf78dd  Do not separate classes if not needed (cf. https://github.com/metric-… (Apr 12, 2018)
11a8ff1  Fix ascii invisible character (Apr 12, 2018)
a768cbf  Fix test attribute error and numerical problems with new dataset (Apr 12, 2018)
335d8f4  Fix unittest hierarchy of classes (Apr 12, 2018)
81 changes: 79 additions & 2 deletions metric_learn/base_metric.py
@@ -1,8 +1,43 @@
- from numpy.linalg import inv, cholesky
- from sklearn.base import BaseEstimator, TransformerMixin
from numpy.linalg import cholesky
from sklearn.base import BaseEstimator
from sklearn.utils.validation import check_array


class TransformerMixin(object):

[Contributor] Does any class other than BaseMetricLearner use this mixin? If not, I'd just inline it into BaseMetricLearner.

[Member Author] Done

"""Mixin class for all transformers in metric-learn. Same as the one in
scikit-learn, but the documentation is changed: this Transformer is
allowed to take as y a non array-like input"""

  def fit_transform(self, X, y=None, **fit_params):
    """Fit to data, then transform it.

    Fits transformer to X and y with optional parameters fit_params
    and returns a transformed version of X.

    Parameters
    ----------
    X : numpy array of shape [n_samples, n_features]
        Training set.

    y : numpy array of shape [n_samples] or 4-tuple of arrays

[Member] With the new API y should only be constraint labels or None, shouldn't it?

[Member Author] Thanks, indeed I forgot to update the 4-tuple part.
Originally I wanted to allow fit_transform to be used in a classical way (with labels as y) but also in the weakly supervised way (with constraint labels), and it could be None in both cases. So something like:

    y : None or numpy array of shape [n_samples] or [n_constraints]
        Target values, or constraints labels

However, now I am not sure it is a good idea: maybe we should make this fit_transform specific to weakly supervised learning (with y being constraint labels or None, as you say) and inline it into WeaklySupervisedMixin rather than BaseEstimator, and make SupervisedMixin inherit from scikit-learn's TransformerMixin to get the classical fit_transform for supervised estimators, like this:

    class WeaklySupervisedMixin(object):
      def fit_transform(self, X_constrained, y=None, **fit_params):
        """
        [...]
        y : None, or numpy array of shape [n_constraints]
            Constraints labels.
        """

    class SupervisedMixin(TransformerMixin):
        Target values, or constraints (a, b, c, d) indices into X, with
        (a, b) specifying similar and (c, d) dissimilar pairs.

    Returns
    -------
    X_new : numpy array of shape [n_samples, n_features_new]
        Transformed array.

    """
    # non-optimized default implementation; override when a better
    # method is possible for a given clustering algorithm
    if y is None:
      # fit method of arity 1 (unsupervised transformation)
      return self.fit(X, **fit_params).transform(X)
    else:
      # fit method of arity 2 (supervised transformation)
      return self.fit(X, y, **fit_params).transform(X)
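
For illustration, a minimal usage sketch of this fit_transform, using Covariance (the unsupervised learner refactored further down in this diff; the metric_learn import path is an assumption):

    import numpy as np
    from metric_learn import Covariance  # assumed import path

    X = np.random.randn(10, 3)
    # y is None, so this takes the arity-1 branch: fit(X).transform(X)
    X_new = Covariance().fit_transform(X)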

class BaseMetricLearner(BaseEstimator, TransformerMixin):
  def __init__(self):
    raise NotImplementedError('BaseMetricLearner should not be instantiated')
@@ -49,3 +84,45 @@ def transform(self, X=None):
    X = check_array(X, accept_sparse=True)
    L = self.transformer()
    return X.dot(L.T)


class SupervisedMixin(object):

  def fit(self, X, y):
    raise NotImplementedError


class UnsupervisedMixin(object):

  def fit(self, X, y=None):
    raise NotImplementedError


class WeaklySupervisedMixin(object):

[Contributor] Add NotImplementedError for the __init__ methods of this and the above abstract mixins.

[Member Author] Done


  def fit(self, X, constraints, **kwargs):
    return self._fit(X, constraints, **kwargs)


class PairsMixin(WeaklySupervisedMixin):

  def __init__(self):
    raise NotImplementedError('PairsMixin should not be instantiated')
  # TODO: introduce specific scoring functions etc


class TripletsMixin(WeaklySupervisedMixin):

  def __init__(self):
    raise NotImplementedError('TripletsMixin should not be '
                              'instantiated')
  # TODO: introduce specific scoring functions etc


class QuadrupletsMixin(UnsupervisedMixin):

[Contributor] Quadruplets imply weak supervision.

[Member Author] I agree. Done


  def __init__(self):
    raise NotImplementedError('QuadrupletsMixin should not be '
                              'instantiated')
  # TODO: introduce specific scoring functions etc
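
To make the intended composition pattern concrete, here is a minimal sketch of a hypothetical concrete learner (not part of this diff; the _ITML/ITML split further down follows the same shape):

    import numpy as np

    class _MyPairsLearner(BaseMetricLearner):
      """Algorithm implementation: the actual fitting logic lives in _fit."""
      def __init__(self, some_param=1.0):
        self.some_param = some_param

      def _fit(self, X_constrained, y):
        # placeholder "learning": identity metric of the right dimension
        self.M_ = np.eye(X_constrained.X.shape[1])
        return self

    class MyPairsLearner(_MyPairsLearner, PairsMixin):
      # fit(X_constrained, y) is inherited from WeaklySupervisedMixin
      # and simply delegates to _fit
      pass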

99 changes: 87 additions & 12 deletions metric_learn/constraints.py
@@ -6,8 +6,9 @@
import warnings
from six.moves import xrange
from scipy.sparse import coo_matrix
from sklearn.utils import check_array

- __all__ = ['Constraints']
__all__ = ['Constraints', 'ConstrainedDataset']


class Constraints(object):
@@ -18,17 +19,6 @@ def __init__(self, partial_labels):
    self.known_label_idx, = np.where(partial_labels >= 0)
    self.known_labels = partial_labels[self.known_label_idx]

-  def adjacency_matrix(self, num_constraints, random_state=np.random):
-    a, b, c, d = self.positive_negative_pairs(num_constraints,
-                                              random_state=random_state)
-    row = np.concatenate((a, c))
-    col = np.concatenate((b, d))
-    data = np.ones_like(row, dtype=int)
-    data[len(a):] = -1
-    adj = coo_matrix((data, (row, col)), shape=(self.num_points,)*2)
-    # symmetrize
-    return adj + adj.T

  def positive_negative_pairs(self, num_constraints, same_length=False,
                              random_state=np.random):
    a, b = self._pairs(num_constraints, same_label=True,
@@ -100,3 +90,88 @@ def random_subset(all_labels, num_preserved=np.inf, random_state=np.random):
    partial_labels = np.array(all_labels, copy=True)
    partial_labels[idx] = -1
    return Constraints(partial_labels)


class ConstrainedDataset(object):

  def __init__(self, X, c):
    # we convert the data to a suitable format
    self.X = check_array(X, accept_sparse=True, dtype=None, warn_on_dtype=True)
    self.c = check_array(c, dtype=['int'] + np.sctypes['int']
                         + np.sctypes['uint'],
                         # we add 'int' at the beginning to tell it is the
                         # default format we want in case of conversion
                         ensure_2d=False, ensure_min_samples=False,
                         ensure_min_features=False, warn_on_dtype=True)
    self._check_index(self.X.shape[0], self.c)
    self.shape = (len(c) if hasattr(c, '__len__') else 0, self.X.shape[1])

  def __getitem__(self, item):
    return ConstrainedDataset(self.X, self.c[item])

  def __len__(self):
    # __len__ must return an int: report the number of constraints
    return self.shape[0]

  def __str__(self):
    return self.toarray().__str__()

  def __repr__(self):
    return self.toarray().__repr__()

  def toarray(self):
    return self.X[self.c]

  @staticmethod
  def _check_index(length, indices):

[Member] Maybe this could also check for potential duplicates? Could simply show a warning when this is the case. (One could also remove them, but this might create problems later when constraint labels are used.)

[Member Author @wdevazelhes, Mar 5, 2018] I agree, I will implement it in a next commit.
    max_index = np.max(indices)
    min_index = np.min(indices)
    pb_index = None
    if max_index >= length:
      pb_index = max_index
    elif min_index < -length:  # out-of-range negative index
      pb_index = min_index
    if pb_index is not None:
      raise IndexError("ConstrainedDataset cannot be created: the length of "
                       "the dataset is {}, so index {} is out of range."
                       .format(length, pb_index))
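
Following up on the review comment above, the duplicate check could look something like this (a sketch only; the helper name and warning text are assumptions, and np.unique with an axis argument needs numpy >= 1.13):

    import warnings
    import numpy as np

    def _check_duplicates(indices):
      # warn rather than drop, so constraint labels stay aligned with c
      unique_rows = np.unique(indices, axis=0)
      if unique_rows.shape[0] < np.asarray(indices).shape[0]:
        warnings.warn("ConstrainedDataset contains duplicate constraints.")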

  @staticmethod
  def pairs_from_labels(y):
    # TODO: to be implemented
    raise NotImplementedError

  @staticmethod
  def triplets_from_labels(y):
    # TODO: to be implemented
    raise NotImplementedError
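
For reference, a small usage sketch of ConstrainedDataset as defined above (made-up values):

    import numpy as np

    X = np.arange(12.).reshape(4, 3)       # 4 points in R^3
    c = np.array([[0, 1], [2, 3]])         # two pairs of indices into X
    cd = ConstrainedDataset(X, c)

    cd.shape       # (2, 3): n_constraints x n_features
    cd.toarray()   # X[c], an array of shape (2, 2, 3)
    cd[0:1]        # a new ConstrainedDataset keeping only the first pair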


def unwrap_pairs(X_constrained, y):
  a = X_constrained.c[(y == 0)[:, 0]][:, 0]
  b = X_constrained.c[(y == 0)[:, 0]][:, 1]

[Contributor] This seems redundant. How about:

    y_zero = (y == 0).ravel()
    a, b = X_constrained[y_zero].T
    c, d = X_constrained[~y_zero].T

[Member Author @wdevazelhes, Mar 19, 2018] Yes, will do.
  c = X_constrained.c[(y == 1)[:, 0]][:, 0]
  d = X_constrained.c[(y == 1)[:, 0]][:, 1]
  X = X_constrained.X
  return X, [a, b, c, d]

def wrap_pairs(X, constraints):
  a = np.array(constraints[0])
  b = np.array(constraints[1])
  c = np.array(constraints[2])
  d = np.array(constraints[3])
  constraints = np.vstack([np.hstack([a[:, None], b[:, None]]),
                           np.hstack([c[:, None], d[:, None]])])

[Contributor]

    a, b, c, d = constraints
    constraints = np.vstack((np.column_stack((a, b)), np.column_stack((c, d))))
    # or if we have numpy 1.13+
    constraints = np.block([[a, b], [c, d]])

[Member Author] Thanks, will do.
  y = np.vstack([np.zeros((len(a), 1)), np.ones((len(c), 1))])
  X_constrained = ConstrainedDataset(X, constraints)
  return X_constrained, y
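
A quick round-trip sketch of wrap_pairs and unwrap_pairs (illustrative values):

    import numpy as np

    X = np.random.randn(5, 2)
    a, b = [0, 1], [1, 2]                  # similar pairs
    c, d = [3, 4], [2, 0]                  # dissimilar pairs

    X_constrained, y = wrap_pairs(X, [a, b, c, d])
    # X_constrained.c == [[0, 1], [1, 2], [3, 4], [2, 0]]
    # y == [[0], [0], [1], [1]]  (0 = similar, 1 = dissimilar)

    X2, (a2, b2, c2, d2) = unwrap_pairs(X_constrained, y)
    # recovers the four index arrays above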

def unwrap_to_graph(X_constrained, y):

  X, [a, b, c, d] = unwrap_pairs(X_constrained, y)
  row = np.concatenate((a, c))
  col = np.concatenate((b, d))
  data = np.ones_like(row, dtype=int)
  data[len(a):] = -1
  adj = coo_matrix((data, (row, col)),
                   shape=(X_constrained.X.shape[0],) * 2)
  return X_constrained.X, adj + adj.T
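
unwrap_to_graph rebuilds the signed adjacency matrix that the removed Constraints.adjacency_matrix used to produce: +1 entries for similar pairs, -1 for dissimilar ones, symmetrized. A tiny worked example (illustrative):

    import numpy as np

    X = np.eye(3)                          # 3 points
    X_constrained, y = wrap_pairs(X, [[0], [1], [0], [2]])
    _, adj = unwrap_to_graph(X_constrained, y)
    adj.toarray()
    # array([[ 0,  1, -1],
    #        [ 1,  0,  0],
    #        [-1,  0,  0]])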
11 changes: 8 additions & 3 deletions metric_learn/covariance.py
@@ -12,17 +12,17 @@
import numpy as np
from sklearn.utils.validation import check_array

- from .base_metric import BaseMetricLearner
from .base_metric import BaseMetricLearner, UnsupervisedMixin


- class Covariance(BaseMetricLearner):
class _Covariance(BaseMetricLearner):
  def __init__(self):
    pass

  def metric(self):
    return self.M_

-  def fit(self, X, y=None):
  def _fit(self, X, y=None):
    """
    X : data matrix, (n x d)
    y : unused
@@ -34,3 +34,8 @@ def fit(self, X, y=None):
    else:
      self.M_ = np.linalg.inv(self.M_)
    return self

class Covariance(_Covariance, UnsupervisedMixin):

[Contributor] If nothing else will use _Covariance, I'd prefer to eliminate it and use the UnsupervisedMixin directly.

[Contributor] In general, I don't think it's worth separating out the algorithm implementation unless/until we have >1 class that needs it.

  def fit(self, X, y=None):
    return self._fit(X, y)
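
Usage is unchanged for callers; a minimal sketch (import path assumed):

    import numpy as np
    from metric_learn import Covariance

    X = np.random.randn(100, 4)
    cov = Covariance().fit(X)    # fit() delegates to _Covariance._fit()
    M = cov.metric()             # inverse covariance (Mahalanobis) matrix
    X_new = cov.transform(X)     # X mapped by the learned linear transformation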
29 changes: 17 additions & 12 deletions metric_learn/itml.py
@@ -19,12 +19,12 @@
from sklearn.metrics import pairwise_distances
from sklearn.utils.validation import check_array, check_X_y

- from .base_metric import BaseMetricLearner
- from .constraints import Constraints
from .base_metric import BaseMetricLearner, PairsMixin, SupervisedMixin
from .constraints import Constraints, unwrap_pairs, wrap_pairs
from ._util import vector_norm


- class ITML(BaseMetricLearner):
class _ITML(BaseMetricLearner):
  """Information Theoretic Metric Learning (ITML)"""
  def __init__(self, gamma=1., max_iter=1000, convergence_threshold=1e-3,
               A0=None, verbose=False):
@@ -73,19 +73,19 @@ def _process_inputs(self, X, constraints, bounds):
    self.A_ = check_array(self.A0)
    return a, b, c, d

-  def fit(self, X, constraints, bounds=None):
  def _fit(self, X_constrained, y, bounds=None):
    """Learn the ITML model.

    Parameters
    ----------
-    X : (n x d) data matrix
-      each row corresponds to a single instance
-    constraints : 4-tuple of arrays
-      (a,b,c,d) indices into X, with (a,b) specifying positive and (c,d)
-      negative pairs
    X_constrained : ConstrainedDataset
      with constraints being an array of shape [n_constraints, 2]
    y : array-like, shape (n_constraints x 1)
      labels of the constraints
    bounds : list (pos,neg) pairs, optional
      bounds on similarity, s.t. d(X[a],X[b]) < pos and d(X[c],X[d]) > neg
    """
    X, constraints = unwrap_pairs(X_constrained, y)
    a, b, c, d = self._process_inputs(X, constraints, bounds)
    gamma = self.gamma
    num_pos = len(a)
@@ -140,7 +140,7 @@ def metric(self):
    return self.A_


- class ITML_Supervised(ITML):
class ITML_Supervised(_ITML, SupervisedMixin):
  """Information Theoretic Metric Learning (ITML)"""
  def __init__(self, gamma=1., max_iter=1000, convergence_threshold=1e-3,
               num_labeled=np.inf, num_constraints=None, bounds=None, A0=None,
@@ -164,7 +164,7 @@ def __init__(self, gamma=1., max_iter=1000, convergence_threshold=1e-3,
    verbose : bool, optional
      if True, prints information while learning
    """
-    ITML.__init__(self, gamma=gamma, max_iter=max_iter,
    _ITML.__init__(self, gamma=gamma, max_iter=max_iter,
                   convergence_threshold=convergence_threshold,
                   A0=A0, verbose=verbose)
    self.num_labeled = num_labeled
@@ -195,4 +195,9 @@ def fit(self, X, y, random_state=np.random):
                                  random_state=random_state)
    pos_neg = c.positive_negative_pairs(num_constraints,
                                        random_state=random_state)
-    return ITML.fit(self, X, pos_neg, bounds=self.bounds)
    X_constrained, y = wrap_pairs(X, pos_neg)
    return _ITML._fit(self, X_constrained, y, bounds=self.bounds)

class ITML(_ITML, PairsMixin):

  pass
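
With this split, the weakly-supervised entry point takes a ConstrainedDataset plus constraint labels, while ITML_Supervised keeps the plain (X, y) interface. A sketch of the new call signature (random values, only to show the shapes; convergence on such data is not guaranteed):

    import numpy as np
    from metric_learn.constraints import ConstrainedDataset

    X = np.random.randn(20, 3)
    pairs = np.random.randint(0, 20, (10, 2))     # indices into X
    y_pairs = np.random.randint(0, 2, (10, 1))    # 0 = similar, 1 = dissimilar

    itml = ITML()
    itml.fit(ConstrainedDataset(X, pairs), y_pairs)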
22 changes: 19 additions & 3 deletions metric_learn/lfda.py
@@ -18,10 +18,10 @@
from sklearn.metrics import pairwise_distances
from sklearn.utils.validation import check_X_y

- from .base_metric import BaseMetricLearner
from .base_metric import BaseMetricLearner, SupervisedMixin


- class LFDA(BaseMetricLearner):
class _LFDA(BaseMetricLearner):
  '''
  Local Fisher Discriminant Analysis for Supervised Dimensionality Reduction
  Sugiyama, ICML 2006
@@ -77,7 +77,7 @@ def _process_inputs(self, X, y):

    return self.X_, y, num_classes, n, d, dim, k

-  def fit(self, X, y):
  def _fit(self, X, y):
    '''Fit the LFDA model.

    Parameters
@@ -146,3 +146,19 @@ def _eigh(a, b, dim):
  except np.linalg.LinAlgError:
    pass
  return scipy.linalg.eig(a, b)


class LFDA(_LFDA, SupervisedMixin):

  def fit(self, X, y):
    '''Fit the LFDA model.

    Parameters
    ----------
    X : (n, d) array-like
        Input data.

    y : (n,) array-like
        Class labels, one per point of data.
    '''
    return self._fit(X, y)
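
The public LFDA keeps the classic supervised interface; a minimal sketch:

    import numpy as np
    from sklearn.datasets import load_iris

    X, y = load_iris(return_X_y=True)
    lfda = LFDA()                  # see _LFDA.__init__ for the parameters
    X_new = lfda.fit(X, y).transform(X)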