[WIP] New API proposal #85

Status: Closed. This pull request proposed to merge 35 commits; the changes shown below are from 7 of those commits.

Commits (35):
3e5fbc3  Add new class structure (Feb 26, 2018)
0cbf1ae  Put back TransformerMixin in BaseEstimator to inherit Transformer beh… (Feb 26, 2018)
300dada  add ConstrainedDataset object (Feb 27, 2018)
8615634  simplify constraints to always keep a view on X (Feb 28, 2018)
a478baa  add check for input formats (Mar 2, 2018)
3744bec  add basic testing to ConstrainedDataset (Mar 2, 2018)
214d991  correct asterisk bug (Mar 2, 2018)
4f4ce8b  begin work to dissociate classes (Mar 5, 2018)
ac00b8b  update MMC with constrained_dataset (Mar 5, 2018)
33561ab  Fixes according to review https://github.com/metric-learn/metric-lear… (Mar 6, 2018)
7f40c56  make mixins rather than classes hierarchy for inheriting special methods (Mar 6, 2018)
402f397  Merge branch 'new_api' into feat/class_dissociation (Mar 6, 2018)
47a9372  Make changes according to review https://github.com/metric-learn/metr… (Mar 13, 2018)
41dc123  Finalize class dissociation into mixins (Mar 6, 2018)
5f63f24  Merge branch 'feat/class_dissociation' into new_api (Mar 19, 2018)
fb0d118  separate valid and invalid input testing (Mar 20, 2018)
df8a340  correct too long line syntax (Mar 20, 2018)
e3e7e0c  clarify definition of variables in tests (Mar 20, 2018)
5a9c2e5  simplify unwrap pairs and make it more robust to y dimension (Mar 20, 2018)
cf94740  fix bug due to bad refactoring of c_shape (Mar 20, 2018)
52f4516  simplify wrap pairs (Mar 20, 2018)
079bb13  make QuadrupletsMixin inherit from WeaklySupervisedMixin (Mar 21, 2018)
da7c8e7  add NotImplementedError for abstract mixins (Mar 21, 2018)
8192d11  put TransformerMixin inline (Mar 21, 2018)
2d0f1ca  put random state at top of file (Mar 21, 2018)
6c59a1a  add transform, predict, decision_function, and scoring for weakly sup… (Mar 6, 2018)
b70163a  Add tests (Mar 19, 2018)
a12eb9a  Add documentation (Mar 23, 2018)
b1f6c23  fix typo or/of (Mar 30, 2018)
b0ec33b  Add tests for sparse matrices, dataframes and lists (Apr 12, 2018)
64f5762  Fix Transformer interface (cf. review https://github.com/metric-learn… (Apr 12, 2018)
2cf78dd  Do not separate classes if not needed (cf. https://github.com/metric-… (Apr 12, 2018)
11a8ff1  Fix ascii invisible character (Apr 12, 2018)
a768cbf  Fix test attribute error and numerical problems with new dataset (Apr 12, 2018)
335d8f4  Fix unittest hierarchy of classes (Apr 12, 2018)
28 changes: 28 additions & 0 deletions metric_learn/base_metric.py
@@ -49,3 +49,31 @@ def transform(self, X=None):
    X = check_array(X, accept_sparse=True)
    L = self.transformer()
    return X.dot(L.T)


class SupervisedMetricLearner(BaseMetricLearner):

  def fit(self, X, y):
    raise NotImplementedError


class WeaklySupervisedMetricLearner(BaseMetricLearner):

  def fit(self, X, constraints):
    raise NotImplementedError


class PairsMetricLearner(WeaklySupervisedMetricLearner):

  def __init__(self):
    raise NotImplementedError('PairsMetricLearner should not be instantiated')
    # TODO: introduce specific scoring functions etc


class TripletsMetricLearner(WeaklySupervisedMetricLearner):

  def __init__(self):
    raise NotImplementedError('TripletsMetricLearner should not be '
                              'instantiated')
    # TODO: introduce specific scoring functions etc
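
For orientation, here is a minimal sketch of how a concrete algorithm could plug into this hierarchy (ExampleLearner and its identity transformer are hypothetical, purely illustrative):

import numpy as np
from metric_learn.base_metric import SupervisedMetricLearner

class ExampleLearner(SupervisedMetricLearner):
  # hypothetical learner: "learns" the identity metric
  def fit(self, X, y):
    self.L_ = np.eye(X.shape[1])
    return self

  def transformer(self):
    # consumed by BaseMetricLearner.transform, which computes X.dot(L.T)
    return self.L_

ExampleLearner().fit(X, y).transform(X) would then return X unchanged; a real learner would instead estimate L from the supervision.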

57 changes: 56 additions & 1 deletion metric_learn/constraints.py
@@ -6,8 +6,9 @@
import warnings
from six.moves import xrange
from scipy.sparse import coo_matrix
from sklearn.utils import check_array

-__all__ = ['Constraints']
+__all__ = ['Constraints', 'ConstrainedDataset']


class Constraints(object):
@@ -100,3 +101,57 @@ def random_subset(all_labels, num_preserved=np.inf, random_state=np.random)
    partial_labels = np.array(all_labels, copy=True)
    partial_labels[idx] = -1
    return Constraints(partial_labels)


class ConstrainedDataset(object):

  def __init__(self, X, c):
    # we convert the data to a suitable format
    self.X = check_array(X, accept_sparse=True, dtype=None, warn_on_dtype=True)
    self.c = check_array(c, dtype=['int'] + np.sctypes['int']
                         + np.sctypes['uint'],
                         # we add 'int' at the beginning to tell it is the
                         # default format we want in case of conversion
                         ensure_2d=False, ensure_min_samples=False,
                         ensure_min_features=False, warn_on_dtype=True)
    self._check_index(self.X.shape[0], self.c)
    self.shape = (len(c) if hasattr(c, '__len__') else 0, self.X.shape[1])

  def __getitem__(self, item):
    return ConstrainedDataset(self.X, self.c[item])

  def __len__(self):
    return self.shape[0]  # number of constraints

  def __str__(self):
    return self.toarray().__str__()

  def __repr__(self):
    return self.toarray().__repr__()

  def toarray(self):
    return self.X[self.c]

  @staticmethod
  def _check_index(length, indices):
Review comment (Member): maybe this could also check for potential duplicates? could simply show a warning when this is the case. (one could also remove them but this might create problems later when constraint labels are used)

Reply (@wdevazelhes, Member Author, Mar 5, 2018): I agree, I will implement it in a next commit

    max_index = np.max(indices)
    min_index = np.min(indices)
    pb_index = None
    if max_index >= length:
      pb_index = max_index
    elif min_index < -length:  # allow negative indexing down to -length
      pb_index = min_index
    if pb_index is not None:
      raise IndexError("ConstrainedDataset cannot be created: the length of "
                       "the dataset is {}, so index {} is out of range."
                       .format(length, pb_index))
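
Following the review suggestion above, a duplicate check that only warns could look like this (hypothetical sketch, not part of the PR; warnings is already imported at the top of constraints.py):

  @staticmethod
  def _check_duplicates(indices):
    # hypothetical helper: warn when the same constraint appears twice
    # (np.unique with axis=0 requires numpy >= 1.13)
    unique_indices = np.unique(indices, axis=0)
    if len(unique_indices) < len(indices):
      warnings.warn('ConstrainedDataset contains duplicate constraints.')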

  @staticmethod
  def pairs_from_labels(y):
    # TODO: to be implemented
    raise NotImplementedError

  @staticmethod
  def triplets_from_labels(y):
    # TODO: to be implemented
    raise NotImplementedError
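
To make the semantics above concrete, a small usage sketch (arbitrary data, not taken from the PR):

import numpy as np
from metric_learn.constraints import ConstrainedDataset

X = np.random.randn(10, 3)               # 10 points with 3 features
c = np.array([[0, 1], [2, 3], [4, 5]])   # 3 pair constraints, as row indices into X
cd = ConstrainedDataset(X, c)

cd.shape      # (3, 3): number of constraints, number of features
cd.toarray()  # X[c], an array of shape (3, 2, 3)
cd[:2]        # a new ConstrainedDataset over the first 2 constraints, sharing X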
4 changes: 2 additions & 2 deletions metric_learn/covariance.py
@@ -12,10 +12,10 @@
import numpy as np
from sklearn.utils.validation import check_array

-from .base_metric import BaseMetricLearner
+from .base_metric import SupervisedMetricLearner


-class Covariance(BaseMetricLearner):
+class Covariance(SupervisedMetricLearner):
Review comment (Contributor): I understand why this was chosen, but this particular base class made me stop and consider a moment. We may eventually want a base class for unsupervised methods as well.

Review comment (Member): I agree, this simple covariance method should not be a SupervisedMetricLearner (as it is completely unsupervised). Whether we will really need an unsupervised class in the long run is unclear, but maybe the best for now is to create an UnsupervisedMetricLearner class which takes only X in fit.

Reply (Member Author): Thanks for the review! I agree, I did not notice, but indeed Covariance is unsupervised, so I will change this in a following PR.

  def __init__(self):
    pass
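
A possible shape for the class suggested in the thread above (hypothetical, not in this PR):

class UnsupervisedMetricLearner(BaseMetricLearner):

  def fit(self, X):
    # unsupervised learners see only the data: no labels, no constraints
    raise NotImplementedError

Covariance could then inherit from UnsupervisedMetricLearner and keep a fit that takes only X.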
6 changes: 3 additions & 3 deletions metric_learn/itml.py
@@ -19,12 +19,12 @@
from sklearn.metrics import pairwise_distances
from sklearn.utils.validation import check_array, check_X_y

-from .base_metric import BaseMetricLearner
+from .base_metric import PairsMetricLearner, SupervisedMetricLearner
from .constraints import Constraints
from ._util import vector_norm


-class ITML(BaseMetricLearner):
+class ITML(PairsMetricLearner):
  """Information Theoretic Metric Learning (ITML)"""
  def __init__(self, gamma=1., max_iter=1000, convergence_threshold=1e-3,
               A0=None, verbose=False):

@@ -140,7 +140,7 @@ def metric(self):
    return self.A_


-class ITML_Supervised(ITML):
+class ITML_Supervised(ITML, SupervisedMetricLearner):
  """Information Theoretic Metric Learning (ITML)"""
  def __init__(self, gamma=1., max_iter=1000, convergence_threshold=1e-3,
               num_labeled=np.inf, num_constraints=None, bounds=None, A0=None,
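
Note that the supervised variants now rely on multiple inheritance, so Python's method resolution order decides which base provides a given method; a quick way to inspect it (illustrative):

from metric_learn.itml import ITML_Supervised
print(ITML_Supervised.__mro__)
# ITML_Supervised, ITML, PairsMetricLearner, WeaklySupervisedMetricLearner,
# SupervisedMetricLearner, BaseMetricLearner, ..., object

In particular, methods defined on ITML take precedence over those on SupervisedMetricLearner.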
4 changes: 2 additions & 2 deletions metric_learn/lfda.py
@@ -18,10 +18,10 @@
from sklearn.metrics import pairwise_distances
from sklearn.utils.validation import check_X_y

-from .base_metric import BaseMetricLearner
+from .base_metric import SupervisedMetricLearner


-class LFDA(BaseMetricLearner):
+class LFDA(SupervisedMetricLearner):
  '''
  Local Fisher Discriminant Analysis for Supervised Dimensionality Reduction
  Sugiyama, ICML 2006
4 changes: 2 additions & 2 deletions metric_learn/lmnn.py
@@ -17,11 +17,11 @@
from sklearn.utils.validation import check_X_y, check_array
from sklearn.metrics import euclidean_distances

-from .base_metric import BaseMetricLearner
+from .base_metric import SupervisedMetricLearner


# commonality between LMNN implementations
-class _base_LMNN(BaseMetricLearner):
+class _base_LMNN(SupervisedMetricLearner):
  def __init__(self, k=3, min_iter=50, max_iter=1000, learn_rate=1e-7,
               regularization=0.5, convergence_tol=0.001, use_pca=True,
               verbose=False):
6 changes: 3 additions & 3 deletions metric_learn/lsml.py
@@ -13,11 +13,11 @@
from six.moves import xrange
from sklearn.utils.validation import check_array, check_X_y

-from .base_metric import BaseMetricLearner
+from .base_metric import PairsMetricLearner, SupervisedMetricLearner
from .constraints import Constraints


-class LSML(BaseMetricLearner):
+class LSML(PairsMetricLearner):
  def __init__(self, tol=1e-3, max_iter=1000, prior=None, verbose=False):
    """Initialize LSML.

@@ -131,7 +131,7 @@ def _gradient(self, metric):
    return dMetric


-class LSML_Supervised(LSML):
+class LSML_Supervised(LSML, SupervisedMetricLearner):
  def __init__(self, tol=1e-3, max_iter=1000, prior=None, num_labeled=np.inf,
               num_constraints=None, weights=None, verbose=False):
    """Initialize the learner.
4 changes: 2 additions & 2 deletions metric_learn/mlkr.py
@@ -13,12 +13,12 @@
from sklearn.decomposition import PCA
from sklearn.utils.validation import check_X_y

-from .base_metric import BaseMetricLearner
+from .base_metric import SupervisedMetricLearner

EPS = np.finfo(float).eps


-class MLKR(BaseMetricLearner):
+class MLKR(SupervisedMetricLearner):
  """Metric Learning for Kernel Regression (MLKR)"""
  def __init__(self, num_dims=None, A0=None, epsilon=0.01, alpha=0.0001,
               max_iter=1000):
6 changes: 3 additions & 3 deletions metric_learn/mmc.py
@@ -22,13 +22,13 @@
from sklearn.metrics import pairwise_distances
from sklearn.utils.validation import check_array, check_X_y

-from .base_metric import BaseMetricLearner
+from .base_metric import PairsMetricLearner, SupervisedMetricLearner
from .constraints import Constraints
from ._util import vector_norm



-class MMC(BaseMetricLearner):
+class MMC(PairsMetricLearner):
  """Mahalanobis Metric for Clustering (MMC)"""
  def __init__(self, max_iter=100, max_proj=10000, convergence_threshold=1e-3,
               A0=None, diagonal=False, diagonal_c=1.0, verbose=False):

@@ -380,7 +380,7 @@ def transformer(self):
    return V.T * np.sqrt(np.maximum(0, w[:,None]))


-class MMC_Supervised(MMC):
+class MMC_Supervised(MMC, SupervisedMetricLearner):
  """Mahalanobis Metric for Clustering (MMC)"""
  def __init__(self, max_iter=100, max_proj=10000, convergence_threshold=1e-6,
               num_labeled=np.inf, num_constraints=None,
4 changes: 2 additions & 2 deletions metric_learn/nca.py
@@ -8,12 +8,12 @@
from six.moves import xrange
from sklearn.utils.validation import check_X_y

-from .base_metric import BaseMetricLearner
+from .base_metric import SupervisedMetricLearner

EPS = np.finfo(float).eps


-class NCA(BaseMetricLearner):
+class NCA(SupervisedMetricLearner):
  def __init__(self, num_dims=None, max_iter=100, learning_rate=0.01):
    self.num_dims = num_dims
    self.max_iter = max_iter
4 changes: 2 additions & 2 deletions metric_learn/rca.py
@@ -18,7 +18,7 @@
from sklearn import decomposition
from sklearn.utils.validation import check_array

-from .base_metric import BaseMetricLearner
+from .base_metric import SupervisedMetricLearner
from .constraints import Constraints


@@ -35,7 +35,7 @@ def _chunk_mean_centering(data, chunks):
  return chunk_mask, chunk_data


-class RCA(BaseMetricLearner):
+class RCA(SupervisedMetricLearner):
  """Relevant Components Analysis (RCA)"""
  def __init__(self, num_dims=None, pca_comps=None):
    """Initialize the learner.
6 changes: 3 additions & 3 deletions metric_learn/sdml.py
@@ -15,11 +15,11 @@
from sklearn.utils.extmath import pinvh
from sklearn.utils.validation import check_array

-from .base_metric import BaseMetricLearner
+from .base_metric import PairsMetricLearner, SupervisedMetricLearner
from .constraints import Constraints


-class SDML(BaseMetricLearner):
+class SDML(PairsMetricLearner):
  def __init__(self, balance_param=0.5, sparsity_param=0.01, use_cov=True,
               verbose=False):
    """

@@ -80,7 +80,7 @@ def fit(self, X, W):
    return self


-class SDML_Supervised(SDML):
+class SDML_Supervised(SDML, SupervisedMetricLearner):
  def __init__(self, balance_param=0.5, sparsity_param=0.01, use_cov=True,
               num_labeled=np.inf, num_constraints=None, verbose=False):
    """
74 changes: 74 additions & 0 deletions test/test_constrained_dataset.py
@@ -0,0 +1,74 @@
import unittest
import numpy as np
from metric_learn.constraints import ConstrainedDataset
from numpy.testing import assert_array_equal
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.utils.testing import assert_raise_message

X = np.random.randn(20, 5)
Review comment (Contributor): It would be nice to test sparse X as well. To do so, I'd write something like this:

class _BaseTestConstrainedDataset(unittest.TestCase):
  # everything currently under TestConstrainedDataset, but using self.X instead of X

class TestDenseConstrainedDataset(_BaseTestConstrainedDataset):
  def setUp(self):
    self.X = np.random.randn(20, 5)
    self.c = ...  # and so on

class TestSparseConstrainedDataset(_BaseTestConstrainedDataset):
  # similar, but setUp creates a sparse X

Review comment (Member): maybe it would make sense to also test other data types? like lists instead of numpy arrays

Reply (Member Author): I agree, I will add these tests in a next commit

c = np.random.randint(0, X.shape[0], (15, 2))
cd = ConstrainedDataset(X, c)
y = np.random.randint(0, 2, c.shape[0])
group = np.random.randint(0, 3, c.shape[0])

c_shape = c.shape[0]
Review comment (Contributor): This is only used by test_getitem, so I'd move it there. (And probably name it num_constraints)

Reply (@bellet, Member, Mar 14, 2018): actually this is also used in test_shape (although not through this variable); it could be defined before y and group, since they also use c.shape[0]

Reply (Member Author): I agree, I will rename it and put it before the definitions that use it



class TestConstrainedDataset(unittest.TestCase):

  @staticmethod
  def check_indexing(idx):
    # checks that indexing returns the data we expect
    np.testing.assert_array_equal(cd[idx].c, c[idx])
    np.testing.assert_array_equal(cd[idx].toarray(), X[c[idx]])
    np.testing.assert_array_equal(cd[idx].toarray(), X[c][idx])

  def test_inputs(self):
    # test the allowed and forbidden ways to create a ConstrainedDataset
Review comment (Contributor): I'd split this into two separate tests, test_allowed_inputs and test_invalid_inputs.

Reply (Member Author): I agree, will do
    ConstrainedDataset(X, c)
    two_points = [[1, 2], [3, 5]]
    out_of_range_constraints = [[1, 2], [0, 1]]
    msg = "ConstrainedDataset cannot be created: the length of " \
          "the dataset is 2, so index 2 is out of " \
          "range."
Review comment (Contributor): Similar to imports, use parentheses here:

msg = ("First part of string. "
       "Second part of string.")

Reply (Member Author): thanks, will do

    assert_raise_message(IndexError, msg, ConstrainedDataset, two_points,
                         out_of_range_constraints)

  def test_getitem(self):
    # test different types of slicing
    i = np.random.randint(1, c_shape - 1)
    begin = np.random.randint(1, c_shape - 1)
    end = np.random.randint(begin + 1, c_shape)
    fancy_index = np.random.randint(0, c_shape, 20)
    binary_index = np.random.randint(0, 2, c_shape)
    boolean_index = binary_index.astype(bool)
    items = [0, c_shape - 1, i, slice(i), slice(0, begin),
             slice(begin, end), slice(end, c_shape), slice(0, c_shape),
             fancy_index, binary_index, boolean_index]
    for item in items:
      self.check_indexing(item)

  def test_repr(self):
    assert repr(cd) == repr(X[c])
Review comment (Contributor): Use self.assertEqual(...) instead of bare asserts.

Reply (Member Author): Thanks!


  def test_str(self):
    assert str(cd) == str(X[c])

  def test_shape(self):
    assert cd.shape == (c.shape[0], X.shape[1])
    assert cd[0, 0].shape == (0, X.shape[1])

  def test_toarray(self):
    assert_array_equal(cd.toarray(), cd.X[c])

  def test_folding(self):
    # test that ConstrainedDataset is compatible with scikit-learn folding
    shuffle_list = [True, False]
    groups_list = [group, None]
    for alg in [KFold, StratifiedKFold]:
      for shuffle_i in shuffle_list:
        for group_i in groups_list:
          for train_idx, test_idx in alg(
              shuffle=shuffle_i).split(cd, y, group_i):
            self.check_indexing(train_idx)
            self.check_indexing(test_idx)
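
The folding test works because scikit-learn's splitters only need a number of samples (taken here from cd.shape) and yield plain index arrays, which ConstrainedDataset.__getitem__ accepts. Standalone, the same pattern looks like this (a sketch reusing the module-level cd and y defined above):

from sklearn.model_selection import KFold

for train_idx, test_idx in KFold(n_splits=3).split(cd, y):
  cd_train, cd_test = cd[train_idx], cd[test_idx]  # constraints are split
  y_train, y_test = y[train_idx], y[test_idx]      # the underlying X is shared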