From 7b3d739ee968770112c55777a373d06ffc99a1d3 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Fri, 25 May 2018 16:25:58 +0200 Subject: [PATCH 001/120] WIP create MahalanobisMixin --- metric_learn/base_metric.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index 02519de1..3221f4bd 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -1,6 +1,8 @@ from numpy.linalg import inv, cholesky from sklearn.base import BaseEstimator, TransformerMixin from sklearn.utils.validation import check_array +from abc import ABCMeta, abstractmethod +import six class BaseMetricLearner(BaseEstimator, TransformerMixin): @@ -49,3 +51,23 @@ def transform(self, X=None): X = check_array(X, accept_sparse=True) L = self.transformer() return X.dot(L.T) + + +class MahalanobisMixin(six.with_metaclass(ABCMeta)): + """Mahalanobis metric learning algorithms. + + Algorithm that learns a Mahalanobis (pseudo) distance :math:`d_M(x, x')`, + defined between two column vectors :math:`x` and :math:`x'` by: + :math:`d_M(x, x') = \sqrt{(x-x')^T M (x-x')}`, where :math:`M` is the + learned square matrix. + + Attributes + ---------- + metric_: `np.ndarray`, shape=(n_features, n_features) + The learned Mahalanobis matrix. 
+ """ + + @property + @abstractmethod + def metric_(self): + pass From f21cc8516c9640f1bcb7229a5ebce14448c91644 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Fri, 25 May 2018 18:42:51 +0200 Subject: [PATCH 002/120] ENH Update algorithms with Mahalanobis Mixin: - Make them inherit from Mahalanobis Mixin, and implement the metric_ property - Improve metric_ property by checking if it exists and raising the appropriate warning if not - Make tests work, by replacing metric() with metric_ --- metric_learn/base_metric.py | 14 +------------- metric_learn/covariance.py | 10 ++++++---- metric_learn/itml.py | 19 +++++++++++++++---- metric_learn/lfda.py | 11 ++++++++--- metric_learn/lmnn.py | 11 ++++++++--- metric_learn/lsml.py | 19 +++++++++++++++---- metric_learn/mlkr.py | 21 ++++++++++++++++++--- metric_learn/mmc.py | 19 +++++++++++++++---- metric_learn/nca.py | 11 ++++++++--- metric_learn/rca.py | 11 ++++++++--- metric_learn/sdml.py | 10 ++++++---- test/metric_learn_test.py | 6 +++--- test/test_transformer_metric_conversion.py | 18 +++++++++--------- 13 files changed, 120 insertions(+), 60 deletions(-) diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index 3221f4bd..11ebe6e2 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -9,18 +9,6 @@ class BaseMetricLearner(BaseEstimator, TransformerMixin): def __init__(self): raise NotImplementedError('BaseMetricLearner should not be instantiated') - def metric(self): - """Computes the Mahalanobis matrix from the transformation matrix. - - .. math:: M = L^{\\top} L - - Returns - ------- - M : (d x d) matrix - """ - L = self.transformer() - return L.T.dot(L) - def transformer(self): """Computes the transformation matrix from the Mahalanobis matrix. @@ -30,7 +18,7 @@ def transformer(self): ------- L : upper triangular (d x d) matrix """ - return cholesky(self.metric()).T + return cholesky(self.metric_).T def transform(self, X=None): """Applies the metric transformation. 
diff --git a/metric_learn/covariance.py b/metric_learn/covariance.py index 8fc07873..7a11a0c0 100644 --- a/metric_learn/covariance.py +++ b/metric_learn/covariance.py @@ -10,16 +10,18 @@ from __future__ import absolute_import import numpy as np -from sklearn.utils.validation import check_array +from sklearn.utils.validation import check_array, check_is_fitted -from .base_metric import BaseMetricLearner +from .base_metric import BaseMetricLearner, MahalanobisMixin -class Covariance(BaseMetricLearner): +class Covariance(BaseMetricLearner, MahalanobisMixin): def __init__(self): pass - def metric(self): + @property + def metric_(self): + check_is_fitted(self, 'M_') return self.M_ def fit(self, X, y=None): diff --git a/metric_learn/itml.py b/metric_learn/itml.py index 4d719591..e05327e0 100644 --- a/metric_learn/itml.py +++ b/metric_learn/itml.py @@ -18,13 +18,14 @@ from six.moves import xrange from sklearn.metrics import pairwise_distances from sklearn.utils.validation import check_array, check_X_y +from sklearn.exceptions import NotFittedError -from .base_metric import BaseMetricLearner +from .base_metric import BaseMetricLearner, MahalanobisMixin from .constraints import Constraints, wrap_pairs from ._util import vector_norm -class ITML(BaseMetricLearner): +class ITML(BaseMetricLearner, MahalanobisMixin): """Information Theoretic Metric Learning (ITML)""" def __init__(self, gamma=1., max_iter=1000, convergence_threshold=1e-3, A0=None, verbose=False): @@ -147,8 +148,18 @@ def fit(self, pairs, y, bounds=None): self.n_iter_ = it return self - def metric(self): - return self.A_ + @property + def metric_(self): + if hasattr(self, 'A_'): + return self.A_ # in this case the estimator is fitted + elif self.A0 is not None: + return check_array(self.A0) + else: # extracted from scikit-learn's check_is_fitted function + msg = ("This %(name)s instance is not fitted yet, and is neither " + "initialized with an explicit matrix. 
Call 'fit' with appropriate" + " arguments before using this method, or initialize the metric_ " + "with ``A0`` equals a matrix, not None.") + raise NotFittedError(msg % {'name': type(self).__name__}) class ITML_Supervised(ITML): diff --git a/metric_learn/lfda.py b/metric_learn/lfda.py index 809f092b..ac859e60 100644 --- a/metric_learn/lfda.py +++ b/metric_learn/lfda.py @@ -16,12 +16,12 @@ import warnings from six.moves import xrange from sklearn.metrics import pairwise_distances -from sklearn.utils.validation import check_X_y +from sklearn.utils.validation import check_X_y, check_is_fitted -from .base_metric import BaseMetricLearner +from .base_metric import BaseMetricLearner, MahalanobisMixin -class LFDA(BaseMetricLearner): +class LFDA(BaseMetricLearner, MahalanobisMixin): ''' Local Fisher Discriminant Analysis for Supervised Dimensionality Reduction Sugiyama, ICML 2006 @@ -54,6 +54,11 @@ def __init__(self, num_dims=None, k=None, embedding_type='weighted'): def transformer(self): return self.transformer_ + @property + def metric_(self): + check_is_fitted(self, 'transformer_') + return self.transformer_.T.dot(self.transformer_) + def _process_inputs(self, X, y): unique_classes, y = np.unique(y, return_inverse=True) self.X_, y = check_X_y(X, y) diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py index dea12f0c..a5928584 100644 --- a/metric_learn/lmnn.py +++ b/metric_learn/lmnn.py @@ -14,14 +14,14 @@ import warnings from collections import Counter from six.moves import xrange -from sklearn.utils.validation import check_X_y, check_array +from sklearn.utils.validation import check_X_y, check_array, check_is_fitted from sklearn.metrics import euclidean_distances -from .base_metric import BaseMetricLearner +from .base_metric import BaseMetricLearner, MahalanobisMixin # commonality between LMNN implementations -class _base_LMNN(BaseMetricLearner): +class _base_LMNN(BaseMetricLearner, MahalanobisMixin): def __init__(self, k=3, min_iter=50, max_iter=1000, 
learn_rate=1e-7, regularization=0.5, convergence_tol=0.001, use_pca=True, verbose=False): @@ -47,6 +47,11 @@ def __init__(self, k=3, min_iter=50, max_iter=1000, learn_rate=1e-7, def transformer(self): return self.L_ + @property + def metric_(self): + check_is_fitted(self, 'L_') + return self.L_.T.dot(self.L_) + # slower Python version class python_LMNN(_base_LMNN): diff --git a/metric_learn/lsml.py b/metric_learn/lsml.py index b8b69f19..701eb84b 100644 --- a/metric_learn/lsml.py +++ b/metric_learn/lsml.py @@ -11,13 +11,14 @@ import numpy as np import scipy.linalg from six.moves import xrange +from sklearn.exceptions import NotFittedError from sklearn.utils.validation import check_array, check_X_y -from .base_metric import BaseMetricLearner +from .base_metric import BaseMetricLearner, MahalanobisMixin from .constraints import Constraints, wrap_pairs -class LSML(BaseMetricLearner): +class LSML(BaseMetricLearner, MahalanobisMixin): def __init__(self, tol=1e-3, max_iter=1000, prior=None, verbose=False): """Initialize LSML. @@ -57,8 +58,18 @@ def _prepare_quadruplets(self, quadruplets, weights): self.M_ = self.prior self.prior_inv_ = np.linalg.inv(self.prior) - def metric(self): - return self.M_ + @property + def metric_(self): + if hasattr(self, 'M_'): + return self.M_ # in this case the estimator is fitted + elif self.prior is not None: + return check_array(self.prior) + else: # extracted from scikit-learn's check_is_fitted function + msg = ("This %(name)s instance is not fitted yet, and is neither " + "initialized with an explicit matrix. Call 'fit' with appropriate" + " arguments before using this method, or initialize the metric_ " + "with ``prior`` equals a matrix, not None.") + raise NotFittedError(msg % {'name': type(self).__name__}) def fit(self, quadruplets, weights=None): """Learn the LSML model. 
diff --git a/metric_learn/mlkr.py b/metric_learn/mlkr.py index 35b80495..5a2032ab 100644 --- a/metric_learn/mlkr.py +++ b/metric_learn/mlkr.py @@ -11,14 +11,15 @@ from scipy.optimize import minimize from scipy.spatial.distance import pdist, squareform from sklearn.decomposition import PCA -from sklearn.utils.validation import check_X_y +from sklearn.exceptions import NotFittedError +from sklearn.utils.validation import check_X_y, check_array -from .base_metric import BaseMetricLearner +from .base_metric import BaseMetricLearner, MahalanobisMixin EPS = np.finfo(float).eps -class MLKR(BaseMetricLearner): +class MLKR(BaseMetricLearner, MahalanobisMixin): """Metric Learning for Kernel Regression (MLKR)""" def __init__(self, num_dims=None, A0=None, epsilon=0.01, alpha=0.0001, max_iter=1000): @@ -93,6 +94,20 @@ def fit(self, X, y): def transformer(self): return self.transformer_ + @property + def metric_(self): + if hasattr(self, 'transformer_'): + return self.transformer_.T.dot(self.transformer_) # in this case the + # estimator is fitted + elif self.A0 is not None: + return check_array(self.A0.T.dot(self.A0)) + else: # extracted from scikit-learn's check_is_fitted function + msg = ("This %(name)s instance is not fitted yet, and is neither " + "initialized with an explicit matrix. 
Call 'fit' with appropriate" + " arguments before using this method, or initialize the metric_ " + "with ``A0`` equals a matrix, not None.") + raise NotFittedError(msg % {'name': type(self).__name__}) + def _loss(flatA, X, y, dX): A = flatA.reshape((-1, X.shape[1])) diff --git a/metric_learn/mmc.py b/metric_learn/mmc.py index a72fa14b..737422df 100644 --- a/metric_learn/mmc.py +++ b/metric_learn/mmc.py @@ -19,16 +19,17 @@ from __future__ import print_function, absolute_import, division import numpy as np from six.moves import xrange +from sklearn.exceptions import NotFittedError from sklearn.metrics import pairwise_distances from sklearn.utils.validation import check_array, check_X_y -from .base_metric import BaseMetricLearner +from .base_metric import BaseMetricLearner, MahalanobisMixin from .constraints import Constraints, wrap_pairs from ._util import vector_norm -class MMC(BaseMetricLearner): +class MMC(BaseMetricLearner, MahalanobisMixin): """Mahalanobis Metric for Clustering (MMC)""" def __init__(self, max_iter=100, max_proj=10000, convergence_threshold=1e-3, A0=None, diagonal=False, diagonal_c=1.0, verbose=False): @@ -366,8 +367,18 @@ def _D_constraint(self, neg_pairs, w): sum_deri2 / sum_dist - np.outer(sum_deri1, sum_deri1) / (sum_dist * sum_dist) ) - def metric(self): - return self.A_ + @property + def metric_(self): + if hasattr(self, 'A_'): + return self.A_ # in this case the estimator is fitted + elif self.A0 is not None: + return check_array(self.A0) + else: # extracted from scikit-learn's check_is_fitted function + msg = ("This %(name)s instance is not fitted yet, and is neither " + "initialized with an explicit matrix. Call 'fit' with appropriate" + " arguments before using this method, or initialize the metric_ " + "with ``A0`` equals a matrix, not None.") + raise NotFittedError(msg % {'name': type(self).__name__}) def transformer(self): """Computes the transformation matrix from the Mahalanobis matrix. 
diff --git a/metric_learn/nca.py b/metric_learn/nca.py index 40757d23..e1c70660 100644 --- a/metric_learn/nca.py +++ b/metric_learn/nca.py @@ -6,14 +6,14 @@ from __future__ import absolute_import import numpy as np from six.moves import xrange -from sklearn.utils.validation import check_X_y +from sklearn.utils.validation import check_X_y, check_is_fitted -from .base_metric import BaseMetricLearner +from .base_metric import BaseMetricLearner, MahalanobisMixin EPS = np.finfo(float).eps -class NCA(BaseMetricLearner): +class NCA(BaseMetricLearner, MahalanobisMixin): def __init__(self, num_dims=None, max_iter=100, learning_rate=0.01): self.num_dims = num_dims self.max_iter = max_iter @@ -22,6 +22,11 @@ def __init__(self, num_dims=None, max_iter=100, learning_rate=0.01): def transformer(self): return self.A_ + @property + def metric_(self): + check_is_fitted(self, 'A_') + return self.A_.T.dot(self.A_) + def fit(self, X, y): """ X: data matrix, (n x d) diff --git a/metric_learn/rca.py b/metric_learn/rca.py index 0d9b3620..d9714bfd 100644 --- a/metric_learn/rca.py +++ b/metric_learn/rca.py @@ -16,9 +16,9 @@ import warnings from six.moves import xrange from sklearn import decomposition -from sklearn.utils.validation import check_array +from sklearn.utils.validation import check_array, check_is_fitted -from .base_metric import BaseMetricLearner +from .base_metric import BaseMetricLearner, MahalanobisMixin from .constraints import Constraints @@ -35,7 +35,7 @@ def _chunk_mean_centering(data, chunks): return chunk_mask, chunk_data -class RCA(BaseMetricLearner): +class RCA(BaseMetricLearner, MahalanobisMixin): """Relevant Components Analysis (RCA)""" def __init__(self, num_dims=None, pca_comps=None): """Initialize the learner. 
@@ -58,6 +58,11 @@ def __init__(self, num_dims=None, pca_comps=None): def transformer(self): return self.transformer_ + @property + def metric_(self): + check_is_fitted(self, 'transformer_') + return self.transformer_.T.dot(self.transformer_) + def _process_data(self, X): self.X_ = X = check_array(X) diff --git a/metric_learn/sdml.py b/metric_learn/sdml.py index 19919ab1..2624e52d 100644 --- a/metric_learn/sdml.py +++ b/metric_learn/sdml.py @@ -13,13 +13,13 @@ from scipy.sparse.csgraph import laplacian from sklearn.covariance import graph_lasso from sklearn.utils.extmath import pinvh -from sklearn.utils.validation import check_array, check_X_y +from sklearn.utils.validation import check_array, check_X_y, check_is_fitted -from .base_metric import BaseMetricLearner +from .base_metric import BaseMetricLearner, MahalanobisMixin from .constraints import Constraints, wrap_pairs -class SDML(BaseMetricLearner): +class SDML(BaseMetricLearner, MahalanobisMixin): def __init__(self, balance_param=0.5, sparsity_param=0.01, use_cov=True, verbose=False): """ @@ -54,7 +54,9 @@ def _prepare_pairs(self, pairs, y): diff = pairs[:, 0] - pairs[:, 1] return (diff.T * y).dot(diff) - def metric(self): + @property + def metric_(self): + check_is_fitted(self, 'M_') return self.M_ def fit(self, pairs, y): diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py index 1756b105..9bcf5c1a 100644 --- a/test/metric_learn_test.py +++ b/test/metric_learn_test.py @@ -114,7 +114,7 @@ def test_iris(self): self.assertLess(csep, 0.15) # Sanity checks for learned matrices. 
- self.assertEqual(lfda.metric().shape, (4, 4)) + self.assertEqual(lfda.metric_.shape, (4, 4)) self.assertEqual(lfda.transformer().shape, (2, 4)) @@ -166,13 +166,13 @@ def test_iris(self): [+0.00083371, +0.00149466, -0.00200719, -0.00296284], [-0.00111959, -0.00200719, +0.00269546, +0.00397881], [-0.00165265, -0.00296284, +0.00397881, +0.00587320]] - assert_array_almost_equal(expected, mmc.metric(), decimal=6) + assert_array_almost_equal(expected, mmc.metric_, decimal=6) # Diagonal metric mmc = MMC(diagonal=True) mmc.fit(*wrap_pairs(self.iris_points, [a,b,c,d])) expected = [0, 0, 1.21045968, 1.22552608] - assert_array_almost_equal(np.diag(expected), mmc.metric(), decimal=6) + assert_array_almost_equal(np.diag(expected), mmc.metric_, decimal=6) # Supervised Full mmc = MMC_Supervised() diff --git a/test/test_transformer_metric_conversion.py b/test/test_transformer_metric_conversion.py index e027d176..981009d9 100644 --- a/test/test_transformer_metric_conversion.py +++ b/test/test_transformer_metric_conversion.py @@ -20,60 +20,60 @@ def test_cov(self): cov = Covariance() cov.fit(self.X) L = cov.transformer() - assert_array_almost_equal(L.T.dot(L), cov.metric()) + assert_array_almost_equal(L.T.dot(L), cov.metric_) def test_lsml_supervised(self): seed = np.random.RandomState(1234) lsml = LSML_Supervised(num_constraints=200) lsml.fit(self.X, self.y, random_state=seed) L = lsml.transformer() - assert_array_almost_equal(L.T.dot(L), lsml.metric()) + assert_array_almost_equal(L.T.dot(L), lsml.metric_) def test_itml_supervised(self): seed = np.random.RandomState(1234) itml = ITML_Supervised(num_constraints=200) itml.fit(self.X, self.y, random_state=seed) L = itml.transformer() - assert_array_almost_equal(L.T.dot(L), itml.metric()) + assert_array_almost_equal(L.T.dot(L), itml.metric_) def test_lmnn(self): lmnn = LMNN(k=5, learn_rate=1e-6, verbose=False) lmnn.fit(self.X, self.y) L = lmnn.transformer() - assert_array_almost_equal(L.T.dot(L), lmnn.metric()) + 
assert_array_almost_equal(L.T.dot(L), lmnn.metric_) def test_sdml_supervised(self): seed = np.random.RandomState(1234) sdml = SDML_Supervised(num_constraints=1500) sdml.fit(self.X, self.y, random_state=seed) L = sdml.transformer() - assert_array_almost_equal(L.T.dot(L), sdml.metric()) + assert_array_almost_equal(L.T.dot(L), sdml.metric_) def test_nca(self): n = self.X.shape[0] nca = NCA(max_iter=(100000//n), learning_rate=0.01) nca.fit(self.X, self.y) L = nca.transformer() - assert_array_almost_equal(L.T.dot(L), nca.metric()) + assert_array_almost_equal(L.T.dot(L), nca.metric_) def test_lfda(self): lfda = LFDA(k=2, num_dims=2) lfda.fit(self.X, self.y) L = lfda.transformer() - assert_array_almost_equal(L.T.dot(L), lfda.metric()) + assert_array_almost_equal(L.T.dot(L), lfda.metric_) def test_rca_supervised(self): seed = np.random.RandomState(1234) rca = RCA_Supervised(num_dims=2, num_chunks=30, chunk_size=2) rca.fit(self.X, self.y, random_state=seed) L = rca.transformer() - assert_array_almost_equal(L.T.dot(L), rca.metric()) + assert_array_almost_equal(L.T.dot(L), rca.metric_) def test_mlkr(self): mlkr = MLKR(num_dims=2) mlkr.fit(self.X, self.y) L = mlkr.transformer() - assert_array_almost_equal(L.T.dot(L), mlkr.metric()) + assert_array_almost_equal(L.T.dot(L), mlkr.metric_) if __name__ == '__main__': From f9e3c829c02200e5f3943dcfeecb346418893d93 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Mon, 11 Jun 2018 13:07:14 +0200 Subject: [PATCH 003/120] FIX: add missing import --- metric_learn/sdml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metric_learn/sdml.py b/metric_learn/sdml.py index 7cf04ae1..077aee31 100644 --- a/metric_learn/sdml.py +++ b/metric_learn/sdml.py @@ -17,7 +17,7 @@ check_is_fitted) from .base_metric import (BaseMetricLearner, MahalanobisMixin, - MetricTransformer) + MetricTransformer, _PairsClassifierMixin) from .constraints import Constraints, wrap_pairs From 1a32c11a3a8c07d7006505159498350ee5e0e841 Mon Sep 17 
00:00:00 2001 From: William de Vazelhes Date: Mon, 11 Jun 2018 14:23:38 +0200 Subject: [PATCH 004/120] FIX: update sklearn's function check_no_fit_attributes_set_in_init to new check_no_attributes_set_in_init" This new function was introduced through PR https://github.com/scikit-learn/scikit-learn/pull/9450 in scikit-learn. It also allows to pass tests that would otherwise not pass: indeed having abstract attributes as properties threw an error. But the new test functions handles well this property inheritance. --- test/test_weakly_supervised.py | 44 +++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/test/test_weakly_supervised.py b/test/test_weakly_supervised.py index 6386d22a..c03ce4d4 100644 --- a/test/test_weakly_supervised.py +++ b/test/test_weakly_supervised.py @@ -4,7 +4,7 @@ from sklearn.utils import shuffle, check_random_state from sklearn.utils.estimator_checks import is_public_parameter from sklearn.utils.testing import (assert_allclose_dense_sparse, - set_random_state) + set_random_state, _get_args) from metric_learn import ITML, MMC, SDML, LSML from metric_learn.constraints import wrap_pairs, Constraints @@ -99,22 +99,32 @@ def test_simple_estimator(estimator, build_dataset): @pytest.mark.parametrize('estimator', [est[0] for est in list_estimators], ids=ids_estimators) -def test_no_fit_attributes_set_in_init(estimator): - """Check that Estimator.__init__ doesn't set trailing-_ attributes.""" - # From scikit-learn - estimator = clone(estimator) - for attr in dir(estimator): - if attr.endswith("_") and not attr.startswith("__"): - # This check is for properties, they can be listed in dir - # while at the same time have hasattr return False as long - # as the property getter raises an AttributeError - assert hasattr(estimator, attr), \ - ("By convention, attributes ending with '_' are " - "estimated from data in scikit-learn. 
Consequently they " - "should not be initialized in the constructor of an " - "estimator but in the fit method. Attribute {!r} " - "was found in estimator {}".format( - attr, type(estimator).__name__)) +def test_no_attributes_set_in_init(estimator): + """Check setting during init. Taken from scikit-learn.""" + estimator = clone(estimator) + if hasattr(type(estimator).__init__, "deprecated_original"): + return + + init_params = _get_args(type(estimator).__init__) + parents_init_params = [param for params_parent in + (_get_args(parent) for parent in + type(estimator).__mro__) + for param in params_parent] + + # Test for no setting apart from parameters during init + invalid_attr = (set(vars(estimator)) - set(init_params) - + set(parents_init_params)) + assert not invalid_attr, \ + ("Estimator %s should not set any attribute apart" + " from parameters during init. Found attributes %s." + % (type(estimator).__name__, sorted(invalid_attr))) + # Ensure that each parameter is set in init + invalid_attr = (set(init_params) - set(vars(estimator)) - + set(["self"])) + assert not invalid_attr, \ + ("Estimator %s should store all parameters" + " as an attribute during init. Did not find " + "attributes %s." % (type(estimator).__name__, sorted(invalid_attr))) @pytest.mark.parametrize('estimator, build_dataset', list_estimators, From d0f50195f4dbabf24480c715636e1be87b416452 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Mon, 11 Jun 2018 16:03:07 +0200 Subject: [PATCH 005/120] FIX: take function ``_get_args`` from scikit-learn's PR https://github.com/scikit-learn/scikit-learn/pull/9450 Indeed, in the PR this function is modified to support python 2. This should solve the CI error. 
--- test/test_weakly_supervised.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/test/test_weakly_supervised.py b/test/test_weakly_supervised.py index c03ce4d4..146df297 100644 --- a/test/test_weakly_supervised.py +++ b/test/test_weakly_supervised.py @@ -4,7 +4,8 @@ from sklearn.utils import shuffle, check_random_state from sklearn.utils.estimator_checks import is_public_parameter from sklearn.utils.testing import (assert_allclose_dense_sparse, - set_random_state, _get_args) + set_random_state) +from sklearn.utils.fixes import signature from metric_learn import ITML, MMC, SDML, LSML from metric_learn.constraints import wrap_pairs, Constraints @@ -222,3 +223,23 @@ def test_dont_overwrite_parameters(estimator, build_dataset): " to change attributes started" " or ended with _, but" " %s changed" % ', '.join(attrs_changed_by_fit)) + + +def _get_args(function, varargs=False): + """Helper to get function arguments""" + + try: + params = signature(function).parameters + except ValueError: + # Error on builtin C function + return [] + args = [key for key, param in params.items() + if param.kind not in (param.VAR_POSITIONAL, param.VAR_KEYWORD)] + if varargs: + varargs = [param.name for param in params.values() + if param.kind == param.VAR_POSITIONAL] + if len(varargs) == 0: + varargs = None + return args, varargs + else: + return args From eba2a600c7b6c1120388ed366a73b88122974baa Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Thu, 14 Jun 2018 17:08:23 +0200 Subject: [PATCH 006/120] ENH: add transformer_ attribute and improve docstring --- metric_learn/base_metric.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index 28339b68..532547b8 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -50,14 +50,21 @@ class MahalanobisMixin(six.with_metaclass(ABCMeta)): """Mahalanobis metric learning 
algorithms. Algorithm that learns a Mahalanobis (pseudo) distance :math:`d_M(x, x')`, - defined between two column vectors :math:`x` and :math:`x'` by: - :math:`d_M(x, x') = \sqrt{(x-x')^T M (x-x')}`, where :math:`M` is the - learned square matrix. + defined between two column vectors :math:`x` and :math:`x'` by: :math:`d_M(x, + x') = \sqrt{(x-x')^T M (x-x')}`, where :math:`M` is a learned symmetric + positive semi-definite (PSD) matrix. The metric between points can then be + expressed as the euclidean distance between points embedded in a new space + through a linear transformation. Indeed, the above matrix can be decomposed + into the product of two transpose matrices (through SVD or Cholesky + decomposition): :math:`d_M(x, x')^2 = (x-x')^T M (x-x') = (x-x')^T L^T L + (x-x') = (L x - L x')^T (L x- L x')` Attributes ---------- - metric_: `np.ndarray`, shape=(n_features, n_features) - The learned Mahalanobis matrix. + metric_: `np.ndarray`, shape=(n_features_out, n_features) + The learned metric ``M``. + transformer_: `np.ndarray`, shape=(n_features_out, n_features) + The learned linear transformation ``L``. 
""" @property @@ -65,6 +72,11 @@ class MahalanobisMixin(six.with_metaclass(ABCMeta)): def metric_(self): pass + @property + @abstractmethod + def transformer_(self): + pass + class _PairsClassifierMixin: From b5d966f9c39a594fd3a381a83cd6454b8ac92c6e Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Mon, 18 Jun 2018 11:56:43 +0200 Subject: [PATCH 007/120] WIP: move transform() in BaseMetricLearner to transformer_from_metric() in MahalanobisMixin --- metric_learn/base_metric.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index 532547b8..54c5cfa6 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -11,17 +11,6 @@ class BaseMetricLearner(BaseEstimator): def __init__(self): raise NotImplementedError('BaseMetricLearner should not be instantiated') - def transformer(self): - """Computes the transformation matrix from the Mahalanobis matrix. - - L = cholesky(M).T - - Returns - ------- - L : upper triangular (d x d) matrix - """ - return cholesky(self.metric_).T - class MetricTransformer(TransformerMixin): @@ -77,6 +66,17 @@ def metric_(self): def transformer_(self): pass + def transformer_from_metric(self, metric): + """Computes the transformation matrix from the Mahalanobis matrix. 
+ + L = cholesky(M).T + + Returns + ------- + L : upper triangular (d x d) matrix + """ + return cholesky(metric).T + class _PairsClassifierMixin: From ee0d1bddd1bbc7093b59c6eda18f2476945bcace Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Mon, 18 Jun 2018 12:04:00 +0200 Subject: [PATCH 008/120] WIP: refactor metric to original formulation: a function, with result computed from the transformer --- metric_learn/base_metric.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index 54c5cfa6..40aafdfc 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -50,21 +50,12 @@ class MahalanobisMixin(six.with_metaclass(ABCMeta)): Attributes ---------- - metric_: `np.ndarray`, shape=(n_features_out, n_features) - The learned metric ``M``. transformer_: `np.ndarray`, shape=(n_features_out, n_features) The learned linear transformation ``L``. """ - @property - @abstractmethod - def metric_(self): - pass - - @property - @abstractmethod - def transformer_(self): - pass + def metric(self): + return self.transformer_.T.dot(self.transformer_) def transformer_from_metric(self, metric): """Computes the transformation matrix from the Mahalanobis matrix. 
From 6b5a3b59cb24bf1ea4db46c41f70e65271931841 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 19 Jun 2018 11:39:36 +0200 Subject: [PATCH 009/120] WIP: make all Mahalanobis Metric Learner algorithms have transformer_ and metric() --- metric_learn/base_metric.py | 8 ++--- metric_learn/covariance.py | 7 ++--- metric_learn/itml.py | 15 ++------- metric_learn/lfda.py | 8 ----- metric_learn/lmnn.py | 14 ++------- metric_learn/lsml.py | 15 ++------- metric_learn/mlkr.py | 17 ---------- metric_learn/mmc.py | 23 +++++--------- metric_learn/nca.py | 10 +----- metric_learn/rca.py | 8 ----- metric_learn/sdml.py | 7 ++--- test/metric_learn_test.py | 10 +++--- test/test_transformer_metric_conversion.py | 36 +++++++++++----------- 13 files changed, 46 insertions(+), 132 deletions(-) diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index 40aafdfc..9e056304 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -31,7 +31,7 @@ def transform(self, X=None): X = self.X_ else: X = check_array(X, accept_sparse=True) - L = self.transformer() + L = self.transformer_ return X.dot(L.T) @@ -88,7 +88,7 @@ def predict(self, pairs): The predicted learned metric value between samples in every pair. 
""" pairwise_diffs = pairs[:, 0, :] - pairs[:, 1, :] - return np.sqrt(np.sum(pairwise_diffs.dot(self.metric_) * pairwise_diffs, + return np.sqrt(np.sum(pairwise_diffs.dot(self.metric()) * pairwise_diffs, axis=1)) def decision_function(self, pairs): @@ -140,9 +140,9 @@ def predict(self, quadruplets): """ similar_diffs = quadruplets[:, 0, :] - quadruplets[:, 1, :] dissimilar_diffs = quadruplets[:, 2, :] - quadruplets[:, 3, :] - return (np.sqrt(np.sum(similar_diffs.dot(self.metric_) * + return (np.sqrt(np.sum(similar_diffs.dot(self.metric()) * similar_diffs, axis=1)) - - np.sqrt(np.sum(dissimilar_diffs.dot(self.metric_) * + np.sqrt(np.sum(dissimilar_diffs.dot(self.metric()) * dissimilar_diffs, axis=1))) def decision_function(self, quadruplets): diff --git a/metric_learn/covariance.py b/metric_learn/covariance.py index a9c26292..0b6e862c 100644 --- a/metric_learn/covariance.py +++ b/metric_learn/covariance.py @@ -21,11 +21,6 @@ class Covariance(BaseMetricLearner, MetricTransformer, def __init__(self): pass - @property - def metric_(self): - check_is_fitted(self, 'M_') - return self.M_ - def fit(self, X, y=None): """ X : data matrix, (n x d) @@ -37,4 +32,6 @@ def fit(self, X, y=None): self.M_ = 1./self.M_ else: self.M_ = np.linalg.inv(self.M_) + + self.transformer_ = self.transformer_from_metric(check_array(self.M_)) return self diff --git a/metric_learn/itml.py b/metric_learn/itml.py index b84d1c60..a068f6f9 100644 --- a/metric_learn/itml.py +++ b/metric_learn/itml.py @@ -130,20 +130,9 @@ def _fit(self, pairs, y, bounds=None): if self.verbose: print('itml converged at iter: %d, conv = %f' % (it, conv)) self.n_iter_ = it - return self - @property - def metric_(self): - if hasattr(self, 'A_'): - return self.A_ # in this case the estimator is fitted - elif self.A0 is not None: - return check_array(self.A0) - else: # extracted from scikit-learn's check_is_fitted function - msg = ("This %(name)s instance is not fitted yet, and is neither " - "initialized with an explicit 
matrix. Call 'fit' with appropriate" - " arguments before using this method, or initialize the metric_ " - "with ``A0`` equals a matrix, not None.") - raise NotFittedError(msg % {'name': type(self).__name__}) + self.transformer_ = self.transformer_from_metric(self.A_) + return self class ITML(_BaseITML, _PairsClassifierMixin): diff --git a/metric_learn/lfda.py b/metric_learn/lfda.py index 0276b400..8d3a1e22 100644 --- a/metric_learn/lfda.py +++ b/metric_learn/lfda.py @@ -53,14 +53,6 @@ def __init__(self, num_dims=None, k=None, embedding_type='weighted'): self.embedding_type = embedding_type self.k = k - def transformer(self): - return self.transformer_ - - @property - def metric_(self): - check_is_fitted(self, 'transformer_') - return self.transformer_.T.dot(self.transformer_) - def _process_inputs(self, X, y): unique_classes, y = np.unique(y, return_inverse=True) self.X_, y = check_X_y(X, y) diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py index e2a9a048..2f9985cf 100644 --- a/metric_learn/lmnn.py +++ b/metric_learn/lmnn.py @@ -46,14 +46,6 @@ def __init__(self, k=3, min_iter=50, max_iter=1000, learn_rate=1e-7, self.use_pca = use_pca self.verbose = verbose - def transformer(self): - return self.L_ - - @property - def metric_(self): - check_is_fitted(self, 'L_') - return self.L_.T.dot(self.L_) - # slower Python version class python_LMNN(_base_LMNN): @@ -67,7 +59,7 @@ def _process_inputs(self, X, labels): self.labels_ = np.arange(len(unique_labels)) if self.use_pca: warnings.warn('use_pca does nothing for the python_LMNN implementation') - self.L_ = np.eye(num_dims) + self.transformer_ = np.eye(num_dims) required_k = np.bincount(self.label_inds_).min() if self.k > required_k: raise ValueError('not enough class labels for specified k' @@ -99,7 +91,7 @@ def fit(self, X, y): # initialize gradient and L G = dfG * reg + df * (1-reg) - L = self.L_ + L = self.transformer_ objective = np.inf # main loop @@ -184,7 +176,7 @@ def fit(self, X, y): print("LMNN didn't 
converge in %d steps." % self.max_iter) # store the last L - self.L_ = L + self.transformer_ = L self.n_iter_ = it return self diff --git a/metric_learn/lsml.py b/metric_learn/lsml.py index f6f230d9..27b1704f 100644 --- a/metric_learn/lsml.py +++ b/metric_learn/lsml.py @@ -59,19 +59,6 @@ def _prepare_quadruplets(self, quadruplets, weights): self.M_ = self.prior self.prior_inv_ = np.linalg.inv(self.prior) - @property - def metric_(self): - if hasattr(self, 'M_'): - return self.M_ # in this case the estimator is fitted - elif self.prior is not None: - return check_array(self.prior) - else: # extracted from scikit-learn's check_is_fitted function - msg = ("This %(name)s instance is not fitted yet, and is neither " - "initialized with an explicit matrix. Call 'fit' with appropriate" - " arguments before using this method, or initialize the metric_ " - "with ``prior`` equals a matrix, not None.") - raise NotFittedError(msg % {'name': type(self).__name__}) - def _fit(self, quadruplets, weights=None): self._prepare_quadruplets(quadruplets, weights) step_sizes = np.logspace(-10, 0, 10) @@ -107,6 +94,8 @@ def _fit(self, quadruplets, weights=None): if self.verbose: print("Didn't converge after", it, "iterations. 
Final loss:", s_best) self.n_iter_ = it + + self.transformer_ = self.transformer_from_metric(self.M_) return self def _comparison_loss(self, metric): diff --git a/metric_learn/mlkr.py b/metric_learn/mlkr.py index 26f493ef..cebce9df 100644 --- a/metric_learn/mlkr.py +++ b/metric_learn/mlkr.py @@ -93,23 +93,6 @@ def fit(self, X, y): self.n_iter_ = res.nit return self - def transformer(self): - return self.transformer_ - - @property - def metric_(self): - if hasattr(self, 'transformer_'): - return self.transformer_.T.dot(self.transformer_) # in this case the - # estimator is fitted - elif self.A0 is not None: - return check_array(self.A0.T.dot(self.A0)) - else: # extracted from scikit-learn's check_is_fitted function - msg = ("This %(name)s instance is not fitted yet, and is neither " - "initialized with an explicit matrix. Call 'fit' with appropriate" - " arguments before using this method, or initialize the metric_ " - "with ``A0`` equals a matrix, not None.") - raise NotFittedError(msg % {'name': type(self).__name__}) - def _loss(flatA, X, y, dX): A = flatA.reshape((-1, X.shape[1])) diff --git a/metric_learn/mmc.py b/metric_learn/mmc.py index 98414fb2..19562282 100644 --- a/metric_learn/mmc.py +++ b/metric_learn/mmc.py @@ -215,6 +215,8 @@ def _fit_full(self, pairs, y): print('mmc converged at iter %d, conv = %f' % (cycle, delta)) self.A_[:] = A_old self.n_iter_ = cycle + + self.transformer_ = self.transformer_from_metric(self.A_) return self def _fit_diag(self, pairs, y): @@ -273,6 +275,8 @@ def _fit_diag(self, pairs, y): it += 1 self.A_ = np.diag(w) + + self.transformer_ = self.transformer_from_metric(self.A_) return self def _fD(self, neg_pairs, A): @@ -352,20 +356,7 @@ def _D_constraint(self, neg_pairs, w): sum_deri2 / sum_dist - np.outer(sum_deri1, sum_deri1) / (sum_dist * sum_dist) ) - @property - def metric_(self): - if hasattr(self, 'A_'): - return self.A_ # in this case the estimator is fitted - elif self.A0 is not None: - return check_array(self.A0) - 
else: # extracted from scikit-learn's check_is_fitted function - msg = ("This %(name)s instance is not fitted yet, and is neither " - "initialized with an explicit matrix. Call 'fit' with appropriate" - " arguments before using this method, or initialize the metric_ " - "with ``A0`` equals a matrix, not None.") - raise NotFittedError(msg % {'name': type(self).__name__}) - - def transformer(self): + def transformer_from_metric(self, metric): """Computes the transformation matrix from the Mahalanobis matrix. L = V.T * w^(-1/2), with A = V*w*V.T being the eigenvector decomposition of A with the eigenvalues in the diagonal matrix w and the columns of V being the eigenvectors. @@ -378,9 +369,9 @@ def transformer(self): L : (d x d) matrix """ if self.diagonal: - return np.sqrt(self.A_) + return np.sqrt(metric) else: - w, V = np.linalg.eigh(self.A_) + w, V = np.linalg.eigh(metric) return V.T * np.sqrt(np.maximum(0, w[:,None])) diff --git a/metric_learn/nca.py b/metric_learn/nca.py index 61c0d1dc..70c40fb9 100644 --- a/metric_learn/nca.py +++ b/metric_learn/nca.py @@ -21,14 +21,6 @@ def __init__(self, num_dims=None, max_iter=100, learning_rate=0.01): self.max_iter = max_iter self.learning_rate = learning_rate - def transformer(self): - return self.A_ - - @property - def metric_(self): - check_is_fitted(self, 'A_') - return self.A_.T.dot(self.A_) - def fit(self, X, y): """ X: data matrix, (n x d) @@ -61,6 +53,6 @@ def fit(self, X, y): A += self.learning_rate * A.dot(d) self.X_ = X - self.A_ = A + self.transformer_ = A self.n_iter_ = it return self diff --git a/metric_learn/rca.py b/metric_learn/rca.py index 77d18909..a90a53dd 100644 --- a/metric_learn/rca.py +++ b/metric_learn/rca.py @@ -57,14 +57,6 @@ def __init__(self, num_dims=None, pca_comps=None): self.num_dims = num_dims self.pca_comps = pca_comps - def transformer(self): - return self.transformer_ - - @property - def metric_(self): - check_is_fitted(self, 'transformer_') - return 
self.transformer_.T.dot(self.transformer_) - def _process_data(self, X): self.X_ = X = check_array(X) diff --git a/metric_learn/sdml.py b/metric_learn/sdml.py index 077aee31..f72bf2e7 100644 --- a/metric_learn/sdml.py +++ b/metric_learn/sdml.py @@ -56,11 +56,6 @@ def _prepare_pairs(self, pairs, y): diff = pairs[:, 0] - pairs[:, 1] return (diff.T * y).dot(diff) - @property - def metric_(self): - check_is_fitted(self, 'M_') - return self.M_ - def _fit(self, pairs, y): loss_matrix = self._prepare_pairs(pairs, y) P = self.M_ + self.balance_param * loss_matrix @@ -68,6 +63,8 @@ def _fit(self, pairs, y): # hack: ensure positive semidefinite emp_cov = emp_cov.T.dot(emp_cov) _, self.M_ = graph_lasso(emp_cov, self.sparsity_param, verbose=self.verbose) + + self.transformer_ = self.transformer_from_metric(self.M_) return self diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py index 9bcf5c1a..2f552902 100644 --- a/test/metric_learn_test.py +++ b/test/metric_learn_test.py @@ -97,7 +97,7 @@ def test_iris(self): [+0.2532, 0.5835, -0.8461, -0.8915], [-0.729, -0.6386, 1.767, 1.832], [-0.9405, -0.8461, 2.281, 2.794]] - assert_array_almost_equal(expected, nca.transformer(), decimal=3) + assert_array_almost_equal(expected, nca.transformer_, decimal=3) # With dimension reduction nca = NCA(max_iter=(100000//n), learning_rate=0.01, num_dims=2) @@ -114,8 +114,8 @@ def test_iris(self): self.assertLess(csep, 0.15) # Sanity checks for learned matrices. 
- self.assertEqual(lfda.metric_.shape, (4, 4)) - self.assertEqual(lfda.transformer().shape, (2, 4)) + self.assertEqual(lfda.metric().shape, (4, 4)) + self.assertEqual(lfda.transformer_.shape, (2, 4)) class TestRCA(MetricTestCase): @@ -166,13 +166,13 @@ def test_iris(self): [+0.00083371, +0.00149466, -0.00200719, -0.00296284], [-0.00111959, -0.00200719, +0.00269546, +0.00397881], [-0.00165265, -0.00296284, +0.00397881, +0.00587320]] - assert_array_almost_equal(expected, mmc.metric_, decimal=6) + assert_array_almost_equal(expected, mmc.metric(), decimal=6) # Diagonal metric mmc = MMC(diagonal=True) mmc.fit(*wrap_pairs(self.iris_points, [a,b,c,d])) expected = [0, 0, 1.21045968, 1.22552608] - assert_array_almost_equal(np.diag(expected), mmc.metric_, decimal=6) + assert_array_almost_equal(np.diag(expected), mmc.metric(), decimal=6) # Supervised Full mmc = MMC_Supervised() diff --git a/test/test_transformer_metric_conversion.py b/test/test_transformer_metric_conversion.py index 981009d9..3b8f9e0e 100644 --- a/test/test_transformer_metric_conversion.py +++ b/test/test_transformer_metric_conversion.py @@ -19,61 +19,61 @@ def setUpClass(self): def test_cov(self): cov = Covariance() cov.fit(self.X) - L = cov.transformer() - assert_array_almost_equal(L.T.dot(L), cov.metric_) + L = cov.transformer_ + assert_array_almost_equal(L.T.dot(L), cov.metric()) def test_lsml_supervised(self): seed = np.random.RandomState(1234) lsml = LSML_Supervised(num_constraints=200) lsml.fit(self.X, self.y, random_state=seed) - L = lsml.transformer() - assert_array_almost_equal(L.T.dot(L), lsml.metric_) + L = lsml.transformer_ + assert_array_almost_equal(L.T.dot(L), lsml.metric()) def test_itml_supervised(self): seed = np.random.RandomState(1234) itml = ITML_Supervised(num_constraints=200) itml.fit(self.X, self.y, random_state=seed) - L = itml.transformer() - assert_array_almost_equal(L.T.dot(L), itml.metric_) + L = itml.transformer_ + assert_array_almost_equal(L.T.dot(L), itml.metric()) def 
test_lmnn(self): lmnn = LMNN(k=5, learn_rate=1e-6, verbose=False) lmnn.fit(self.X, self.y) - L = lmnn.transformer() - assert_array_almost_equal(L.T.dot(L), lmnn.metric_) + L = lmnn.transformer_ + assert_array_almost_equal(L.T.dot(L), lmnn.metric()) def test_sdml_supervised(self): seed = np.random.RandomState(1234) sdml = SDML_Supervised(num_constraints=1500) sdml.fit(self.X, self.y, random_state=seed) - L = sdml.transformer() - assert_array_almost_equal(L.T.dot(L), sdml.metric_) + L = sdml.transformer_ + assert_array_almost_equal(L.T.dot(L), sdml.metric()) def test_nca(self): n = self.X.shape[0] nca = NCA(max_iter=(100000//n), learning_rate=0.01) nca.fit(self.X, self.y) - L = nca.transformer() - assert_array_almost_equal(L.T.dot(L), nca.metric_) + L = nca.transformer_ + assert_array_almost_equal(L.T.dot(L), nca.metric()) def test_lfda(self): lfda = LFDA(k=2, num_dims=2) lfda.fit(self.X, self.y) - L = lfda.transformer() - assert_array_almost_equal(L.T.dot(L), lfda.metric_) + L = lfda.transformer_ + assert_array_almost_equal(L.T.dot(L), lfda.metric()) def test_rca_supervised(self): seed = np.random.RandomState(1234) rca = RCA_Supervised(num_dims=2, num_chunks=30, chunk_size=2) rca.fit(self.X, self.y, random_state=seed) - L = rca.transformer() - assert_array_almost_equal(L.T.dot(L), rca.metric_) + L = rca.transformer_ + assert_array_almost_equal(L.T.dot(L), rca.metric()) def test_mlkr(self): mlkr = MLKR(num_dims=2) mlkr.fit(self.X, self.y) - L = mlkr.transformer() - assert_array_almost_equal(L.T.dot(L), mlkr.metric_) + L = mlkr.transformer_ + assert_array_almost_equal(L.T.dot(L), mlkr.metric()) if __name__ == '__main__': From 6eb65ac74cb252f098affc86d2d38481a8b64460 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Mon, 25 Jun 2018 17:37:37 +0200 Subject: [PATCH 010/120] ENH Add score_pairs function - Make MahalanobisMixin inherit from BaseMetricLearner to give a concrete implementation of score_pairs - Use score_pairs to compute more easily predict - Add 
docstring - TST: for every algorithm: - test that using score_pairs pairwise returns an euclidean distance matrix - test that score_pairs works for 3D arrays of several pairs as well as 2D arrays of one pair (and there returns only a scalar) - test that score_pairs always returns a finite output --- metric_learn/base_metric.py | 61 ++++++++++++---- metric_learn/covariance.py | 8 +-- metric_learn/itml.py | 8 +-- metric_learn/lfda.py | 8 +-- metric_learn/lmnn.py | 8 +-- metric_learn/lsml.py | 10 +-- metric_learn/mlkr.py | 10 ++- metric_learn/mmc.py | 9 ++- metric_learn/nca.py | 8 +-- metric_learn/rca.py | 8 +-- metric_learn/sdml.py | 10 ++- test/test_mahalanobis_mixin.py | 127 +++++++++++++++++++++++++++++++++ 12 files changed, 210 insertions(+), 65 deletions(-) create mode 100644 test/test_mahalanobis_mixin.py diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index 9e056304..e9bc1753 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -11,6 +11,22 @@ class BaseMetricLearner(BaseEstimator): def __init__(self): raise NotImplementedError('BaseMetricLearner should not be instantiated') + @abstractmethod + def score_pairs(self, pairs): + """Returns the score between pairs + (can be a similarity, or a distance/metric depending on the algorithm) + + Parameters + ---------- + pairs : `numpy.ndarray`, shape=(n_samples, [2,] n_features) + 3D array of pairs, or 2D array of one pair. + + Returns + ------- + scores: `numpy.ndarray` of shape=(n_pairs,) or scalar + The score of every pair. + """ + class MetricTransformer(TransformerMixin): @@ -35,7 +51,7 @@ def transform(self, X=None): return X.dot(L.T) -class MahalanobisMixin(six.with_metaclass(ABCMeta)): +class MahalanobisMixin(six.with_metaclass(ABCMeta, BaseMetricLearner)): """Mahalanobis metric learning algorithms. 
Algorithm that learns a Mahalanobis (pseudo) distance :math:`d_M(x, x')`, @@ -50,10 +66,35 @@ class MahalanobisMixin(six.with_metaclass(ABCMeta)): Attributes ---------- - transformer_: `np.ndarray`, shape=(n_features_out, n_features) + transformer_ : `np.ndarray`, shape=(n_features_out, n_features) The learned linear transformation ``L``. """ + def score_pairs(self, pairs): + """Returns the learned Mahalanobis distance between pairs. + + This distance is defined as: :math:`d_M(x, x') = \sqrt{(x-x')^T M (x-x')}` + where ``M`` is the learned Mahalanobis matrix, for every pair of points + ``x`` and ``x'``. This corresponds to the euclidean distance between + embeddings of the points in a new space, obtained through a linear + transformation. Indeed, we have also: :math:`d_M(x, x') = \sqrt{(x_e - + x_e')^T (x_e- x_e')}`, with :math:`x_e = L x` (See + :class:`MahalanobisMixin`). + + Parameters + ---------- + pairs : `numpy.ndarray`, shape=(n_samples, [2,] n_features) + 3D array of pairs, or 2D array of one pair. + + Returns + ------- + scores: `numpy.ndarray` of shape=(n_pairs,) or scalar + The learned Mahalanobis distance for every pair. + """ + pairwise_diffs = pairs[..., 1, :] - pairs[..., 0, :] + return np.sqrt(np.sum(pairwise_diffs.dot(self.metric()) * pairwise_diffs, + axis=-1)) + def metric(self): return self.transformer_.T.dot(self.transformer_) @@ -69,7 +110,7 @@ def transformer_from_metric(self, metric): return cholesky(metric).T -class _PairsClassifierMixin: +class _PairsClassifierMixin(BaseMetricLearner): def predict(self, pairs): """Predicts the learned metric between input pairs. @@ -87,9 +128,7 @@ def predict(self, pairs): y_predicted : `numpy.ndarray` of floats, shape=(n_constraints,) The predicted learned metric value between samples in every pair. 
""" - pairwise_diffs = pairs[:, 0, :] - pairs[:, 1, :] - return np.sqrt(np.sum(pairwise_diffs.dot(self.metric()) * pairwise_diffs, - axis=1)) + return self.score_pairs(pairs) def decision_function(self, pairs): return self.predict(pairs) @@ -120,7 +159,7 @@ def score(self, pairs, y): return roc_auc_score(y, self.decision_function(pairs)) -class _QuadrupletsClassifierMixin: +class _QuadrupletsClassifierMixin(BaseMetricLearner): def predict(self, quadruplets): """Predicts differences between sample distances in input quadruplets. @@ -138,12 +177,8 @@ def predict(self, quadruplets): prediction : `numpy.ndarray` of floats, shape=(n_constraints,) Metric differences. """ - similar_diffs = quadruplets[:, 0, :] - quadruplets[:, 1, :] - dissimilar_diffs = quadruplets[:, 2, :] - quadruplets[:, 3, :] - return (np.sqrt(np.sum(similar_diffs.dot(self.metric()) * - similar_diffs, axis=1)) - - np.sqrt(np.sum(dissimilar_diffs.dot(self.metric()) * - dissimilar_diffs, axis=1))) + return (self.score_pairs(quadruplets[:, 0:1, :]) - + self.score_pairs(quadruplets[:, 2:3, :])) def decision_function(self, quadruplets): return self.predict(quadruplets) diff --git a/metric_learn/covariance.py b/metric_learn/covariance.py index 0b6e862c..b474e35c 100644 --- a/metric_learn/covariance.py +++ b/metric_learn/covariance.py @@ -10,14 +10,12 @@ from __future__ import absolute_import import numpy as np -from sklearn.utils.validation import check_array, check_is_fitted +from sklearn.utils.validation import check_array -from .base_metric import (BaseMetricLearner, MahalanobisMixin, - MetricTransformer) +from .base_metric import MahalanobisMixin, MetricTransformer -class Covariance(BaseMetricLearner, MetricTransformer, - MahalanobisMixin): +class Covariance(MetricTransformer, MahalanobisMixin): def __init__(self): pass diff --git a/metric_learn/itml.py b/metric_learn/itml.py index a068f6f9..0c447b9d 100644 --- a/metric_learn/itml.py +++ b/metric_learn/itml.py @@ -18,15 +18,13 @@ from six.moves import 
xrange from sklearn.metrics import pairwise_distances from sklearn.utils.validation import check_array, check_X_y -from sklearn.exceptions import NotFittedError - -from .base_metric import (BaseMetricLearner, _PairsClassifierMixin, - MetricTransformer, MahalanobisMixin) +from .base_metric import (_PairsClassifierMixin, MetricTransformer, + MahalanobisMixin) from .constraints import Constraints, wrap_pairs from ._util import vector_norm -class _BaseITML(BaseMetricLearner, MahalanobisMixin): +class _BaseITML(MahalanobisMixin): """Information Theoretic Metric Learning (ITML)""" def __init__(self, gamma=1., max_iter=1000, convergence_threshold=1e-3, A0=None, verbose=False): diff --git a/metric_learn/lfda.py b/metric_learn/lfda.py index 8d3a1e22..9f6fbf9f 100644 --- a/metric_learn/lfda.py +++ b/metric_learn/lfda.py @@ -16,14 +16,12 @@ import warnings from six.moves import xrange from sklearn.metrics import pairwise_distances -from sklearn.utils.validation import check_X_y, check_is_fitted +from sklearn.utils.validation import check_X_y -from .base_metric import (BaseMetricLearner, MahalanobisMixin, - MetricTransformer) +from .base_metric import MahalanobisMixin, MetricTransformer -class LFDA(BaseMetricLearner, MahalanobisMixin, - MetricTransformer): +class LFDA(MahalanobisMixin, MetricTransformer): ''' Local Fisher Discriminant Analysis for Supervised Dimensionality Reduction Sugiyama, ICML 2006 diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py index 2f9985cf..70b04be8 100644 --- a/metric_learn/lmnn.py +++ b/metric_learn/lmnn.py @@ -14,16 +14,14 @@ import warnings from collections import Counter from six.moves import xrange -from sklearn.utils.validation import check_X_y, check_array, check_is_fitted +from sklearn.utils.validation import check_X_y, check_array from sklearn.metrics import euclidean_distances -from .base_metric import (BaseMetricLearner, MahalanobisMixin, - MetricTransformer) +from .base_metric import MahalanobisMixin, MetricTransformer # 
commonality between LMNN implementations -class _base_LMNN(BaseMetricLearner, MahalanobisMixin, - MetricTransformer): +class _base_LMNN(MahalanobisMixin, MetricTransformer): def __init__(self, k=3, min_iter=50, max_iter=1000, learn_rate=1e-7, regularization=0.5, convergence_tol=0.001, use_pca=True, verbose=False): diff --git a/metric_learn/lsml.py b/metric_learn/lsml.py index 27b1704f..7b749150 100644 --- a/metric_learn/lsml.py +++ b/metric_learn/lsml.py @@ -11,15 +11,15 @@ import numpy as np import scipy.linalg from six.moves import xrange -from sklearn.exceptions import NotFittedError + from sklearn.utils.validation import check_array, check_X_y -from .base_metric import (BaseMetricLearner, _QuadrupletsClassifierMixin, - MetricTransformer, MahalanobisMixin) -from .constraints import Constraints, wrap_pairs +from .base_metric import (_QuadrupletsClassifierMixin, MetricTransformer, + MahalanobisMixin) +from .constraints import Constraints -class _BaseLSML(BaseMetricLearner, MahalanobisMixin): +class _BaseLSML(MahalanobisMixin): def __init__(self, tol=1e-3, max_iter=1000, prior=None, verbose=False): """Initialize LSML. 
diff --git a/metric_learn/mlkr.py b/metric_learn/mlkr.py index cebce9df..bfd3e47d 100644 --- a/metric_learn/mlkr.py +++ b/metric_learn/mlkr.py @@ -11,17 +11,15 @@ from scipy.optimize import minimize from scipy.spatial.distance import pdist, squareform from sklearn.decomposition import PCA -from sklearn.exceptions import NotFittedError -from sklearn.utils.validation import check_X_y, check_array -from .base_metric import (BaseMetricLearner, MahalanobisMixin, - MetricTransformer) +from sklearn.utils.validation import check_X_y + +from .base_metric import MahalanobisMixin, MetricTransformer EPS = np.finfo(float).eps -class MLKR(BaseMetricLearner, MahalanobisMixin, - MetricTransformer): +class MLKR(MahalanobisMixin, MetricTransformer): """Metric Learning for Kernel Regression (MLKR)""" def __init__(self, num_dims=None, A0=None, epsilon=0.01, alpha=0.0001, max_iter=1000): diff --git a/metric_learn/mmc.py b/metric_learn/mmc.py index 19562282..75de3d70 100644 --- a/metric_learn/mmc.py +++ b/metric_learn/mmc.py @@ -19,17 +19,16 @@ from __future__ import print_function, absolute_import, division import numpy as np from six.moves import xrange -from sklearn.exceptions import NotFittedError -from sklearn.metrics import pairwise_distances + from sklearn.utils.validation import check_array, check_X_y -from .base_metric import (BaseMetricLearner, _PairsClassifierMixin, - MahalanobisMixin, MetricTransformer) +from .base_metric import (_PairsClassifierMixin, MahalanobisMixin, + MetricTransformer) from .constraints import Constraints, wrap_pairs from ._util import vector_norm -class _BaseMMC(BaseMetricLearner, MahalanobisMixin): +class _BaseMMC(MahalanobisMixin): """Mahalanobis Metric for Clustering (MMC)""" def __init__(self, max_iter=100, max_proj=10000, convergence_threshold=1e-3, A0=None, diagonal=False, diagonal_c=1.0, verbose=False): diff --git a/metric_learn/nca.py b/metric_learn/nca.py index 70c40fb9..72d4fcb7 100644 --- a/metric_learn/nca.py +++ b/metric_learn/nca.py @@ 
-6,16 +6,14 @@ from __future__ import absolute_import import numpy as np from six.moves import xrange -from sklearn.utils.validation import check_X_y, check_is_fitted +from sklearn.utils.validation import check_X_y -from .base_metric import (BaseMetricLearner, MahalanobisMixin, - MetricTransformer) +from .base_metric import MahalanobisMixin, MetricTransformer EPS = np.finfo(float).eps -class NCA(BaseMetricLearner, MahalanobisMixin, - MetricTransformer): +class NCA(MahalanobisMixin, MetricTransformer): def __init__(self, num_dims=None, max_iter=100, learning_rate=0.01): self.num_dims = num_dims self.max_iter = max_iter diff --git a/metric_learn/rca.py b/metric_learn/rca.py index a90a53dd..dedb8ded 100644 --- a/metric_learn/rca.py +++ b/metric_learn/rca.py @@ -16,10 +16,9 @@ import warnings from six.moves import xrange from sklearn import decomposition -from sklearn.utils.validation import check_array, check_is_fitted +from sklearn.utils.validation import check_array -from .base_metric import (BaseMetricLearner, MahalanobisMixin, - MetricTransformer) +from .base_metric import MahalanobisMixin, MetricTransformer from .constraints import Constraints @@ -36,8 +35,7 @@ def _chunk_mean_centering(data, chunks): return chunk_mask, chunk_data -class RCA(BaseMetricLearner, MahalanobisMixin, - MetricTransformer): +class RCA(MahalanobisMixin, MetricTransformer): """Relevant Components Analysis (RCA)""" def __init__(self, num_dims=None, pca_comps=None): """Initialize the learner. 
diff --git a/metric_learn/sdml.py b/metric_learn/sdml.py index f72bf2e7..254f1d60 100644 --- a/metric_learn/sdml.py +++ b/metric_learn/sdml.py @@ -10,18 +10,16 @@ from __future__ import absolute_import import numpy as np -from scipy.sparse.csgraph import laplacian from sklearn.covariance import graph_lasso from sklearn.utils.extmath import pinvh -from sklearn.utils.validation import (check_array, check_X_y, - check_is_fitted) +from sklearn.utils.validation import check_array, check_X_y -from .base_metric import (BaseMetricLearner, MahalanobisMixin, - MetricTransformer, _PairsClassifierMixin) +from .base_metric import (MahalanobisMixin, MetricTransformer, + _PairsClassifierMixin) from .constraints import Constraints, wrap_pairs -class _BaseSDML(BaseMetricLearner, MahalanobisMixin): +class _BaseSDML(MahalanobisMixin): def __init__(self, balance_param=0.5, sparsity_param=0.01, use_cov=True, verbose=False): """ diff --git a/test/test_mahalanobis_mixin.py b/test/test_mahalanobis_mixin.py new file mode 100644 index 00000000..8dbeeabe --- /dev/null +++ b/test/test_mahalanobis_mixin.py @@ -0,0 +1,127 @@ +from itertools import product + +import pytest +import numpy as np +from sklearn import clone +from sklearn.datasets import load_iris +from sklearn.utils import check_random_state, shuffle + +from metric_learn import (Constraints, ITML, LSML, MMC, SDML, Covariance, LFDA, + LMNN, MLKR, NCA, RCA) +from metric_learn.constraints import wrap_pairs +from functools import partial + + +def build_data(): + RNG = check_random_state(0) + dataset = load_iris() + X, y = shuffle(dataset.data, dataset.target, random_state=RNG) + num_constraints = 20 + constraints = Constraints.random_subset(y) + pairs = constraints.positive_negative_pairs(num_constraints, + same_length=True, + random_state=RNG) + return X, pairs + + +def build_pairs(): + # test that you can do cross validation on tuples of points with + # a WeaklySupervisedMetricLearner + X, pairs = build_data() + pairs, y = 
wrap_pairs(X, pairs) + pairs, y = shuffle(pairs, y) + return (pairs, y) + + +def build_quadruplets(): + # test that you can do cross validation on a tuples of points with + # a WeaklySupervisedMetricLearner + X, pairs = build_data() + c = np.column_stack(pairs) + quadruplets = X[c] + quadruplets = shuffle(quadruplets) + return (quadruplets, None) + + +list_estimators = [(Covariance(), build_data), + (ITML(), build_pairs), + (LFDA(), partial(load_iris, return_X_y=True)), + (LMNN(), partial(load_iris, return_X_y=True)), + (LSML(), build_quadruplets), + (MLKR(), partial(load_iris, return_X_y=True)), + (MMC(), build_pairs), + (NCA(), partial(load_iris, return_X_y=True)), + (RCA(), partial(load_iris, return_X_y=True)), + (SDML(), build_pairs) + ] + +ids_estimators = ['covariance', + 'itml', + 'lfda', + 'lmnn', + 'lsml', + 'mlkr', + 'mmc', + 'nca', + 'rca', + 'sdml', + ] + + +@pytest.mark.parametrize('estimator, build_dataset', list_estimators, + ids=ids_estimators) +def test_score_matrix(estimator, build_dataset): + # Computing pairwise scores should return an euclidean distance matrix. 
+ inputs, labels = build_dataset() + X, _ = load_iris(return_X_y=True) + n_samples = 20 + X = X[:n_samples] + model = clone(estimator) + model.fit(inputs, labels) + + pairwise = model.score_pairs(np.array(list(product(X, X))))\ + .reshape(n_samples, n_samples) + + check_is_distance_matrix(pairwise) + + # a necessary condition for euclidean distances matrix: (see + # https://en.wikipedia.org/wiki/Euclidean_distance_matrix) + assert np.linalg.matrix_rank(pairwise**2) <= min(X.shape) + 2 + + +@pytest.mark.parametrize('estimator, build_dataset', list_estimators, + ids=ids_estimators) +def test_score_finite(estimator, build_dataset): + # tests that the score is finite + inputs, labels = build_dataset() + model = clone(estimator) + model.fit(inputs, labels) + X, _ = load_iris(return_X_y=True) + pairs = np.array(list(product(X, X))) + assert np.isfinite(model.score_pairs(pairs)).all() + + +@pytest.mark.parametrize('estimator, build_dataset', list_estimators, + ids=ids_estimators) +def tests_score_dim(estimator, build_dataset): + # scoring of 3D arrays should return 1D array (several pairs), + # and scoring of 2D arrays (one pair) should return a scalar (0D array). 
+ inputs, labels = build_dataset() + model = clone(estimator) + model.fit(inputs, labels) + X, _ = load_iris(return_X_y=True) + pairs = np.array(list(product(X, X))) + assert model.score_pairs(pairs).shape == (pairs.shape[0],) + assert np.isscalar(model.score_pairs(pairs[1])) + + +def check_is_distance_matrix(pairwise): + assert (pairwise >= 0).all() # positivity + assert (pairwise == pairwise.T).all() # symmetry + assert (pairwise.diagonal() == 0).all() # identity + # triangular inequality + for i in range(pairwise.shape[1]): + for j in range(pairwise.shape[1]): + for k in range(pairwise.shape[1]): + assert (pairwise[i, j] - (pairwise[i, k] + pairwise[k, j]) <= 0 + + 1e-3).all() From 35ece36e18661200acc6dc33b31e7c9d3f8d5245 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 26 Jun 2018 17:30:07 +0200 Subject: [PATCH 011/120] TST add test on toy example for score_pairs --- test/test_mahalanobis_mixin.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/test/test_mahalanobis_mixin.py b/test/test_mahalanobis_mixin.py index 8dbeeabe..2e5d5fbd 100644 --- a/test/test_mahalanobis_mixin.py +++ b/test/test_mahalanobis_mixin.py @@ -70,7 +70,7 @@ def build_quadruplets(): @pytest.mark.parametrize('estimator, build_dataset', list_estimators, ids=ids_estimators) -def test_score_matrix(estimator, build_dataset): +def test_score_pairwise(estimator, build_dataset): # Computing pairwise scores should return an euclidean distance matrix. 
inputs, labels = build_dataset() X, _ = load_iris(return_X_y=True) @@ -89,6 +89,24 @@ def test_score_matrix(estimator, build_dataset): assert np.linalg.matrix_rank(pairwise**2) <= min(X.shape) + 2 +@pytest.mark.parametrize('estimator, build_dataset', list_estimators, + ids=ids_estimators) +def test_score_toy_example(estimator, build_dataset): + # Checks that score_pairs works on a toy example + inputs, labels = build_dataset() + X, _ = load_iris(return_X_y=True) + n_samples = 20 + X = X[:n_samples] + model = clone(estimator) + model.fit(inputs, labels) + pairs = np.stack([X[:10], X[10:20]], axis=1) + embedded_pairs = pairs.dot(model.transformer_.T) + distances = np.sqrt(np.sum((embedded_pairs[:, 1] - + embedded_pairs[:, 0])**2, + axis=1)) + np.array_equal(model.score_pairs(pairs), distances) + + @pytest.mark.parametrize('estimator, build_dataset', list_estimators, ids=ids_estimators) def test_score_finite(estimator, build_dataset): @@ -117,7 +135,7 @@ def tests_score_dim(estimator, build_dataset): def check_is_distance_matrix(pairwise): assert (pairwise >= 0).all() # positivity - assert (pairwise == pairwise.T).all() # symmetry + assert np.array_equal(pairwise, pairwise.T) # symmetry assert (pairwise.diagonal() == 0).all() # identity # triangular inequality for i in range(pairwise.shape[1]): From dca6838ac7961733a288de9d7777438bb64069ee Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Wed, 27 Jun 2018 09:07:59 +0200 Subject: [PATCH 012/120] ENH Add embed function - add the function and docstring - use it for score_pairs - TST : - should be finite - have right output dimension - embedding should be linear - should work on a toy example --- metric_learn/base_metric.py | 26 ++++++++++++-- test/test_mahalanobis_mixin.py | 62 ++++++++++++++++++++++++++++++++-- 2 files changed, 83 insertions(+), 5 deletions(-) diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index e9bc1753..a099bca8 100644 --- a/metric_learn/base_metric.py +++ 
b/metric_learn/base_metric.py @@ -91,9 +91,29 @@ def score_pairs(self, pairs): scores: `numpy.ndarray` of shape=(n_pairs,) or scalar The learned Mahalanobis distance for every pair. """ - pairwise_diffs = pairs[..., 1, :] - pairs[..., 0, :] - return np.sqrt(np.sum(pairwise_diffs.dot(self.metric()) * pairwise_diffs, - axis=-1)) + pairwise_diffs = self.embed(pairs[..., 1, :] - pairs[..., 0, :]) # (for + # MahalanobisMixin, the embedding is linear so we can just embed the + # difference) + return np.sqrt(np.sum(pairwise_diffs**2, axis=-1)) + + def embed(self, X): + """Embeds data points in the learned linear embedding space. + + Transforms samples in ``X`` into ``X_embedded``, samples inside a new + embedding space such that: ``X_embedded = X.dot(L.T)``, where ``L`` is + the learned linear transformation (See :class:`MahalanobisMixin`). + + Parameters + ---------- + X : `numpy.ndarray`, shape=(n_samples, n_features) + The data points to embed. + + Returns + ------- + X_embedded : `numpy.ndarray`, shape=(n_samples, n_features_out) + The embedded data points. 
+ """ + return X.dot(self.transformer_.T) def metric(self): return self.transformer_.T.dot(self.transformer_) diff --git a/test/test_mahalanobis_mixin.py b/test/test_mahalanobis_mixin.py index 2e5d5fbd..a24eed61 100644 --- a/test/test_mahalanobis_mixin.py +++ b/test/test_mahalanobis_mixin.py @@ -2,6 +2,7 @@ import pytest import numpy as np +from numpy.testing import assert_array_almost_equal from sklearn import clone from sklearn.datasets import load_iris from sklearn.utils import check_random_state, shuffle @@ -103,8 +104,8 @@ def test_score_toy_example(estimator, build_dataset): embedded_pairs = pairs.dot(model.transformer_.T) distances = np.sqrt(np.sum((embedded_pairs[:, 1] - embedded_pairs[:, 0])**2, - axis=1)) - np.array_equal(model.score_pairs(pairs), distances) + axis=-1)) + assert_array_almost_equal(model.score_pairs(pairs), distances) @pytest.mark.parametrize('estimator, build_dataset', list_estimators, @@ -143,3 +144,60 @@ def check_is_distance_matrix(pairwise): for k in range(pairwise.shape[1]): assert (pairwise[i, j] - (pairwise[i, k] + pairwise[k, j]) <= 0 + 1e-3).all() + + +@pytest.mark.parametrize('estimator, build_dataset', list_estimators, + ids=ids_estimators) +def test_embed_toy_example(estimator, build_dataset): + # Checks that embed works on a toy example + inputs, labels = build_dataset() + X, _ = load_iris(return_X_y=True) + n_samples = 20 + X = X[:n_samples] + model = clone(estimator) + model.fit(inputs, labels) + embedded_points = X.dot(model.transformer_.T) + assert_array_almost_equal(model.embed(X), embedded_points) + + +@pytest.mark.parametrize('estimator, build_dataset', list_estimators, + ids=ids_estimators) +def tests_embed_dim(estimator, build_dataset): + # Checks that the the dimension of the output space is as expected + inputs, labels = build_dataset() + model = clone(estimator) + model.fit(inputs, labels) + X, _ = load_iris(return_X_y=True) + assert model.embed(X).shape == X.shape + assert model.embed(X[0, :]).shape == 
(len(X[0]),) + # we test that the shape is also OK when doing dimensionality reduction + if type(model).__name__ in {'LFDA', 'MLKR', 'NCA', 'RCA'}: + model.set_params(num_dims=2) + model.fit(inputs, labels) + assert model.embed(X).shape == (X.shape[0], 2) + assert model.embed(X[0, :]).shape == (2,) + + +@pytest.mark.parametrize('estimator, build_dataset', list_estimators, + ids=ids_estimators) +def test_embed_finite(estimator, build_dataset): + # Checks that embed returns vectors with finite values + inputs, labels = build_dataset() + model = clone(estimator) + model.fit(inputs, labels) + X, _ = load_iris(return_X_y=True) + assert np.isfinite(model.embed(X)).all() + + +@pytest.mark.parametrize('estimator, build_dataset', list_estimators, + ids=ids_estimators) +def test_embed_is_linear(estimator, build_dataset): + # Checks that the embedding is linear + inputs, labels = build_dataset() + model = clone(estimator) + model.fit(inputs, labels) + X, _ = load_iris(return_X_y=True) + assert_array_almost_equal(model.embed(X[:10] + X[10:20]), + model.embed(X[:10]) + model.embed(X[10:20])) + assert_array_almost_equal(model.embed(5 * X[:10]), + 5 * model.embed(X[:10])) From 3254ce3340f8826febac548220850f07375d82d3 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Wed, 27 Jun 2018 11:57:55 +0200 Subject: [PATCH 013/120] FIX fix error in slicing of quadruplets --- metric_learn/base_metric.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index a099bca8..ba6f8806 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -197,8 +197,8 @@ def predict(self, quadruplets): prediction : `numpy.ndarray` of floats, shape=(n_constraints,) Metric differences. 
""" - return (self.score_pairs(quadruplets[:, 0:1, :]) - - self.score_pairs(quadruplets[:, 2:3, :])) + return (self.score_pairs(quadruplets[..., :2, :]) - + self.score_pairs(quadruplets[..., 2:, :])) def decision_function(self, quadruplets): return self.predict(quadruplets) From e209b213609bd99723a307bc745b1938f8f14c26 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Wed, 27 Jun 2018 14:14:04 +0200 Subject: [PATCH 014/120] FIX minor corrections --- metric_learn/base_metric.py | 4 ++-- test/test_mahalanobis_mixin.py | 18 +++++++++--------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index ba6f8806..2b59babb 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -83,12 +83,12 @@ def score_pairs(self, pairs): Parameters ---------- - pairs : `numpy.ndarray`, shape=(n_samples, [2,] n_features) + pairs : `numpy.ndarray`, shape=(n_samples, 2, n_features) 3D array of pairs, or 2D array of one pair. Returns ------- - scores: `numpy.ndarray` of shape=(n_pairs,) or scalar + scores: `numpy.ndarray` of shape=(n_pairs,) The learned Mahalanobis distance for every pair. """ pairwise_diffs = self.embed(pairs[..., 1, :] - pairs[..., 0, :]) # (for diff --git a/test/test_mahalanobis_mixin.py b/test/test_mahalanobis_mixin.py index a24eed61..937494ce 100644 --- a/test/test_mahalanobis_mixin.py +++ b/test/test_mahalanobis_mixin.py @@ -71,7 +71,7 @@ def build_quadruplets(): @pytest.mark.parametrize('estimator, build_dataset', list_estimators, ids=ids_estimators) -def test_score_pairwise(estimator, build_dataset): +def test_score_pairs_pairwise(estimator, build_dataset): # Computing pairwise scores should return an euclidean distance matrix. 
inputs, labels = build_dataset() X, _ = load_iris(return_X_y=True) @@ -92,7 +92,7 @@ def test_score_pairwise(estimator, build_dataset): @pytest.mark.parametrize('estimator, build_dataset', list_estimators, ids=ids_estimators) -def test_score_toy_example(estimator, build_dataset): +def test_score_pairs_toy_example(estimator, build_dataset): # Checks that score_pairs works on a toy example inputs, labels = build_dataset() X, _ = load_iris(return_X_y=True) @@ -110,7 +110,7 @@ def test_score_toy_example(estimator, build_dataset): @pytest.mark.parametrize('estimator, build_dataset', list_estimators, ids=ids_estimators) -def test_score_finite(estimator, build_dataset): +def test_score_pairs_finite(estimator, build_dataset): # tests that the score is finite inputs, labels = build_dataset() model = clone(estimator) @@ -122,16 +122,16 @@ def test_score_finite(estimator, build_dataset): @pytest.mark.parametrize('estimator, build_dataset', list_estimators, ids=ids_estimators) -def tests_score_dim(estimator, build_dataset): - # scoring of 3D arrays should return 1D array (several pairs), - # and scoring of 2D arrays (one pair) should return a scalar (0D array). +def tests_score_pairs_dim(estimator, build_dataset): + # scoring of 3D arrays should return 1D array (several tuples), + # and scoring of 2D arrays (one tuple) should return a scalar (0D array). 
inputs, labels = build_dataset() model = clone(estimator) model.fit(inputs, labels) X, _ = load_iris(return_X_y=True) - pairs = np.array(list(product(X, X))) - assert model.score_pairs(pairs).shape == (pairs.shape[0],) - assert np.isscalar(model.score_pairs(pairs[1])) + tuples = np.array(list(product(X, X))) + assert model.score_pairs(tuples).shape == (tuples.shape[0],) + assert np.isscalar(model.score_pairs(tuples[1])) def check_is_distance_matrix(pairwise): From abea7de29112a68685203072b71cc0d5a6c6ebd3 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Wed, 27 Jun 2018 14:22:13 +0200 Subject: [PATCH 015/120] FIX minor corrections - remove unusual s to test functions - remove redundant parenthesis --- test/test_mahalanobis_mixin.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/test_mahalanobis_mixin.py b/test/test_mahalanobis_mixin.py index 937494ce..7b1969f9 100644 --- a/test/test_mahalanobis_mixin.py +++ b/test/test_mahalanobis_mixin.py @@ -31,7 +31,7 @@ def build_pairs(): X, pairs = build_data() pairs, y = wrap_pairs(X, pairs) pairs, y = shuffle(pairs, y) - return (pairs, y) + return pairs, y def build_quadruplets(): @@ -41,7 +41,7 @@ def build_quadruplets(): c = np.column_stack(pairs) quadruplets = X[c] quadruplets = shuffle(quadruplets) - return (quadruplets, None) + return quadruplets, None list_estimators = [(Covariance(), build_data), @@ -122,7 +122,7 @@ def test_score_pairs_finite(estimator, build_dataset): @pytest.mark.parametrize('estimator, build_dataset', list_estimators, ids=ids_estimators) -def tests_score_pairs_dim(estimator, build_dataset): +def test_score_pairs_dim(estimator, build_dataset): # scoring of 3D arrays should return 1D array (several tuples), # and scoring of 2D arrays (one tuple) should return a scalar (0D array). 
inputs, labels = build_dataset() @@ -162,7 +162,7 @@ def test_embed_toy_example(estimator, build_dataset): @pytest.mark.parametrize('estimator, build_dataset', list_estimators, ids=ids_estimators) -def tests_embed_dim(estimator, build_dataset): +def test_embed_dim(estimator, build_dataset): # Checks that the the dimension of the output space is as expected inputs, labels = build_dataset() model = clone(estimator) From 65e794a11b59c5f776480beff3b61894ae8f98ad Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Wed, 27 Jun 2018 14:25:14 +0200 Subject: [PATCH 016/120] FIX fix PEP8 errors --- metric_learn/mmc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metric_learn/mmc.py b/metric_learn/mmc.py index 75de3d70..7ed55f50 100644 --- a/metric_learn/mmc.py +++ b/metric_learn/mmc.py @@ -371,7 +371,7 @@ def transformer_from_metric(self, metric): return np.sqrt(metric) else: w, V = np.linalg.eigh(metric) - return V.T * np.sqrt(np.maximum(0, w[:,None])) + return V.T * np.sqrt(np.maximum(0, w[:, None])) class MMC(_BaseMMC, _PairsClassifierMixin): From 12b54292b7076662982b28e2fd6e737c2277544b Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Wed, 27 Jun 2018 14:28:30 +0200 Subject: [PATCH 017/120] FIX remove possible one-sample scoring from docstring for now --- metric_learn/base_metric.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index 2b59babb..fa4754ad 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -18,12 +18,12 @@ def score_pairs(self, pairs): Parameters ---------- - pairs : `numpy.ndarray`, shape=(n_samples, [2,] n_features) - 3D array of pairs, or 2D array of one pair. + pairs : `numpy.ndarray`, shape=(n_samples, 2, n_features) + 3D array of pairs. Returns ------- - scores: `numpy.ndarray` of shape=(n_pairs,) or scalar + scores: `numpy.ndarray` of shape=(n_pairs,) The score of every pair. 
""" From eff278e007dded4a6f5ee2ead3ae92bb8c720eb6 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Wed, 27 Jun 2018 14:31:33 +0200 Subject: [PATCH 018/120] REF rename n_features_out to num_dims to be more coherent with current algorithms --- metric_learn/base_metric.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index fa4754ad..715eb378 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -66,7 +66,7 @@ class MahalanobisMixin(six.with_metaclass(ABCMeta, BaseMetricLearner)): Attributes ---------- - transformer_ : `np.ndarray`, shape=(n_features_out, n_features) + transformer_ : `np.ndarray`, shape=(num_dims, n_features) The learned linear transformation ``L``. """ @@ -110,7 +110,7 @@ def embed(self, X): Returns ------- - X_embedded : `numpy.ndarray`, shape=(n_samples, n_features_out) + X_embedded : `numpy.ndarray`, shape=(n_samples, num_dims) The embedded data points. """ return X.dot(self.transformer_.T) From 810d191d9ca11744c5c9263a684cad1206f6025e Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 24 Jul 2018 10:11:26 +0200 Subject: [PATCH 019/120] MAINT: Address https://github.com/metric-learn/metric-learn/pull/96#pullrequestreview-133939700 - replace embed by transform and add always the input X in calling the function - mutualize _transformer_from_metric not to be overwritten in MMC - improve test_mahalanobis_mixin.test_score_pairs_pairwise according to https://github.com/metric-learn/metric-learn/pull/96#discussion_r199762780 - improve test_mahalanobis_mixin.check_is_distance_matrix - correct typos and nitpicks --- examples/sandwich.py | 2 +- metric_learn/base_metric.py | 72 +++++++++++++++++++++++----------- metric_learn/covariance.py | 7 ++-- metric_learn/itml.py | 10 ++--- metric_learn/lfda.py | 6 +-- metric_learn/lmnn.py | 10 ++--- metric_learn/lsml.py | 9 ++--- metric_learn/mlkr.py | 5 ++- metric_learn/mmc.py | 29 +++----------- 
metric_learn/nca.py | 5 ++- metric_learn/rca.py | 5 ++- metric_learn/sdml.py | 8 ++-- test/metric_learn_test.py | 17 ++++---- test/test_fit_transform.py | 12 +++--- test/test_mahalanobis_mixin.py | 36 ++++++++++------- 15 files changed, 125 insertions(+), 108 deletions(-) diff --git a/examples/sandwich.py b/examples/sandwich.py index 34b48a00..08ec17c5 100644 --- a/examples/sandwich.py +++ b/examples/sandwich.py @@ -30,7 +30,7 @@ def sandwich_demo(): for ax_num, ml in enumerate(mls, start=3): ml.fit(x, y) - tx = ml.transform() + tx = ml.transform(x) ml_knn = nearest_neighbors(tx, k=2) ax = plt.subplot(3, 2, ax_num) plot_sandwich_data(tx, y, axis=ax) diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index 715eb378..ab95b066 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -1,5 +1,5 @@ from numpy.linalg import cholesky -from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.base import BaseEstimator from sklearn.utils.validation import check_array from sklearn.metrics import roc_auc_score import numpy as np @@ -28,9 +28,9 @@ def score_pairs(self, pairs): """ -class MetricTransformer(TransformerMixin): +class MetricTransformer(): - def transform(self, X=None): + def transform(self, X): """Applies the metric transformation. Parameters @@ -43,15 +43,10 @@ def transform(self, X=None): transformed : (n x d) matrix Input data transformed to the metric space by :math:`XL^{\\top}` """ - if X is None: - X = self.X_ - else: - X = check_array(X, accept_sparse=True) - L = self.transformer_ - return X.dot(L.T) -class MahalanobisMixin(six.with_metaclass(ABCMeta, BaseMetricLearner)): +class MahalanobisMixin(six.with_metaclass(ABCMeta, BaseMetricLearner, + MetricTransformer)): """Mahalanobis metric learning algorithms. 
Algorithm that learns a Mahalanobis (pseudo) distance :math:`d_M(x, x')`, @@ -91,12 +86,12 @@ def score_pairs(self, pairs): scores: `numpy.ndarray` of shape=(n_pairs,) The learned Mahalanobis distance for every pair. """ - pairwise_diffs = self.embed(pairs[..., 1, :] - pairs[..., 0, :]) # (for - # MahalanobisMixin, the embedding is linear so we can just embed the + pairwise_diffs = self.transform(pairs[..., 1, :] - pairs[..., 0, :]) + # (for MahalanobisMixin, the embedding is linear so we can just embed the # difference) return np.sqrt(np.sum(pairwise_diffs**2, axis=-1)) - def embed(self, X): + def transform(self, X): """Embeds data points in the learned linear embedding space. Transforms samples in ``X`` into ``X_embedded``, samples inside a new @@ -113,21 +108,37 @@ def embed(self, X): X_embedded : `numpy.ndarray`, shape=(n_samples, num_dims) The embedded data points. """ - return X.dot(self.transformer_.T) + X_checked = check_array(X, accept_sparse=True, ensure_2d=False) + return X_checked.dot(self.transformer_.T) def metric(self): return self.transformer_.T.dot(self.transformer_) - def transformer_from_metric(self, metric): + def _transformer_from_metric(self, metric): """Computes the transformation matrix from the Mahalanobis matrix. - L = cholesky(M).T + Since by definition the metric `M` is positive semi-definite (PSD), it + admits a Cholesky decomposition: L = cholesky(M).T. However, currently the + computation of the Cholesky decomposition used does not support + non-definite matrices. If the metric is not definite, this method will + return L = w^(1/2) V.T, with M = V*w*V.T being the eigenvector + decomposition of M with the eigenvalues in the diagonal matrix w and the + columns of V being the eigenvectors. If M is diagonal, this method will + just return its elementwise square root (since the diagonalization of + the matrix is itself). 
Returns ------- - L : upper triangular (d x d) matrix + L : (d x d) matrix """ - return cholesky(metric).T + + if np.allclose(metric, np.diag(np.diag(metric))): + return np.sqrt(metric) + elif not np.isclose(np.linalg.det(metric), 0): + return cholesky(metric).T + else: + w, V = np.linalg.eigh(metric) + return V.T * np.sqrt(np.maximum(0, w[:, None])) class _PairsClassifierMixin(BaseMetricLearner): @@ -182,6 +193,24 @@ def score(self, pairs, y): class _QuadrupletsClassifierMixin(BaseMetricLearner): def predict(self, quadruplets): + """Predicts the ordering between sample distances in input quadruplets. + + For each quadruplet, returns 1 if the quadruplet is in the right order ( + first pair is more similar than second pair), and -1 if not. + + Parameters + ---------- + quadruplets : array-like, shape=(n_constraints, 4, n_features) + Input quadruplets. + + Returns + ------- + prediction : `numpy.ndarray` of floats, shape=(n_constraints,) + Predictions of the ordering of pairs, for each quadruplet. + """ + return np.sign(self.decision_function(quadruplets)) + + def decision_function(self, quadruplets): """Predicts differences between sample distances in input quadruplets. For each quadruplet of samples, computes the difference between the learned @@ -194,15 +223,12 @@ def predict(self, quadruplets): Returns ------- - prediction : `numpy.ndarray` of floats, shape=(n_constraints,) + decision_function : `numpy.ndarray` of floats, shape=(n_constraints,) Metric differences. """ return (self.score_pairs(quadruplets[..., :2, :]) - self.score_pairs(quadruplets[..., 2:, :])) - def decision_function(self, quadruplets): - return self.predict(quadruplets) - def score(self, quadruplets, y=None): """Computes score on input quadruplets @@ -222,4 +248,4 @@ def score(self, quadruplets, y=None): score : float The quadruplets score. 
""" - return - np.mean(np.sign(self.decision_function(quadruplets))) + return - np.mean(self.predict(quadruplets)) diff --git a/metric_learn/covariance.py b/metric_learn/covariance.py index b474e35c..eeee7d80 100644 --- a/metric_learn/covariance.py +++ b/metric_learn/covariance.py @@ -11,11 +11,12 @@ from __future__ import absolute_import import numpy as np from sklearn.utils.validation import check_array +from sklearn.base import TransformerMixin -from .base_metric import MahalanobisMixin, MetricTransformer +from .base_metric import MahalanobisMixin -class Covariance(MetricTransformer, MahalanobisMixin): +class Covariance(MahalanobisMixin, TransformerMixin): def __init__(self): pass @@ -31,5 +32,5 @@ def fit(self, X, y=None): else: self.M_ = np.linalg.inv(self.M_) - self.transformer_ = self.transformer_from_metric(check_array(self.M_)) + self.transformer_ = self._transformer_from_metric(check_array(self.M_)) return self diff --git a/metric_learn/itml.py b/metric_learn/itml.py index 0c447b9d..ab256346 100644 --- a/metric_learn/itml.py +++ b/metric_learn/itml.py @@ -18,8 +18,8 @@ from six.moves import xrange from sklearn.metrics import pairwise_distances from sklearn.utils.validation import check_array, check_X_y -from .base_metric import (_PairsClassifierMixin, MetricTransformer, - MahalanobisMixin) +from sklearn.base import TransformerMixin +from .base_metric import _PairsClassifierMixin, MahalanobisMixin from .constraints import Constraints, wrap_pairs from ._util import vector_norm @@ -53,7 +53,7 @@ def __init__(self, gamma=1., max_iter=1000, convergence_threshold=1e-3, def _process_pairs(self, pairs, y, bounds): pairs, y = check_X_y(pairs, y, accept_sparse=False, - ensure_2d=False, allow_nd=True) + ensure_2d=False, allow_nd=True) # check to make sure that no two constrained vectors are identical pos_pairs, neg_pairs = pairs[y == 1], pairs[y == -1] @@ -129,7 +129,7 @@ def _fit(self, pairs, y, bounds=None): print('itml converged at iter: %d, conv = %f' % (it, 
conv)) self.n_iter_ = it - self.transformer_ = self.transformer_from_metric(self.A_) + self.transformer_ = self._transformer_from_metric(self.A_) return self @@ -155,7 +155,7 @@ def fit(self, pairs, y, bounds=None): return self._fit(pairs, y, bounds=bounds) -class ITML_Supervised(_BaseITML, MetricTransformer): +class ITML_Supervised(_BaseITML, TransformerMixin): """Information Theoretic Metric Learning (ITML)""" def __init__(self, gamma=1., max_iter=1000, convergence_threshold=1e-3, num_labeled=np.inf, num_constraints=None, bounds=None, A0=None, diff --git a/metric_learn/lfda.py b/metric_learn/lfda.py index 9f6fbf9f..8c693712 100644 --- a/metric_learn/lfda.py +++ b/metric_learn/lfda.py @@ -17,11 +17,11 @@ from six.moves import xrange from sklearn.metrics import pairwise_distances from sklearn.utils.validation import check_X_y +from sklearn.base import TransformerMixin +from .base_metric import MahalanobisMixin -from .base_metric import MahalanobisMixin, MetricTransformer - -class LFDA(MahalanobisMixin, MetricTransformer): +class LFDA(MahalanobisMixin, TransformerMixin): ''' Local Fisher Discriminant Analysis for Supervised Dimensionality Reduction Sugiyama, ICML 2006 diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py index 70b04be8..0071ce50 100644 --- a/metric_learn/lmnn.py +++ b/metric_learn/lmnn.py @@ -16,12 +16,12 @@ from six.moves import xrange from sklearn.utils.validation import check_X_y, check_array from sklearn.metrics import euclidean_distances - -from .base_metric import MahalanobisMixin, MetricTransformer +from sklearn.base import TransformerMixin +from .base_metric import MahalanobisMixin # commonality between LMNN implementations -class _base_LMNN(MahalanobisMixin, MetricTransformer): +class _base_LMNN(MahalanobisMixin, TransformerMixin): def __init__(self, k=3, min_iter=50, max_iter=1000, learn_rate=1e-7, regularization=0.5, convergence_tol=0.001, use_pca=True, verbose=False): @@ -189,7 +189,7 @@ def _select_targets(self): return 
target_neighbors def _find_impostors(self, furthest_neighbors): - Lx = self.transform() + Lx = self.transform(self.X_) margin_radii = 1 + _inplace_paired_L2(Lx[furthest_neighbors], Lx) impostors = [] for label in self.labels_[:-1]: @@ -256,7 +256,7 @@ def fit(self, X, y): self._lmnn.train() else: self._lmnn.train(np.eye(X.shape[1])) - self.L_ = self._lmnn.get_linear_transform() + self.L_ = self._lmnn.get_linear_transform(X) return self except ImportError: diff --git a/metric_learn/lsml.py b/metric_learn/lsml.py index 7b749150..eb8171f9 100644 --- a/metric_learn/lsml.py +++ b/metric_learn/lsml.py @@ -11,11 +11,10 @@ import numpy as np import scipy.linalg from six.moves import xrange - +from sklearn.base import TransformerMixin from sklearn.utils.validation import check_array, check_X_y -from .base_metric import (_QuadrupletsClassifierMixin, MetricTransformer, - MahalanobisMixin) +from .base_metric import _QuadrupletsClassifierMixin, MahalanobisMixin from .constraints import Constraints @@ -95,7 +94,7 @@ def _fit(self, quadruplets, weights=None): print("Didn't converge after", it, "iterations. Final loss:", s_best) self.n_iter_ = it - self.transformer_ = self.transformer_from_metric(self.M_) + self.transformer_ = self._transformer_from_metric(self.M_) return self def _comparison_loss(self, metric): @@ -147,7 +146,7 @@ def fit(self, quadruplets, weights=None): return self._fit(quadruplets, weights=weights) -class LSML_Supervised(_BaseLSML, MetricTransformer): +class LSML_Supervised(_BaseLSML, TransformerMixin): def __init__(self, tol=1e-3, max_iter=1000, prior=None, num_labeled=np.inf, num_constraints=None, weights=None, verbose=False): """Initialize the learner. 
diff --git a/metric_learn/mlkr.py b/metric_learn/mlkr.py index bfd3e47d..45aacf84 100644 --- a/metric_learn/mlkr.py +++ b/metric_learn/mlkr.py @@ -10,16 +10,17 @@ import numpy as np from scipy.optimize import minimize from scipy.spatial.distance import pdist, squareform +from sklearn.base import TransformerMixin from sklearn.decomposition import PCA from sklearn.utils.validation import check_X_y -from .base_metric import MahalanobisMixin, MetricTransformer +from .base_metric import MahalanobisMixin EPS = np.finfo(float).eps -class MLKR(MahalanobisMixin, MetricTransformer): +class MLKR(MahalanobisMixin, TransformerMixin): """Metric Learning for Kernel Regression (MLKR)""" def __init__(self, num_dims=None, A0=None, epsilon=0.01, alpha=0.0001, max_iter=1000): diff --git a/metric_learn/mmc.py b/metric_learn/mmc.py index 7ed55f50..025bded5 100644 --- a/metric_learn/mmc.py +++ b/metric_learn/mmc.py @@ -19,11 +19,10 @@ from __future__ import print_function, absolute_import, division import numpy as np from six.moves import xrange - +from sklearn.base import TransformerMixin from sklearn.utils.validation import check_array, check_X_y -from .base_metric import (_PairsClassifierMixin, MahalanobisMixin, - MetricTransformer) +from .base_metric import _PairsClassifierMixin, MahalanobisMixin from .constraints import Constraints, wrap_pairs from ._util import vector_norm @@ -215,7 +214,7 @@ def _fit_full(self, pairs, y): self.A_[:] = A_old self.n_iter_ = cycle - self.transformer_ = self.transformer_from_metric(self.A_) + self.transformer_ = self._transformer_from_metric(self.A_) return self def _fit_diag(self, pairs, y): @@ -275,7 +274,7 @@ def _fit_diag(self, pairs, y): self.A_ = np.diag(w) - self.transformer_ = self.transformer_from_metric(self.A_) + self.transformer_ = self._transformer_from_metric(self.A_) return self def _fD(self, neg_pairs, A): @@ -355,24 +354,6 @@ def _D_constraint(self, neg_pairs, w): sum_deri2 / sum_dist - np.outer(sum_deri1, sum_deri1) / (sum_dist * 
sum_dist) ) - def transformer_from_metric(self, metric): - """Computes the transformation matrix from the Mahalanobis matrix. - L = V.T * w^(-1/2), with A = V*w*V.T being the eigenvector decomposition of A with - the eigenvalues in the diagonal matrix w and the columns of V being the eigenvectors. - - The Cholesky decomposition cannot be applied here, since MMC learns only a positive - *semi*-definite Mahalanobis matrix. - - Returns - ------- - L : (d x d) matrix - """ - if self.diagonal: - return np.sqrt(metric) - else: - w, V = np.linalg.eigh(metric) - return V.T * np.sqrt(np.maximum(0, w[:, None])) - class MMC(_BaseMMC, _PairsClassifierMixin): @@ -394,7 +375,7 @@ def fit(self, pairs, y): return self._fit(pairs, y) -class MMC_Supervised(_BaseMMC, MetricTransformer): +class MMC_Supervised(_BaseMMC, TransformerMixin): """Mahalanobis Metric for Clustering (MMC)""" def __init__(self, max_iter=100, max_proj=10000, convergence_threshold=1e-6, num_labeled=np.inf, num_constraints=None, diff --git a/metric_learn/nca.py b/metric_learn/nca.py index 72d4fcb7..4eadca6b 100644 --- a/metric_learn/nca.py +++ b/metric_learn/nca.py @@ -6,14 +6,15 @@ from __future__ import absolute_import import numpy as np from six.moves import xrange +from sklearn.base import TransformerMixin from sklearn.utils.validation import check_X_y -from .base_metric import MahalanobisMixin, MetricTransformer +from .base_metric import MahalanobisMixin EPS = np.finfo(float).eps -class NCA(MahalanobisMixin, MetricTransformer): +class NCA(MahalanobisMixin, TransformerMixin): def __init__(self, num_dims=None, max_iter=100, learning_rate=0.01): self.num_dims = num_dims self.max_iter = max_iter diff --git a/metric_learn/rca.py b/metric_learn/rca.py index dedb8ded..2fbf70a2 100644 --- a/metric_learn/rca.py +++ b/metric_learn/rca.py @@ -16,9 +16,10 @@ import warnings from six.moves import xrange from sklearn import decomposition +from sklearn.base import TransformerMixin from sklearn.utils.validation import 
check_array -from .base_metric import MahalanobisMixin, MetricTransformer +from .base_metric import MahalanobisMixin from .constraints import Constraints @@ -35,7 +36,7 @@ def _chunk_mean_centering(data, chunks): return chunk_mask, chunk_data -class RCA(MahalanobisMixin, MetricTransformer): +class RCA(MahalanobisMixin, TransformerMixin): """Relevant Components Analysis (RCA)""" def __init__(self, num_dims=None, pca_comps=None): """Initialize the learner. diff --git a/metric_learn/sdml.py b/metric_learn/sdml.py index 254f1d60..d1eae770 100644 --- a/metric_learn/sdml.py +++ b/metric_learn/sdml.py @@ -10,12 +10,12 @@ from __future__ import absolute_import import numpy as np +from sklearn.base import TransformerMixin from sklearn.covariance import graph_lasso from sklearn.utils.extmath import pinvh from sklearn.utils.validation import check_array, check_X_y -from .base_metric import (MahalanobisMixin, MetricTransformer, - _PairsClassifierMixin) +from .base_metric import MahalanobisMixin, _PairsClassifierMixin from .constraints import Constraints, wrap_pairs @@ -62,7 +62,7 @@ def _fit(self, pairs, y): emp_cov = emp_cov.T.dot(emp_cov) _, self.M_ = graph_lasso(emp_cov, self.sparsity_param, verbose=self.verbose) - self.transformer_ = self.transformer_from_metric(self.M_) + self.transformer_ = self._transformer_from_metric(self.M_) return self @@ -86,7 +86,7 @@ def fit(self, pairs, y): return self._fit(pairs, y) -class SDML_Supervised(_BaseSDML, MetricTransformer): +class SDML_Supervised(_BaseSDML, TransformerMixin): def __init__(self, balance_param=0.5, sparsity_param=0.01, use_cov=True, num_labeled=np.inf, num_constraints=None, verbose=False): """ diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py index 2f552902..1671c8ef 100644 --- a/test/metric_learn_test.py +++ b/test/metric_learn_test.py @@ -38,7 +38,7 @@ def test_iris(self): cov = Covariance() cov.fit(self.iris_points) - csep = class_separation(cov.transform(), self.iris_labels) + csep = 
class_separation(cov.transform(self.iris_points), self.iris_labels) # deterministic result self.assertAlmostEqual(csep, 0.73068122) @@ -68,7 +68,8 @@ def test_iris(self): lmnn = LMNN_cls(k=5, learn_rate=1e-6, verbose=False) lmnn.fit(self.iris_points, self.iris_labels) - csep = class_separation(lmnn.transform(), self.iris_labels) + csep = class_separation(lmnn.transform(self.iris_points), + self.iris_labels) self.assertLess(csep, 0.25) @@ -102,7 +103,7 @@ def test_iris(self): # With dimension reduction nca = NCA(max_iter=(100000//n), learning_rate=0.01, num_dims=2) nca.fit(self.iris_points, self.iris_labels) - csep = class_separation(nca.transform(), self.iris_labels) + csep = class_separation(nca.transform(self.iris_points), self.iris_labels) self.assertLess(csep, 0.15) @@ -110,7 +111,7 @@ class TestLFDA(MetricTestCase): def test_iris(self): lfda = LFDA(k=2, num_dims=2) lfda.fit(self.iris_points, self.iris_labels) - csep = class_separation(lfda.transform(), self.iris_labels) + csep = class_separation(lfda.transform(self.iris_points), self.iris_labels) self.assertLess(csep, 0.15) # Sanity checks for learned matrices. 
@@ -122,7 +123,7 @@ class TestRCA(MetricTestCase): def test_iris(self): rca = RCA_Supervised(num_dims=2, num_chunks=30, chunk_size=2) rca.fit(self.iris_points, self.iris_labels) - csep = class_separation(rca.transform(), self.iris_labels) + csep = class_separation(rca.transform(self.iris_points), self.iris_labels) self.assertLess(csep, 0.25) def test_feature_null_variance(self): @@ -131,14 +132,14 @@ def test_feature_null_variance(self): # Apply PCA with the number of components rca = RCA_Supervised(num_dims=2, pca_comps=3, num_chunks=30, chunk_size=2) rca.fit(X, self.iris_labels) - csep = class_separation(rca.transform(), self.iris_labels) + csep = class_separation(rca.transform(X), self.iris_labels) self.assertLess(csep, 0.30) # Apply PCA with the minimum variance ratio rca = RCA_Supervised(num_dims=2, pca_comps=0.95, num_chunks=30, chunk_size=2) rca.fit(X, self.iris_labels) - csep = class_separation(rca.transform(), self.iris_labels) + csep = class_separation(rca.transform(X), self.iris_labels) self.assertLess(csep, 0.30) @@ -146,7 +147,7 @@ class TestMLKR(MetricTestCase): def test_iris(self): mlkr = MLKR() mlkr.fit(self.iris_points, self.iris_labels) - csep = class_separation(mlkr.transform(), self.iris_labels) + csep = class_separation(mlkr.transform(self.iris_points), self.iris_labels) self.assertLess(csep, 0.25) diff --git a/test/test_fit_transform.py b/test/test_fit_transform.py index d239ec95..f898a0fe 100644 --- a/test/test_fit_transform.py +++ b/test/test_fit_transform.py @@ -19,7 +19,7 @@ def setUpClass(self): def test_cov(self): cov = Covariance() cov.fit(self.X) - res_1 = cov.transform() + res_1 = cov.transform(self.X) cov = Covariance() res_2 = cov.fit_transform(self.X) @@ -53,7 +53,7 @@ def test_itml_supervised(self): def test_lmnn(self): lmnn = LMNN(k=5, learn_rate=1e-6, verbose=False) lmnn.fit(self.X, self.y) - res_1 = lmnn.transform() + res_1 = lmnn.transform(self.X) lmnn = LMNN(k=5, learn_rate=1e-6, verbose=False) res_2 = 
lmnn.fit_transform(self.X, self.y) @@ -76,7 +76,7 @@ def test_nca(self): n = self.X.shape[0] nca = NCA(max_iter=(100000//n), learning_rate=0.01) nca.fit(self.X, self.y) - res_1 = nca.transform() + res_1 = nca.transform(self.X) nca = NCA(max_iter=(100000//n), learning_rate=0.01) res_2 = nca.fit_transform(self.X, self.y) @@ -86,7 +86,7 @@ def test_nca(self): def test_lfda(self): lfda = LFDA(k=2, num_dims=2) lfda.fit(self.X, self.y) - res_1 = lfda.transform() + res_1 = lfda.transform(self.X) lfda = LFDA(k=2, num_dims=2) res_2 = lfda.fit_transform(self.X, self.y) @@ -100,7 +100,7 @@ def test_rca_supervised(self): seed = np.random.RandomState(1234) rca = RCA_Supervised(num_dims=2, num_chunks=30, chunk_size=2) rca.fit(self.X, self.y, random_state=seed) - res_1 = rca.transform() + res_1 = rca.transform(self.X) seed = np.random.RandomState(1234) rca = RCA_Supervised(num_dims=2, num_chunks=30, chunk_size=2) @@ -111,7 +111,7 @@ def test_rca_supervised(self): def test_mlkr(self): mlkr = MLKR(num_dims=2) mlkr.fit(self.X, self.y) - res_1 = mlkr.transform() + res_1 = mlkr.transform(self.X) mlkr = MLKR(num_dims=2) res_2 = mlkr.fit_transform(self.X, self.y) diff --git a/test/test_mahalanobis_mixin.py b/test/test_mahalanobis_mixin.py index 7b1969f9..687000c0 100644 --- a/test/test_mahalanobis_mixin.py +++ b/test/test_mahalanobis_mixin.py @@ -3,6 +3,7 @@ import pytest import numpy as np from numpy.testing import assert_array_almost_equal +from scipy.spatial.distance import pdist, squareform from sklearn import clone from sklearn.datasets import load_iris from sklearn.utils import check_random_state, shuffle @@ -72,7 +73,7 @@ def build_quadruplets(): @pytest.mark.parametrize('estimator, build_dataset', list_estimators, ids=ids_estimators) def test_score_pairs_pairwise(estimator, build_dataset): - # Computing pairwise scores should return an euclidean distance matrix. + # Computing pairwise scores should return a euclidean distance matrix. 
inputs, labels = build_dataset() X, _ = load_iris(return_X_y=True) n_samples = 20 @@ -85,10 +86,13 @@ def test_score_pairs_pairwise(estimator, build_dataset): check_is_distance_matrix(pairwise) - # a necessary condition for euclidean distances matrix: (see + # a necessary condition for euclidean distance matrices: (see # https://en.wikipedia.org/wiki/Euclidean_distance_matrix) assert np.linalg.matrix_rank(pairwise**2) <= min(X.shape) + 2 + # assert that this distance is coherent with pdist on embeddings + assert_array_almost_equal(squareform(pairwise), pdist(model.transform(X))) + @pytest.mark.parametrize('estimator, build_dataset', list_estimators, ids=ids_estimators) @@ -140,7 +144,8 @@ def check_is_distance_matrix(pairwise): assert (pairwise.diagonal() == 0).all() # identity # triangular inequality for i in range(pairwise.shape[1]): - for j in range(pairwise.shape[1]): + # since we already checked symmetry we can start at i + for j in range(i, pairwise.shape[1]): for k in range(pairwise.shape[1]): assert (pairwise[i, j] - (pairwise[i, k] + pairwise[k, j]) <= 0 + 1e-3).all() @@ -157,7 +162,7 @@ def test_embed_toy_example(estimator, build_dataset): model = clone(estimator) model.fit(inputs, labels) embedded_points = X.dot(model.transformer_.T) - assert_array_almost_equal(model.embed(X), embedded_points) + assert_array_almost_equal(model.transform(X), embedded_points) @pytest.mark.parametrize('estimator, build_dataset', list_estimators, @@ -168,14 +173,14 @@ def test_embed_dim(estimator, build_dataset): model = clone(estimator) model.fit(inputs, labels) X, _ = load_iris(return_X_y=True) - assert model.embed(X).shape == X.shape - assert model.embed(X[0, :]).shape == (len(X[0]),) + assert model.transform(X).shape == X.shape + assert model.transform(X[0, :]).shape == (len(X[0]),) # we test that the shape is also OK when doing dimensionality reduction if type(model).__name__ in {'LFDA', 'MLKR', 'NCA', 'RCA'}: - model.set_params(num_dims=2) - model.fit(inputs, labels) - 
assert model.embed(X).shape == (X.shape[0], 2) - assert model.embed(X[0, :]).shape == (2,) + model.set_params(num_dims=2) + model.fit(inputs, labels) + assert model.transform(X).shape == (X.shape[0], 2) + assert model.transform(X[0, :]).shape == (2,) @pytest.mark.parametrize('estimator, build_dataset', list_estimators, @@ -186,7 +191,7 @@ def test_embed_finite(estimator, build_dataset): model = clone(estimator) model.fit(inputs, labels) X, _ = load_iris(return_X_y=True) - assert np.isfinite(model.embed(X)).all() + assert np.isfinite(model.transform(X)).all() @pytest.mark.parametrize('estimator, build_dataset', list_estimators, @@ -197,7 +202,8 @@ def test_embed_is_linear(estimator, build_dataset): model = clone(estimator) model.fit(inputs, labels) X, _ = load_iris(return_X_y=True) - assert_array_almost_equal(model.embed(X[:10] + X[10:20]), - model.embed(X[:10]) + model.embed(X[10:20])) - assert_array_almost_equal(model.embed(5 * X[:10]), - 5 * model.embed(X[:10])) + assert_array_almost_equal(model.transform(X[:10] + X[10:20]), + model.transform(X[:10]) + + model.transform(X[10:20])) + assert_array_almost_equal(model.transform(5 * X[:10]), + 5 * model.transform(X[:10])) From 585b5d2fda05d488b6458fb9b0aff93121f5bdf5 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 24 Jul 2018 16:08:07 +0200 Subject: [PATCH 020/120] ENH: Add check_tuples --- metric_learn/_util.py | 40 +++++++++++++++++++++++++++++++++- metric_learn/base_metric.py | 16 ++++++++++---- metric_learn/itml.py | 5 ++++- metric_learn/lsml.py | 11 +++++++--- metric_learn/mmc.py | 7 ++++-- metric_learn/sdml.py | 7 +++++- test/test_mahalanobis_mixin.py | 24 ++++++++++++++++---- test/test_utils.py | 29 ++++++++++++++++++++++++ test/test_weakly_supervised.py | 19 +++++++++++----- 9 files changed, 136 insertions(+), 22 deletions(-) create mode 100644 test/test_utils.py diff --git a/metric_learn/_util.py b/metric_learn/_util.py index b34860d6..6e3d9a49 100644 --- a/metric_learn/_util.py +++ 
b/metric_learn/_util.py @@ -9,4 +9,42 @@ def vector_norm(X): return np.apply_along_axis(np.linalg.norm, 1, X) else: def vector_norm(X): - return np.linalg.norm(X, axis=1) \ No newline at end of file + return np.linalg.norm(X, axis=1) + + +def check_tuples(tuples): + """Check that the input is a valid 3D array representing a dataset of tuples. + + Equivalent of `check_array` in scikit-learn. + + Parameters + ---------- + tuples : object + The tuples to check. + + Returns + ------- + tuples_valid : object + The validated input. + """ + # If input is scalar raise error + if len(tuples.shape) == 0: + raise ValueError( + "Expected 3D array, got scalar instead. Cannot apply this function on " + "scalars.") + # If input is 1D raise error + if len(tuples.shape) == 1: + raise ValueError( + "Expected 3D array, got 1D array instead:\ntuples={}.\n" + "Reshape your data using tuples.reshape(1, -1, 1) if it contains a " + "single tuple and the points in the tuple have a single " + "feature.").format(tuples) + # If input is 2D raise error + if len(tuples.shape) == 2: + raise ValueError( + "Expected 3D array, got 2D array instead:\ntuples={}.\n" + "Reshape your data either using tuples.reshape(-1, {}, 1) if " + "your data has a single feature or tuples.reshape(1, {}, -1) " + "if it contains a single tuple.".format(tuples, tuples.shape[1], + tuples.shape[0])) + return tuples diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index ab95b066..559e4520 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -5,6 +5,7 @@ import numpy as np from abc import ABCMeta, abstractmethod import six +from ._util import check_tuples class BaseMetricLearner(BaseEstimator): @@ -86,7 +87,8 @@ def score_pairs(self, pairs): scores: `numpy.ndarray` of shape=(n_pairs,) The learned Mahalanobis distance for every pair. 
""" - pairwise_diffs = self.transform(pairs[..., 1, :] - pairs[..., 0, :]) + pairs = check_tuples(pairs) + pairwise_diffs = self.transform(pairs[:, 1, :] - pairs[:, 0, :]) # (for MahalanobisMixin, the embedding is linear so we can just embed the # difference) return np.sqrt(np.sum(pairwise_diffs**2, axis=-1)) @@ -108,7 +110,7 @@ def transform(self, X): X_embedded : `numpy.ndarray`, shape=(n_samples, num_dims) The embedded data points. """ - X_checked = check_array(X, accept_sparse=True, ensure_2d=False) + X_checked = check_array(X, accept_sparse=True) return X_checked.dot(self.transformer_.T) def metric(self): @@ -159,9 +161,11 @@ def predict(self, pairs): y_predicted : `numpy.ndarray` of floats, shape=(n_constraints,) The predicted learned metric value between samples in every pair. """ + pairs = check_tuples(pairs) return self.score_pairs(pairs) def decision_function(self, pairs): + pairs = check_tuples(pairs) return self.predict(pairs) def score(self, pairs, y): @@ -187,6 +191,7 @@ def score(self, pairs, y): score : float The ``roc_auc`` score. """ + pairs = check_tuples(pairs) return roc_auc_score(y, self.decision_function(pairs)) @@ -208,6 +213,7 @@ def predict(self, quadruplets): prediction : `numpy.ndarray` of floats, shape=(n_constraints,) Predictions of the ordering of pairs, for each quadruplet. """ + quadruplets = check_tuples(quadruplets) return np.sign(self.decision_function(quadruplets)) def decision_function(self, quadruplets): @@ -226,8 +232,9 @@ def decision_function(self, quadruplets): decision_function : `numpy.ndarray` of floats, shape=(n_constraints,) Metric differences. 
""" - return (self.score_pairs(quadruplets[..., :2, :]) - - self.score_pairs(quadruplets[..., 2:, :])) + quadruplets = check_tuples(quadruplets) + return (self.score_pairs(quadruplets[:, :2, :]) - + self.score_pairs(quadruplets[:, 2:, :])) def score(self, quadruplets, y=None): """Computes score on input quadruplets @@ -248,4 +255,5 @@ def score(self, quadruplets, y=None): score : float The quadruplets score. """ + quadruplets = check_tuples(quadruplets) return - np.mean(self.predict(quadruplets)) diff --git a/metric_learn/itml.py b/metric_learn/itml.py index ab256346..05620499 100644 --- a/metric_learn/itml.py +++ b/metric_learn/itml.py @@ -21,7 +21,7 @@ from sklearn.base import TransformerMixin from .base_metric import _PairsClassifierMixin, MahalanobisMixin from .constraints import Constraints, wrap_pairs -from ._util import vector_norm +from ._util import vector_norm, check_tuples class _BaseITML(MahalanobisMixin): @@ -52,8 +52,11 @@ def __init__(self, gamma=1., max_iter=1000, convergence_threshold=1e-3, self.verbose = verbose def _process_pairs(self, pairs, y, bounds): + # for now we check_X_y and check_tuples but we should only + # check_tuples_y in the future pairs, y = check_X_y(pairs, y, accept_sparse=False, ensure_2d=False, allow_nd=True) + pairs = check_tuples(pairs) # check to make sure that no two constrained vectors are identical pos_pairs, neg_pairs = pairs[y == 1], pairs[y == -1] diff --git a/metric_learn/lsml.py b/metric_learn/lsml.py index eb8171f9..0fdafd84 100644 --- a/metric_learn/lsml.py +++ b/metric_learn/lsml.py @@ -13,6 +13,7 @@ from six.moves import xrange from sklearn.base import TransformerMixin from sklearn.utils.validation import check_array, check_X_y +from ._util import check_tuples from .base_metric import _QuadrupletsClassifierMixin, MahalanobisMixin from .constraints import Constraints @@ -37,8 +38,11 @@ def __init__(self, tol=1e-3, max_iter=1000, prior=None, verbose=False): self.verbose = verbose def _prepare_quadruplets(self, 
quadruplets, weights): - pairs = check_array(quadruplets, accept_sparse=False, - ensure_2d=False, allow_nd=True) + # for now we check_array and check_tuples but we should only + # check_tuples in the future (with enhanced check_tuples) + quadruplets = check_array(quadruplets, accept_sparse=False, + ensure_2d=False, allow_nd=True) + quadruplets = check_tuples(quadruplets) # check to make sure that no two constrained vectors are identical self.vab_ = quadruplets[:, 0, :] - quadruplets[:, 1, :] @@ -51,7 +55,8 @@ def _prepare_quadruplets(self, quadruplets, weights): self.w_ = weights self.w_ /= self.w_.sum() # weights must sum to 1 if self.prior is None: - X = np.vstack({tuple(row) for row in pairs.reshape(-1, pairs.shape[2])}) + X = np.vstack({tuple(row) for row in + quadruplets.reshape(-1, quadruplets.shape[2])}) self.prior_inv_ = np.atleast_2d(np.cov(X, rowvar=False)) self.M_ = np.linalg.inv(self.prior_inv_) else: diff --git a/metric_learn/mmc.py b/metric_learn/mmc.py index 025bded5..921402ce 100644 --- a/metric_learn/mmc.py +++ b/metric_learn/mmc.py @@ -24,7 +24,7 @@ from .base_metric import _PairsClassifierMixin, MahalanobisMixin from .constraints import Constraints, wrap_pairs -from ._util import vector_norm +from ._util import vector_norm, check_tuples class _BaseMMC(MahalanobisMixin): @@ -65,8 +65,11 @@ def _fit(self, pairs, y): return self._fit_full(pairs, y) def _process_pairs(self, pairs, y): + # for now we check_X_y and check_tuples but we should only + # check_tuples_y in the future pairs, y = check_X_y(pairs, y, accept_sparse=False, - ensure_2d=False, allow_nd=True) + ensure_2d=False, allow_nd=True) + pairs = check_tuples(pairs) # check to make sure that no two constrained vectors are identical pos_pairs, neg_pairs = pairs[y == 1], pairs[y == -1] diff --git a/metric_learn/sdml.py b/metric_learn/sdml.py index d1eae770..b91b6b27 100644 --- a/metric_learn/sdml.py +++ b/metric_learn/sdml.py @@ -17,6 +17,7 @@ from .base_metric import MahalanobisMixin, 
_PairsClassifierMixin from .constraints import Constraints, wrap_pairs +from ._util import check_tuples class _BaseSDML(MahalanobisMixin): @@ -43,8 +44,12 @@ def __init__(self, balance_param=0.5, sparsity_param=0.01, use_cov=True, self.verbose = verbose def _prepare_pairs(self, pairs, y): + # for now we check_X_y and check_tuples but we should only + # check_tuples_y in the future pairs, y = check_X_y(pairs, y, accept_sparse=False, - ensure_2d=False, allow_nd=True) + ensure_2d=False, allow_nd=True) + pairs = check_tuples(pairs) + # set up prior M if self.use_cov: X = np.vstack({tuple(row) for row in pairs.reshape(-1, pairs.shape[2])}) diff --git a/test/test_mahalanobis_mixin.py b/test/test_mahalanobis_mixin.py index 687000c0..fb54eb35 100644 --- a/test/test_mahalanobis_mixin.py +++ b/test/test_mahalanobis_mixin.py @@ -128,14 +128,21 @@ def test_score_pairs_finite(estimator, build_dataset): ids=ids_estimators) def test_score_pairs_dim(estimator, build_dataset): # scoring of 3D arrays should return 1D array (several tuples), - # and scoring of 2D arrays (one tuple) should return a scalar (0D array). 
+ # and scoring of 2D arrays (one tuple) should return an error (like + # scikit-learn's error when scoring 1D arrays) inputs, labels = build_dataset() model = clone(estimator) model.fit(inputs, labels) X, _ = load_iris(return_X_y=True) tuples = np.array(list(product(X, X))) assert model.score_pairs(tuples).shape == (tuples.shape[0],) - assert np.isscalar(model.score_pairs(tuples[1])) + msg = ("Expected 3D array, got 2D array instead:\ntuples={}.\n" + "Reshape your data either using tuples.reshape(-1, {}, 1) if " + "your data has a single feature or tuples.reshape(1, {}, -1) " + "if it contains a single tuple.".format(tuples, tuples.shape[1], + tuples.shape[0])) + with pytest.raises(ValueError, message=msg): + model.score_pairs(tuples[1]) def check_is_distance_matrix(pairwise): @@ -174,13 +181,22 @@ def test_embed_dim(estimator, build_dataset): model.fit(inputs, labels) X, _ = load_iris(return_X_y=True) assert model.transform(X).shape == X.shape - assert model.transform(X[0, :]).shape == (len(X[0]),) + + # assert that ValueError is thrown if input shape is 1D + err_msg = ("Expected 2D array, got 1D array instead:\narray={}.\n" + "Reshape your data either using array.reshape(-1, 1) if " + "your data has a single feature or array.reshape(1, -1) " + "if it contains a single sample.".format(X)) + with pytest.raises(ValueError, message=err_msg): + model.score_pairs(model.transform(X[0, :])) # we test that the shape is also OK when doing dimensionality reduction if type(model).__name__ in {'LFDA', 'MLKR', 'NCA', 'RCA'}: model.set_params(num_dims=2) model.fit(inputs, labels) assert model.transform(X).shape == (X.shape[0], 2) - assert model.transform(X[0, :]).shape == (2,) + # assert that ValueError is thrown if input shape is 1D + with pytest.raises(ValueError, message=err_msg): + model.transform(model.transform(X[0, :])) @pytest.mark.parametrize('estimator, build_dataset', list_estimators, diff --git a/test/test_utils.py b/test/test_utils.py new file mode 100644 index 
00000000..da4b97ad --- /dev/null +++ b/test/test_utils.py @@ -0,0 +1,29 @@ +import numpy as np +import pytest +from metric_learn._util import check_tuples + + +def test_check_tuples(): + X = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]]) + check_tuples(X) + + X = np.array(5) + msg = ("Expected 3D array, got scalar instead. Cannot apply this function " + "on scalars.") + with pytest.raises(ValueError, message=msg): + check_tuples(X) + + X = np.array([1, 2, 3]) + msg = ("Expected 3D array, got 1D array instead:\ntuples=[1, 2, 3].\n" + "Reshape your data using tuples.reshape(1, -1, 1) if it contains a " + "single tuple and the points in the tuple have a single feature.") + with pytest.raises(ValueError, message=msg): + check_tuples(X) + + X = np.array([[1, 2, 3], [2, 3, 5]]) + msg = ("Expected 3D array, got 2D array instead:\ntuples=[[1, 2, 3], " + "[2, 3, 5]].\nReshape your data either using " + "tuples.reshape(-1, 3, 1) if your data has a single feature or " + "tuples.reshape(1, 2, -1) if it contains a single tuple.") + with pytest.raises(ValueError, message=msg): + check_tuples(X) diff --git a/test/test_weakly_supervised.py b/test/test_weakly_supervised.py index 146df297..e7846fb4 100644 --- a/test/test_weakly_supervised.py +++ b/test/test_weakly_supervised.py @@ -169,17 +169,24 @@ def test_dict_unchanged(estimator, build_dataset): (tuples, y, tuples_train, tuples_test, y_train, y_test) = build_dataset() estimator = clone(estimator) - if hasattr(estimator, "n_components"): - estimator.n_components = 1 + if hasattr(estimator, "num_dims"): + estimator.num_dims = 1 estimator.fit(tuples, y) - for method in ["predict", "transform", "decision_function", - "predict_proba"]: + for method in ["predict", "decision_function", "predict_proba"]: if hasattr(estimator, method): dict_before = estimator.__dict__.copy() getattr(estimator, method)(tuples) assert estimator.__dict__ == dict_before, \ ("Estimator changes __dict__ during %s" % method) + for method in ["transform"]: + if 
hasattr(estimator, method): + dict_before = estimator.__dict__.copy() + # we transform only 2D arrays (dataset of points) + getattr(estimator, method)(tuples[:, 0, :]) + assert estimator.__dict__ == dict_before, \ + ("Estimator changes __dict__ during %s" + % method) @pytest.mark.parametrize('estimator, build_dataset', list_estimators, @@ -190,8 +197,8 @@ def test_dont_overwrite_parameters(estimator, build_dataset): (tuples, y, tuples_train, tuples_test, y_train, y_test) = build_dataset() estimator = clone(estimator) - if hasattr(estimator, "n_components"): - estimator.n_components = 1 + if hasattr(estimator, "num_dims"): + estimator.num_dims = 1 dict_before_fit = estimator.__dict__.copy() estimator.fit(tuples, y) From af0a3aceda2d75c589c67c889ea9b3ef8bc4c3b0 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 24 Jul 2018 16:30:22 +0200 Subject: [PATCH 021/120] FIX: fix parenthesis --- metric_learn/_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metric_learn/_util.py b/metric_learn/_util.py index 6e3d9a49..b0a8235d 100644 --- a/metric_learn/_util.py +++ b/metric_learn/_util.py @@ -38,7 +38,7 @@ def check_tuples(tuples): "Expected 3D array, got 1D array instead:\ntuples={}.\n" "Reshape your data using tuples.reshape(1, -1, 1) if it contains a " "single tuple and the points in the tuple have a single " - "feature.").format(tuples) + "feature.".format(tuples)) # If input is 2D raise error if len(tuples.shape) == 2: raise ValueError( From f1dd4c2e2fe785bc329cfea34db17e235884df2b Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 21 Aug 2018 17:50:46 +0200 Subject: [PATCH 022/120] ENH: First commit adding a preprocessor --- metric_learn/_util.py | 146 ++++++++++++++++++++++++++++++------ metric_learn/base_metric.py | 80 ++++++++++++++------ metric_learn/itml.py | 20 ++++- metric_learn/lsml.py | 25 ++++-- metric_learn/mmc.py | 19 ++++- metric_learn/sdml.py | 19 ++++- 6 files changed, 244 insertions(+), 65 deletions(-) diff 
--git a/metric_learn/_util.py b/metric_learn/_util.py index b0a8235d..3b18161e 100644 --- a/metric_learn/_util.py +++ b/metric_learn/_util.py @@ -1,4 +1,6 @@ import numpy as np +import six +from sklearn.utils import check_array # hack around lack of axis kwarg in older numpy versions @@ -12,39 +14,133 @@ def vector_norm(X): return np.linalg.norm(X, axis=1) -def check_tuples(tuples): - """Check that the input is a valid 3D array representing a dataset of tuples. - - Equivalent of `check_array` in scikit-learn. +def check_tuples(tuples, preprocessor=False, t=None, dtype="auto", + order=None, copy=False, force_all_finite=True, + ensure_min_samples=1, ensure_min_features=1, + warn_on_dtype=False, estimator=None): + """Check that `tuples` is a valid array of tuples. Parameters ---------- tuples : object The tuples to check. + t : int + The number of elements in a tuple (e.g. 2 for pairs). + + dtype : string, type, list of types or None (default="auto") + Data type of result. If None, the dtype of the input is preserved. + If "numeric", dtype is preserved unless array.dtype is object. + If dtype is a list of types, conversion on the first type is only + performed if the dtype of the input is not in the list. If + "auto", will we be set to "numeric" if `preprocessor=True`, + else to None. + + order : 'F', 'C' or None (default=None) + Whether an array will be forced to be fortran or c-style. + + copy : boolean (default=False) + Whether a forced copy will be triggered. If copy=False, a copy might + be triggered by a conversion. + + force_all_finite : boolean or 'allow-nan', (default=True) + Whether to raise an error on np.inf and np.nan in X. This parameter + does not influence whether y can have np.inf or np.nan values. + The possibilities are: + + - True: Force all values of X to be finite. + - False: accept both np.inf and np.nan in X. + - 'allow-nan': accept only np.nan values in X. Values cannot be + infinite. 
+ + ensure_min_samples : int (default=1) + Make sure that X has a minimum number of samples in its first + axis (rows for a 2D array). + + ensure_min_features : int (default=1) + Make sure that the 2D array has some minimum number of features + (columns). The default value of 1 rejects empty datasets. + This check is only enforced when X has effectively 2 dimensions or + is originally 1D and ``ensure_2d`` is True. Setting to 0 disables + this check. + + warn_on_dtype : boolean (default=False) + Raise DataConversionWarning if the dtype of the input data structure + does not match the requested dtype, causing a memory copy. + + estimator : str or estimator instance (default=None) + If passed, include the name of the estimator in warning messages. + Returns ------- tuples_valid : object - The validated input. + The validated tuples. """ - # If input is scalar raise error - if len(tuples.shape) == 0: - raise ValueError( - "Expected 3D array, got scalar instead. Cannot apply this function on " - "scalars.") - # If input is 1D raise error - if len(tuples.shape) == 1: - raise ValueError( - "Expected 3D array, got 1D array instead:\ntuples={}.\n" - "Reshape your data using tuples.reshape(1, -1, 1) if it contains a " - "single tuple and the points in the tuple have a single " - "feature.".format(tuples)) - # If input is 2D raise error - if len(tuples.shape) == 2: - raise ValueError( - "Expected 3D array, got 2D array instead:\ntuples={}.\n" - "Reshape your data either using tuples.reshape(-1, {}, 1) if " - "your data has a single feature or tuples.reshape(1, {}, -1) " - "if it contains a single tuple.".format(tuples, tuples.shape[1], - tuples.shape[0])) + if dtype == "auto": + dtype = 'numeric' if preprocessor else None + + context = make_context(estimator, preprocessor) + tuples = check_array(tuples, dtype=dtype, accept_sparse=False, copy=copy, + force_all_finite=force_all_finite, + order=order, + ensure_2d=False, # tuples can be 2D or 3D + allow_nd=True, + 
ensure_min_samples=ensure_min_samples, + # ensure_min_features only works if ndim=2, so we will + # have to check again if input is 3D (see below) + ensure_min_features=ensure_min_features, + estimator=context, + warn_on_dtype=warn_on_dtype) + + if tuples.ndim == 2: # in this case there is left to check if t is OK + check_t(tuples, t, context) + elif tuples.ndim == 3: + # if the dimension is 3 we still have to check that the num_features is OK + check_array(tuples[:, 0, :], ensure_min_features=ensure_min_features, + estimator=context) + # then we should also check that t is OK + check_t(tuples, t, context) + else: + expected_shape = 2 if preprocessor else 3 + raise ValueError("{}D array expected{}. Found {}D array " + "instead:\ninput={}.\n" + .format(expected_shape, context, tuples.ndim, tuples)) return tuples + + +def make_context(estimator, preprocessor): + """Helper function to create a string with estimator name and tell if + it is using a preprocessor. For instance if using NCA and preprocessor it + will return 'NCA when using a preprocessor'""" + if estimator is not None: + if isinstance(estimator, six.string_types): + estimator_name = estimator + else: + estimator_name = estimator.__class__.__name__ + else: + estimator_name = None + context = ("by {}".format(estimator_name) if estimator_name is not None + else "") + with_preprocessor = ('when using {} preprocessor' + .format('a' if preprocessor else 'no')) + context += with_preprocessor + return context + + +def check_t(tuples, t, context): + """Helper function to check that the number of points in each tuple is + equal to t (e.g. 2 for pairs), and raise a `ValueError` otherwise""" + if t is not None and tuples.shape[1] != t: + msg_t = (("Tuples of {} elements expected{}. 
Got tuples of {} " + "elements instead (shape={}):\ninput={}.\n") + .format(t, context, tuples.shape[1], tuples.shape, tuples)) + raise ValueError(msg_t) + + +class SimplePreprocessor(): + + def __init__(self, X): + self.X = X + + def __call__(self, indices): + return self.X[indices] diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index 559e4520..822c9e04 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -1,18 +1,32 @@ from numpy.linalg import cholesky from sklearn.base import BaseEstimator -from sklearn.utils.validation import check_array +from sklearn.utils.validation import check_array, _is_arraylike from sklearn.metrics import roc_auc_score import numpy as np -from abc import ABCMeta, abstractmethod +from abc import ABCMeta import six -from ._util import check_tuples +from ._util import check_tuples, SimplePreprocessor class BaseMetricLearner(BaseEstimator): - def __init__(self): - raise NotImplementedError('BaseMetricLearner should not be instantiated') - @abstractmethod + def __init__(self, preprocessor=None): + """ + + Parameters + ---------- + preprocessor : array-like, shape=(n_samples, n_features) or callable + The preprocessor to call to get tuples from indices. If array-like, + tuples will be gotten like this: X[indices]. + """ + self.preprocessor = preprocessor + + def check_preprocessor(self): + if _is_arraylike(self.preprocessor): + self.preprocessor_ = SimplePreprocessor(self.preprocessor) + else: + self.preprocessor_ = self.preprocessor + def score_pairs(self, pairs): """Returns the score between pairs (can be a similarity, or a distance/metric depending on the algorithm) @@ -28,6 +42,12 @@ def score_pairs(self, pairs): The score of every pair. 
""" + def format_input(self, input): + if self.preprocessor is not None: + return np.apply_along_axis(self.preprocessor_, 1, input) + else: + return input + class MetricTransformer(): @@ -79,8 +99,10 @@ def score_pairs(self, pairs): Parameters ---------- - pairs : `numpy.ndarray`, shape=(n_samples, 2, n_features) - 3D array of pairs, or 2D array of one pair. + pairs : array-like, shape=(n_pairs, 2, n_features) or (n_pairs, 2) + 3D Array of pairs to score, with each row corresponding to two points, + or 2D array of indices of pairs if the metric learner uses a + preprocessor. Returns ------- @@ -88,6 +110,7 @@ def score_pairs(self, pairs): The learned Mahalanobis distance for every pair. """ pairs = check_tuples(pairs) + pairs = self.format_input(pairs) pairwise_diffs = self.transform(pairs[:, 1, :] - pairs[:, 0, :]) # (for MahalanobisMixin, the embedding is linear so we can just embed the # difference) @@ -153,8 +176,10 @@ def predict(self, pairs): Parameters ---------- - pairs : array-like, shape=(n_constraints, 2, n_features) - Input pairs. + pairs: array-like, shape=(n_pairs, 2, n_features) or (n_pairs, 2) + 3D Array of pairs to predict, with each row corresponding to two + points, or 2D array of indices of pairs if the metric learner uses a + preprocessor. Returns ------- @@ -180,8 +205,10 @@ def score(self, pairs, y): Parameters ---------- - pairs : array-like, shape=(n_constraints, 2, n_features) - Input Pairs. + pairs: array-like, shape=(n_pairs, 2, n_features) or (n_pairs, 2) + 3D Array of pairs, with each row corresponding to two points, + or 2D array of indices of pairs if the metric learner uses a + preprocessor. y : array-like, shape=(n_constraints,) The corresponding labels. @@ -205,15 +232,18 @@ def predict(self, quadruplets): Parameters ---------- - quadruplets : array-like, shape=(n_constraints, 4, n_features) - Input quadruplets. 
+ quadruplets : array-like, shape=(n_quadruplets, 4, n_features) or + (n_quadruplets, 4) + 3D Array of quadruplets to predict, with each row corresponding to four + points, or 2D array of indices of quadruplets if the metric learner + uses a preprocessor. Returns ------- prediction : `numpy.ndarray` of floats, shape=(n_constraints,) Predictions of the ordering of pairs, for each quadruplet. """ - quadruplets = check_tuples(quadruplets) + quadruplets = check_tuples(quadruplets, preprocessor=self.preprocessor) return np.sign(self.decision_function(quadruplets)) def decision_function(self, quadruplets): @@ -224,8 +254,11 @@ def decision_function(self, quadruplets): Parameters ---------- - quadruplets : array-like, shape=(n_constraints, 4, n_features) - Input quadruplets. + quadruplets : array-like, shape=(n_quadruplets, 4, n_features) or + (n_quadruplets, 4) + 3D Array of quadruplets to evaluate, with each row corresponding to + four points, or 2D array of indices of quadruplets if the metric + learner uses a preprocessor. Returns ------- @@ -233,8 +266,10 @@ def decision_function(self, quadruplets): Metric differences. """ quadruplets = check_tuples(quadruplets) - return (self.score_pairs(quadruplets[:, :2, :]) - - self.score_pairs(quadruplets[:, 2:, :])) + # we broadcast with ... because here we allow quadruplets to be + # either a 3D array of points or 2D array of indices + return (self.score_pairs(quadruplets[:, :2, ...]) - + self.score_pairs(quadruplets[:, 2:, ...])) def score(self, quadruplets, y=None): """Computes score on input quadruplets @@ -245,8 +280,11 @@ def score(self, quadruplets, y=None): Parameters ---------- - quadruplets : array-like, shape=(n_constraints, 4, n_features) - Input quadruplets. + quadruplets : array-like, shape=(n_quadruplets, 4, n_features) or + (n_quadruplets, 4) + 3D Array of quadruplets to score, with each row corresponding to four + points, or 2D array of indices of quadruplets if the metric learner + uses a preprocessor. 
y : Ignored, for scikit-learn compatibility. diff --git a/metric_learn/itml.py b/metric_learn/itml.py index 05620499..8dd1eb37 100644 --- a/metric_learn/itml.py +++ b/metric_learn/itml.py @@ -27,7 +27,7 @@ class _BaseITML(MahalanobisMixin): """Information Theoretic Metric Learning (ITML)""" def __init__(self, gamma=1., max_iter=1000, convergence_threshold=1e-3, - A0=None, verbose=False): + A0=None, verbose=False, preprocessor=None): """Initialize ITML. Parameters @@ -44,19 +44,28 @@ def __init__(self, gamma=1., max_iter=1000, convergence_threshold=1e-3, verbose : bool, optional if True, prints information while learning + + preprocessor : array-like, shape=(n_samples, n_features) or callable + The preprocessor to call to get tuples from indices. If array-like, + tuples will be formed like this: X[indices]. """ self.gamma = gamma self.max_iter = max_iter self.convergence_threshold = convergence_threshold self.A0 = A0 self.verbose = verbose + super(_BaseITML, self).__init__(preprocessor) def _process_pairs(self, pairs, y, bounds): # for now we check_X_y and check_tuples but we should only # check_tuples_y in the future pairs, y = check_X_y(pairs, y, accept_sparse=False, ensure_2d=False, allow_nd=True) - pairs = check_tuples(pairs) + self.check_preprocessor() + pairs = check_tuples(pairs, preprocessor=self.preprocessor, t=2, + estimator=self) + # pairs classifiers and for quadruplets classifiers + pairs = self.format_input(pairs) # check to make sure that no two constrained vectors are identical pos_pairs, neg_pairs = pairs[y == 1], pairs[y == -1] @@ -143,8 +152,11 @@ def fit(self, pairs, y, bounds=None): Parameters ---------- - pairs: array-like, shape=(n_constraints, 2, n_features) - Array of pairs. Each row corresponds to two points. + pairs: array-like, shape=(n_constraints, 2, n_features) or + (n_constraints, 2) + 3D Array of pairs with each row corresponding to two points, + or 2D array of indices of pairs if the metric learner uses a + preprocessor. 
y: array-like, of shape (n_constraints,) Labels of constraints. Should be -1 for dissimilar pair, 1 for similar. bounds : list (pos,neg) pairs, optional diff --git a/metric_learn/lsml.py b/metric_learn/lsml.py index 0fdafd84..bf0524ea 100644 --- a/metric_learn/lsml.py +++ b/metric_learn/lsml.py @@ -20,7 +20,8 @@ class _BaseLSML(MahalanobisMixin): - def __init__(self, tol=1e-3, max_iter=1000, prior=None, verbose=False): + def __init__(self, tol=1e-3, max_iter=1000, prior=None, verbose=False, + preprocessor=None): """Initialize LSML. Parameters @@ -31,18 +32,25 @@ def __init__(self, tol=1e-3, max_iter=1000, prior=None, verbose=False): guess at a metric [default: inv(covariance(X))] verbose : bool, optional if True, prints information while learning + preprocessor : array-like, shape=(n_samples, n_features) or callable + The preprocessor to call to get tuples from indices. If array-like, + tuples will be formed like this: X[indices]. """ self.prior = prior self.tol = tol self.max_iter = max_iter self.verbose = verbose + super(_BaseLSML, self).__init__(preprocessor) def _prepare_quadruplets(self, quadruplets, weights): # for now we check_array and check_tuples but we should only # check_tuples in the future (with enhanced check_tuples) quadruplets = check_array(quadruplets, accept_sparse=False, ensure_2d=False, allow_nd=True) - quadruplets = check_tuples(quadruplets) + self.check_preprocessor() + quadruplets = check_tuples(quadruplets, preprocessor=self.preprocessor, + t=4, estimator=self) + quadruplets = self.format_input(quadruplets) # check to make sure that no two constrained vectors are identical self.vab_ = quadruplets[:, 0, :] - quadruplets[:, 1, :] @@ -135,11 +143,14 @@ def fit(self, quadruplets, weights=None): Parameters ---------- - quadruplets : array-like, shape=(n_constraints, 4, n_features) - Each row corresponds to 4 points. 
In order to supervise the - algorithm in the right way, we should have the four samples ordered - in a way such that: d(pairs[i, 0],X[i, 1]) < d(X[i, 2], X[i, 3]) - for all 0 <= i < n_constraints. + quadruplets : array-like, shape=(n_constraints, 4, n_features) or + (n_constraints, 4) + 3D array with each row (element took from the first dimension) + corresponding to 4 points. In order to supervise the algorithm in the + right way, we should have the four samples ordered in a way such that: + d(pairs[i, 0],X[i, 1]) < d(X[i, 2], X[i, 3]) for all 0 <= i < + n_constraints. If the instance was created with a preprocessor, it can + also be fitted on 2D arrays of indices of quadruplets. weights : (n_constraints,) array of floats, optional scale factor for each constraint diff --git a/metric_learn/mmc.py b/metric_learn/mmc.py index 921402ce..98ca91bc 100644 --- a/metric_learn/mmc.py +++ b/metric_learn/mmc.py @@ -30,7 +30,8 @@ class _BaseMMC(MahalanobisMixin): """Mahalanobis Metric for Clustering (MMC)""" def __init__(self, max_iter=100, max_proj=10000, convergence_threshold=1e-3, - A0=None, diagonal=False, diagonal_c=1.0, verbose=False): + A0=None, diagonal=False, diagonal_c=1.0, verbose=False, + preprocessor=None): """Initialize MMC. Parameters ---------- @@ -48,6 +49,9 @@ def __init__(self, max_iter=100, max_proj=10000, convergence_threshold=1e-3, metric learning verbose : bool, optional if True, prints information while learning + preprocessor : array-like, shape=(n_samples, n_features) or callable + The preprocessor to call to get tuples from indices. If array-like, + tuples will be gotten like this: X[indices]. 
""" self.max_iter = max_iter self.max_proj = max_proj @@ -56,8 +60,10 @@ def __init__(self, max_iter=100, max_proj=10000, convergence_threshold=1e-3, self.diagonal = diagonal self.diagonal_c = diagonal_c self.verbose = verbose + super(_BaseMMC, self).__init__(preprocessor) def _fit(self, pairs, y): + self.check_preprocessor() pairs, y = self._process_pairs(pairs, y) if self.diagonal: return self._fit_diag(pairs, y) @@ -69,7 +75,9 @@ def _process_pairs(self, pairs, y): # check_tuples_y in the future pairs, y = check_X_y(pairs, y, accept_sparse=False, ensure_2d=False, allow_nd=True) - pairs = check_tuples(pairs) + pairs = check_tuples(pairs, preprocessor=self.preprocessor, t=2, + estimator=self) + pairs = self.format_input(pairs) # check to make sure that no two constrained vectors are identical pos_pairs, neg_pairs = pairs[y == 1], pairs[y == -1] @@ -365,8 +373,11 @@ def fit(self, pairs, y): Parameters ---------- - pairs: array-like, shape=(n_constraints, 2, n_features) - Array of pairs. Each row corresponds to two points. + pairs: array-like, shape=(n_constraints, 2, n_features) or + (n_constraints, 2) + 3D Array of pairs with each row corresponding to two points, + or 2D array of indices of pairs if the metric learner uses a + preprocessor. y: array-like, of shape (n_constraints,) Labels of constraints. Should be -1 for dissimilar pair, 1 for similar. 
diff --git a/metric_learn/sdml.py b/metric_learn/sdml.py index b91b6b27..8fe0cf06 100644 --- a/metric_learn/sdml.py +++ b/metric_learn/sdml.py @@ -22,7 +22,7 @@ class _BaseSDML(MahalanobisMixin): def __init__(self, balance_param=0.5, sparsity_param=0.01, use_cov=True, - verbose=False): + verbose=False, preprocessor=None): """ Parameters ---------- @@ -37,18 +37,26 @@ def __init__(self, balance_param=0.5, sparsity_param=0.01, use_cov=True, verbose : bool, optional if True, prints information while learning + + preprocessor : array-like, shape=(n_samples, n_features) or callable + The preprocessor to call to get tuples from indices. If array-like, + tuples will be gotten like this: X[indices]. """ self.balance_param = balance_param self.sparsity_param = sparsity_param self.use_cov = use_cov self.verbose = verbose + super(_BaseSDML, self).__init__(preprocessor) def _prepare_pairs(self, pairs, y): # for now we check_X_y and check_tuples but we should only # check_tuples_y in the future pairs, y = check_X_y(pairs, y, accept_sparse=False, ensure_2d=False, allow_nd=True) - pairs = check_tuples(pairs) + self.check_preprocessor() + pairs = check_tuples(pairs, preprocessor=self.preprocessor, t=2, + estimator=self) + pairs = self.format_input(pairs) # set up prior M if self.use_cov: @@ -78,8 +86,11 @@ def fit(self, pairs, y): Parameters ---------- - pairs: array-like, shape=(n_constraints, 2, n_features) - Array of pairs. Each row corresponds to two points. + pairs: array-like, shape=(n_constraints, 2, n_features) or + (n_constraints, 2) + 3D Array of pairs with each row corresponding to two points, + or 2D array of indices of pairs if the metric learner uses a + preprocessor. y: array-like, of shape (n_constraints,) Labels of constraints. Should be -1 for dissimilar pair, 1 for similar. 
From 47fbf465647ba31676d95335283e6c6ed6a94051 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Thu, 23 Aug 2018 12:02:08 +0200 Subject: [PATCH 023/120] ENH: Improve check_tuples with more comments and deal better with ensure_min_features --- metric_learn/_util.py | 49 ++++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/metric_learn/_util.py b/metric_learn/_util.py index 3b18161e..7a2db7df 100644 --- a/metric_learn/_util.py +++ b/metric_learn/_util.py @@ -58,11 +58,10 @@ def check_tuples(tuples, preprocessor=False, t=None, dtype="auto", axis (rows for a 2D array). ensure_min_features : int (default=1) - Make sure that the 2D array has some minimum number of features - (columns). The default value of 1 rejects empty datasets. - This check is only enforced when X has effectively 2 dimensions or - is originally 1D and ``ensure_2d`` is True. Setting to 0 disables - this check. + Only used when using no preprocessor. Make sure that each point in the 3D + array of tuples has some minimum number of features (axis=2). The default + value of 1 rejects empty datasets. This check is only enforced when X has + effectively 3 dimensions. Setting to 0 disables this check. 
warn_on_dtype : boolean (default=False) Raise DataConversionWarning if the dtype of the input data structure @@ -79,7 +78,7 @@ def check_tuples(tuples, preprocessor=False, t=None, dtype="auto", if dtype == "auto": dtype = 'numeric' if preprocessor else None - context = make_context(estimator, preprocessor) + context = make_name(estimator, preprocessor) tuples = check_array(tuples, dtype=dtype, accept_sparse=False, copy=copy, force_all_finite=force_all_finite, order=order, @@ -88,7 +87,10 @@ def check_tuples(tuples, preprocessor=False, t=None, dtype="auto", ensure_min_samples=ensure_min_samples, # ensure_min_features only works if ndim=2, so we will # have to check again if input is 3D (see below) - ensure_min_features=ensure_min_features, + ensure_min_features = 0, + # if 2D and preprocessor, no notion of + # "features". If 3D and no preprocessor, min_features + # is checked below estimator=context, warn_on_dtype=warn_on_dtype) @@ -102,37 +104,36 @@ def check_tuples(tuples, preprocessor=False, t=None, dtype="auto", check_t(tuples, t, context) else: expected_shape = 2 if preprocessor else 3 - raise ValueError("{}D array expected{}. Found {}D array " + raise ValueError("{} expected {}D array. Found {}D array " "instead:\ninput={}.\n" - .format(expected_shape, context, tuples.ndim, tuples)) + .format(context, expected_shape, tuples.ndim, tuples)) return tuples -def make_context(estimator, preprocessor): - """Helper function to create a string with estimator name and tell if - it is using a preprocessor. For instance if using NCA and preprocessor it - will return 'NCA when using a preprocessor'""" +def make_name(estimator, preprocessor): + """Helper function to create a string with the estimator name and tell if + it is using a preprocessor. 
Will return the following for instance: + NCA + preprocessor: 'NCA's preprocessor' + NCA + no preprocessor: 'NCA' + None + preprocessor: 'a preprocessor' + None + None: None""" if estimator is not None: + with_preprocessor = "'s preprocessor" if preprocessor else '' if isinstance(estimator, six.string_types): - estimator_name = estimator + estimator_name = estimator + with_preprocessor else: - estimator_name = estimator.__class__.__name__ + estimator_name = estimator.__class__.__name__ + with_preprocessor else: - estimator_name = None - context = ("by {}".format(estimator_name) if estimator_name is not None - else "") - with_preprocessor = ('when using {} preprocessor' - .format('a' if preprocessor else 'no')) - context += with_preprocessor - return context + estimator_name = None if not preprocessor else 'a preprocessor' + return estimator_name def check_t(tuples, t, context): """Helper function to check that the number of points in each tuple is equal to t (e.g. 2 for pairs), and raise a `ValueError` otherwise""" if t is not None and tuples.shape[1] != t: - msg_t = (("Tuples of {} elements expected{}. Got tuples of {} " - "elements instead (shape={}):\ninput={}.\n") + msg_t = (("Tuples of {} element(s) expected{}. 
Got tuples of {} " + "element(s) instead (shape={}):\ninput={}.\n") .format(t, context, tuples.shape[1], tuples.shape, tuples)) raise ValueError(msg_t) From 8eb419a42b2b76dc2d3801f5815c9a77fc390e82 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Thu, 23 Aug 2018 13:53:55 +0200 Subject: [PATCH 024/120] STY: remove unexpected spaces --- metric_learn/_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metric_learn/_util.py b/metric_learn/_util.py index 7a2db7df..1e3f051c 100644 --- a/metric_learn/_util.py +++ b/metric_learn/_util.py @@ -87,7 +87,7 @@ def check_tuples(tuples, preprocessor=False, t=None, dtype="auto", ensure_min_samples=ensure_min_samples, # ensure_min_features only works if ndim=2, so we will # have to check again if input is 3D (see below) - ensure_min_features = 0, + ensure_min_features=0, # if 2D and preprocessor, no notion of # "features". If 3D and no preprocessor, min_features # is checked below From 033da6070324c046d71db828fd97fa91cf27b58b Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Thu, 23 Aug 2018 13:58:40 +0200 Subject: [PATCH 025/120] FIX: Raise more appropriate error message The previous error message would have said "[...], shape=(shape_of_the_2D_array_extracted_from_3D)" But it is clearer to print the shape of the actual 3D initial array (tuples in input) --- metric_learn/_util.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/metric_learn/_util.py b/metric_learn/_util.py index 1e3f051c..2895a49b 100644 --- a/metric_learn/_util.py +++ b/metric_learn/_util.py @@ -98,8 +98,13 @@ def check_tuples(tuples, preprocessor=False, t=None, dtype="auto", check_t(tuples, t, context) elif tuples.ndim == 3: # if the dimension is 3 we still have to check that the num_features is OK - check_array(tuples[:, 0, :], ensure_min_features=ensure_min_features, - estimator=context) + if ensure_min_features > 0: + n_features = array.shape[2] + if n_features < ensure_min_features: + raise 
ValueError("Found array with %d feature(s) (shape=%s) while" + " a minimum of %d is required%s." + % (n_features, shape_repr, ensure_min_features, + context)) # then we should also check that t is OK check_t(tuples, t, context) else: From 6fae2625fe6f189516f7c9594ffcab7c745146c5 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Thu, 23 Aug 2018 14:33:01 +0200 Subject: [PATCH 026/120] FIX: fix string formatting and refactor name and context to use context in custom functions but to give name to check_array --- metric_learn/_util.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/metric_learn/_util.py b/metric_learn/_util.py index 2895a49b..43db125d 100644 --- a/metric_learn/_util.py +++ b/metric_learn/_util.py @@ -78,7 +78,8 @@ def check_tuples(tuples, preprocessor=False, t=None, dtype="auto", if dtype == "auto": dtype = 'numeric' if preprocessor else None - context = make_name(estimator, preprocessor) + name = make_name(estimator, preprocessor) + context = ' by ' + name if name is not None else '' tuples = check_array(tuples, dtype=dtype, accept_sparse=False, copy=copy, force_all_finite=force_all_finite, order=order, @@ -91,7 +92,7 @@ def check_tuples(tuples, preprocessor=False, t=None, dtype="auto", # if 2D and preprocessor, no notion of # "features". If 3D and no preprocessor, min_features # is checked below - estimator=context, + estimator=name, warn_on_dtype=warn_on_dtype) if tuples.ndim == 2: # in this case there is left to check if t is OK @@ -99,17 +100,17 @@ def check_tuples(tuples, preprocessor=False, t=None, dtype="auto", elif tuples.ndim == 3: # if the dimension is 3 we still have to check that the num_features is OK if ensure_min_features > 0: - n_features = array.shape[2] + n_features = tuples.shape[2] if n_features < ensure_min_features: - raise ValueError("Found array with %d feature(s) (shape=%s) while" - " a minimum of %d is required%s." 
- % (n_features, shape_repr, ensure_min_features, - context)) + raise ValueError("Found array with {} feature(s) (shape={}) while" + " a minimum of {} is required{}." + .format(n_features, tuples.shape, ensure_min_features, + context)) # then we should also check that t is OK check_t(tuples, t, context) else: expected_shape = 2 if preprocessor else 3 - raise ValueError("{} expected {}D array. Found {}D array " + raise ValueError("{}D array expected{}. Found {}D array " "instead:\ninput={}.\n" .format(context, expected_shape, tuples.ndim, tuples)) return tuples From 758d4cccf27934de8cef11becfdb782090026d8f Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Fri, 24 Aug 2018 13:45:06 +0200 Subject: [PATCH 027/120] FIX: only allow 2D if preprocessor and 3D if not preprocessor --- metric_learn/_util.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/metric_learn/_util.py b/metric_learn/_util.py index 43db125d..25b51723 100644 --- a/metric_learn/_util.py +++ b/metric_learn/_util.py @@ -95,9 +95,10 @@ def check_tuples(tuples, preprocessor=False, t=None, dtype="auto", estimator=name, warn_on_dtype=warn_on_dtype) - if tuples.ndim == 2: # in this case there is left to check if t is OK + if tuples.ndim == 2 and preprocessor: # in this case there is left to check + # if t is OK check_t(tuples, t, context) - elif tuples.ndim == 3: + elif tuples.ndim == 3 and not preprocessor: # if the dimension is 3 we still have to check that the num_features is OK if ensure_min_features > 0: n_features = tuples.shape[2] From deb6d5d3ba4f7c71ee7673af2ff97fbf82a3903b Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Fri, 24 Aug 2018 16:41:20 +0200 Subject: [PATCH 028/120] FIX: put format arguments in the right order --- metric_learn/_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metric_learn/_util.py b/metric_learn/_util.py index 25b51723..ee59e986 100644 --- a/metric_learn/_util.py +++ b/metric_learn/_util.py @@ -113,7 +113,7 @@ 
def check_tuples(tuples, preprocessor=False, t=None, dtype="auto", expected_shape = 2 if preprocessor else 3 raise ValueError("{}D array expected{}. Found {}D array " "instead:\ninput={}.\n" - .format(context, expected_shape, tuples.ndim, tuples)) + .format(expected_shape, context, tuples.ndim, tuples)) return tuples From 01ee0816d805ae04fc24d376e57e6e7f484b4435 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Fri, 24 Aug 2018 16:44:40 +0200 Subject: [PATCH 029/120] MAINT: better to say the preprocessor than a preprocessor in messages --- metric_learn/_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metric_learn/_util.py b/metric_learn/_util.py index ee59e986..704c5bb7 100644 --- a/metric_learn/_util.py +++ b/metric_learn/_util.py @@ -131,7 +131,7 @@ def make_name(estimator, preprocessor): else: estimator_name = estimator.__class__.__name__ + with_preprocessor else: - estimator_name = None if not preprocessor else 'a preprocessor' + estimator_name = None if not preprocessor else 'the preprocessor' return estimator_name From 92f9651463a670a05e8401ea78fc65a091b6758d Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Fri, 24 Aug 2018 17:16:40 +0200 Subject: [PATCH 030/120] FIX: numeric should be default if NO preprocessor --- metric_learn/_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metric_learn/_util.py b/metric_learn/_util.py index 704c5bb7..2a1240cf 100644 --- a/metric_learn/_util.py +++ b/metric_learn/_util.py @@ -76,7 +76,7 @@ def check_tuples(tuples, preprocessor=False, t=None, dtype="auto", The validated tuples. 
""" if dtype == "auto": - dtype = 'numeric' if preprocessor else None + dtype = 'numeric' if not preprocessor else None name = make_name(estimator, preprocessor) context = ' by ' + name if name is not None else '' From 4c41c375ee87d847a1cc487a42ac3e49036c27a9 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 28 Aug 2018 14:33:33 +0200 Subject: [PATCH 031/120] FIX: preprocessor argument has to be a boolean (before this change was a callable or an array) --- metric_learn/itml.py | 4 ++-- metric_learn/lsml.py | 3 ++- metric_learn/mmc.py | 4 ++-- metric_learn/sdml.py | 4 ++-- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/metric_learn/itml.py b/metric_learn/itml.py index 8dd1eb37..4bbe9663 100644 --- a/metric_learn/itml.py +++ b/metric_learn/itml.py @@ -62,8 +62,8 @@ def _process_pairs(self, pairs, y, bounds): pairs, y = check_X_y(pairs, y, accept_sparse=False, ensure_2d=False, allow_nd=True) self.check_preprocessor() - pairs = check_tuples(pairs, preprocessor=self.preprocessor, t=2, - estimator=self) + pairs = check_tuples(pairs, preprocessor=self.preprocessor_ is not None, + t=2, estimator=self) # pairs classifiers and for quadruplets classifiers pairs = self.format_input(pairs) diff --git a/metric_learn/lsml.py b/metric_learn/lsml.py index bf0524ea..f16f06e2 100644 --- a/metric_learn/lsml.py +++ b/metric_learn/lsml.py @@ -48,7 +48,8 @@ def _prepare_quadruplets(self, quadruplets, weights): quadruplets = check_array(quadruplets, accept_sparse=False, ensure_2d=False, allow_nd=True) self.check_preprocessor() - quadruplets = check_tuples(quadruplets, preprocessor=self.preprocessor, + quadruplets = check_tuples(quadruplets, + preprocessor=self.preprocessor_ is not None, t=4, estimator=self) quadruplets = self.format_input(quadruplets) diff --git a/metric_learn/mmc.py b/metric_learn/mmc.py index 98ca91bc..2eda4cc8 100644 --- a/metric_learn/mmc.py +++ b/metric_learn/mmc.py @@ -75,8 +75,8 @@ def _process_pairs(self, pairs, y): # check_tuples_y in 
the future pairs, y = check_X_y(pairs, y, accept_sparse=False, ensure_2d=False, allow_nd=True) - pairs = check_tuples(pairs, preprocessor=self.preprocessor, t=2, - estimator=self) + pairs = check_tuples(pairs, preprocessor=self.preprocessor_ is not None, + t=2, estimator=self) pairs = self.format_input(pairs) # check to make sure that no two constrained vectors are identical diff --git a/metric_learn/sdml.py b/metric_learn/sdml.py index 8fe0cf06..a3e63558 100644 --- a/metric_learn/sdml.py +++ b/metric_learn/sdml.py @@ -54,8 +54,8 @@ def _prepare_pairs(self, pairs, y): pairs, y = check_X_y(pairs, y, accept_sparse=False, ensure_2d=False, allow_nd=True) self.check_preprocessor() - pairs = check_tuples(pairs, preprocessor=self.preprocessor, t=2, - estimator=self) + pairs = check_tuples(pairs, preprocessor=self.preprocessor_ is not None, + t=2, estimator=self) pairs = self.format_input(pairs) # set up prior M From 4b7e89b1a2beaea53ed466b15dd739435ca68e62 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 28 Aug 2018 15:38:42 +0200 Subject: [PATCH 032/120] FIX: fix preprocessor argument passed in check_tuples in base_metric --- metric_learn/base_metric.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index 822c9e04..3aee3c50 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -109,7 +109,8 @@ def score_pairs(self, pairs): scores: `numpy.ndarray` of shape=(n_pairs,) The learned Mahalanobis distance for every pair. 
""" - pairs = check_tuples(pairs) + pairs = check_tuples(pairs, preprocessor=self.preprocessor_ is not None) + # TODO: add a check (and a test) to only be able to score if t is OK pairs = self.format_input(pairs) pairwise_diffs = self.transform(pairs[:, 1, :] - pairs[:, 0, :]) # (for MahalanobisMixin, the embedding is linear so we can just embed the @@ -186,11 +187,11 @@ def predict(self, pairs): y_predicted : `numpy.ndarray` of floats, shape=(n_constraints,) The predicted learned metric value between samples in every pair. """ - pairs = check_tuples(pairs) + pairs = check_tuples(pairs, preprocessor=self.preprocessor_ is not None) return self.score_pairs(pairs) def decision_function(self, pairs): - pairs = check_tuples(pairs) + pairs = check_tuples(pairs, preprocessor=self.preprocessor_ is not None) return self.predict(pairs) def score(self, pairs, y): @@ -218,7 +219,7 @@ def score(self, pairs, y): score : float The ``roc_auc`` score. """ - pairs = check_tuples(pairs) + pairs = check_tuples(pairs, preprocessor=self.preprocessor_ is not None) return roc_auc_score(y, self.decision_function(pairs)) @@ -243,7 +244,8 @@ def predict(self, quadruplets): prediction : `numpy.ndarray` of floats, shape=(n_constraints,) Predictions of the ordering of pairs, for each quadruplet. """ - quadruplets = check_tuples(quadruplets, preprocessor=self.preprocessor) + quadruplets = check_tuples(quadruplets, + preprocessor=self.preprocessor_ is not None) return np.sign(self.decision_function(quadruplets)) def decision_function(self, quadruplets): @@ -265,7 +267,8 @@ def decision_function(self, quadruplets): decision_function : `numpy.ndarray` of floats, shape=(n_constraints,) Metric differences. """ - quadruplets = check_tuples(quadruplets) + quadruplets = check_tuples(quadruplets, + preprocessor=self.preprocessor_ is not None) # we broadcast with ... 
because here we allow quadruplets to be # either a 3D array of points or 2D array of indices return (self.score_pairs(quadruplets[:, :2, ...]) - @@ -293,5 +296,6 @@ def score(self, quadruplets, y=None): score : float The quadruplets score. """ - quadruplets = check_tuples(quadruplets) + quadruplets = check_tuples(quadruplets, + preprocessor=self.preprocessor_ is not None) return - np.mean(self.predict(quadruplets)) From 609b80ea9109f265dfaa0fa7b48657ab68410eff Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Fri, 31 Aug 2018 14:20:45 +0200 Subject: [PATCH 033/120] MAINT: say a preprocessor rather than the preprocessor --- metric_learn/_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metric_learn/_util.py b/metric_learn/_util.py index 2a1240cf..034a03e1 100644 --- a/metric_learn/_util.py +++ b/metric_learn/_util.py @@ -131,7 +131,7 @@ def make_name(estimator, preprocessor): else: estimator_name = estimator.__class__.__name__ + with_preprocessor else: - estimator_name = None if not preprocessor else 'the preprocessor' + estimator_name = None if not preprocessor else 'a preprocessor' return estimator_name From 4342660b13512b1bbc10c871d05cf2778239f89b Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 4 Sep 2018 12:03:40 +0200 Subject: [PATCH 034/120] DOC: fix docstring of t in check_tuples --- metric_learn/_util.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/metric_learn/_util.py b/metric_learn/_util.py index 034a03e1..5a34e47d 100644 --- a/metric_learn/_util.py +++ b/metric_learn/_util.py @@ -25,8 +25,9 @@ def check_tuples(tuples, preprocessor=False, t=None, dtype="auto", tuples : object The tuples to check. - t : int - The number of elements in a tuple (e.g. 2 for pairs). + t : int or None (default=None) + The number of elements to ensure there is in every tuple (e.g. 2 for + pairs). If None, the number of tuples is not checked. 
dtype : string, type, list of types or None (default="auto") Data type of result. If None, the dtype of the input is preserved. From e50cbaec8581f8771b92ba4c3c3b1b81cc27979f Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Thu, 6 Sep 2018 11:34:51 +0200 Subject: [PATCH 035/120] MAINT: make error messages better by only printing presence of preprocessor if the error is because not 2D or 3D shape --- metric_learn/_util.py | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/metric_learn/_util.py b/metric_learn/_util.py index 5a34e47d..017c6592 100644 --- a/metric_learn/_util.py +++ b/metric_learn/_util.py @@ -79,8 +79,7 @@ def check_tuples(tuples, preprocessor=False, t=None, dtype="auto", if dtype == "auto": dtype = 'numeric' if not preprocessor else None - name = make_name(estimator, preprocessor) - context = ' by ' + name if name is not None else '' + context = make_context(estimator) tuples = check_array(tuples, dtype=dtype, accept_sparse=False, copy=copy, force_all_finite=force_all_finite, order=order, @@ -93,7 +92,7 @@ def check_tuples(tuples, preprocessor=False, t=None, dtype="auto", # if 2D and preprocessor, no notion of # "features". If 3D and no preprocessor, min_features # is checked below - estimator=name, + estimator=estimator, warn_on_dtype=warn_on_dtype) if tuples.ndim == 2 and preprocessor: # in this case there is left to check @@ -112,28 +111,32 @@ def check_tuples(tuples, preprocessor=False, t=None, dtype="auto", check_t(tuples, t, context) else: expected_shape = 2 if preprocessor else 3 - raise ValueError("{}D array expected{}. Found {}D array " + with_prep = (' when using {} preprocessor'.format('a' if preprocessor + else 'no')) + raise ValueError("{}D array expected{}{}. 
Found {}D array " "instead:\ninput={}.\n" - .format(expected_shape, context, tuples.ndim, tuples)) + .format(expected_shape, context, with_prep, + tuples.ndim, tuples)) return tuples -def make_name(estimator, preprocessor): - """Helper function to create a string with the estimator name and tell if - it is using a preprocessor. Will return the following for instance: - NCA + preprocessor: 'NCA's preprocessor' - NCA + no preprocessor: 'NCA' - None + preprocessor: 'a preprocessor' - None + None: None""" +def make_context(estimator): + """Helper function to create a string with the estimator name. + Taken from check_array function in scikit-learn. + Will return the following for instance: + NCA: ' by NCA' + 'NCA': ' by NCA' + None: '' + """ if estimator is not None: - with_preprocessor = "'s preprocessor" if preprocessor else '' - if isinstance(estimator, six.string_types): - estimator_name = estimator + with_preprocessor - else: - estimator_name = estimator.__class__.__name__ + with_preprocessor + if isinstance(estimator, six.string_types): + estimator_name = estimator + else: + estimator_name = estimator.__class__.__name__ else: - estimator_name = None if not preprocessor else 'a preprocessor' - return estimator_name + estimator_name = None + context = ' by ' + estimator_name if estimator_name is not None else '' + return context def check_t(tuples, t, context): From 56838b48427909695b6d343701a7cf7ffc380e8c Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Thu, 6 Sep 2018 11:37:29 +0200 Subject: [PATCH 036/120] TST: Add tests for check_tuples --- test/test_utils.py | 270 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 248 insertions(+), 22 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 8ca3aac3..1554cdd2 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1,29 +1,255 @@ -import numpy as np import pytest -from metric_learn._util import check_tuples +import numpy as np +from sklearn.exceptions import 
DataConversionWarning +from metric_learn import NCA +from metric_learn._util import check_tuples, make_context -def test_check_tuples(): - X = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]]) - check_tuples(X) +@pytest.fixture() +def X_prep(): + """Basic array for testing when using a preprocessor""" + X = np.array([[1, 2], + [2, 3]]) + return X - X = 5 - msg = ("Expected 3D array, got scalar instead. Cannot apply this function " - "on scalars.") - with pytest.raises(ValueError, message=msg): - check_tuples(X) - X = np.array([1, 2, 3]) - msg = ("Expected 3D array, got 1D array instead:\ntuples=[1, 2, 3].\n" - "Reshape your data using tuples.reshape(1, -1, 1) if it contains a " - "single tuple and the points in the tuple have a single feature.") - with pytest.raises(ValueError, message=msg): - check_tuples(X) +@pytest.fixture() +def X_no_prep(): + """Basic array for testing when using no preprocessor""" + X = np.array([[[1., 2.3], [2.3, 5.3]], + [[2.3, 4.3], [0.2, 0.4]]]) + return X + + +@pytest.mark.parametrize('estimator, expected', [(NCA(), " by NCA"), + ('NCA', " by NCA"), + (None, "")]) +def test_make_context(estimator, expected): + """test the make_name function""" + assert make_context(estimator) == expected + + +@pytest.mark.parametrize('estimator, preprocessor, by_context, load_X', + [(NCA(), True, " by NCA", X_prep), + ('NCA', True, " by NCA", X_prep), + (NCA(), False, " by NCA", X_no_prep), + ('NCA', False, " by NCA", X_no_prep), + (None, True, "", X_prep), + (None, False, "", X_no_prep)]) +def test_check_tuples_name_name_in_messages(estimator, preprocessor, + by_context, load_X): + """Checks that exceptions/warnings include the name of the estimator""" + + X = load_X() + + # if t is different than expected + msg = ("Tuples of 1 element(s) expected{}. 
Got tuples of 2 element(s) " + "instead (shape={}):\ninput={}.\n").format(by_context, X.shape, X) + with pytest.raises(ValueError) as raised_error: + check_tuples(X, t=1, preprocessor=preprocessor, estimator=estimator) + assert str(raised_error.value) == msg + + # if shape not 2D or 3D + four_d_array = [[[[3]]]] + msg = ("{}D array expected{}{}. Found 4D array instead:\ninput=[[[[3]]]].\n" + .format(2 if preprocessor else 3, by_context, + ' when using {} preprocessor' + .format('a' if preprocessor else 'no'))) + with pytest.raises(ValueError) as raised_error: + check_tuples(four_d_array, preprocessor=preprocessor, estimator=estimator) + assert str(raised_error.value) == msg + + # if n_features too small + if not preprocessor: # n_features is checked only if using no preprocessor + msg = ("Found array with 2 feature(s) (shape=(2, 2, 2)) while" + " a minimum of 3 is required{}.").format(by_context) + with pytest.raises(ValueError) as raised_error: + check_tuples(X, preprocessor=preprocessor, estimator=estimator, + ensure_min_features=3) + assert str(raised_error.value) == msg + + # if n_samples too small + msg = ("Found array with 2 sample(s) (shape={}) while" + " a minimum of 3 is required{}.").format(X.shape, by_context) + with pytest.raises(ValueError) as raised_error: + check_tuples(X, preprocessor=preprocessor, estimator=estimator, + ensure_min_samples=3) + assert str(raised_error.value) == msg + + # if dtype different than required but convertible, and warn_on_dtype == True + X_object = X.astype(object) + msg = ("Data with input dtype object was converted to float64{}." 
+ .format(by_context)) + with pytest.warns(DataConversionWarning) as raised_warning: + check_tuples(X_object, preprocessor=preprocessor, estimator=estimator, + dtype=float, warn_on_dtype=True) + assert str(raised_warning[0].message) == msg + + +@pytest.mark.parametrize('load_X, preprocessor', + [(X_prep, True), (X_no_prep, False)]) +def test_check_tuples_invalid_t(load_X, preprocessor): + """Checks that the exception are raised if t is not the one expected""" + X = load_X() + expected_msg = ("Tuples of 3 element(s) expected. Got tuples of 2 " + "element(s) instead (shape={}):\ninput={}.\n" + .format(X.shape, X)) + with pytest.raises(ValueError) as raised_error: + check_tuples(X, t=3, preprocessor=preprocessor) + assert str(raised_error.value) == expected_msg + + +@pytest.mark.parametrize('X, found, expected, preprocessor', + [(5, '0', '2', True), + (5, '0', '3', False), + ([1, 2], '1', '2', True), + ([1, 2], '1', '3', False), + ([[[[5]]]], '4', '2', True), + ([[[[5]]]], '4', '3', False), + ([[1], [3]], '2', '3', False), + ([[[1], [3]]], '3', '2', True)]) +def test_check_tuples_invalid_shape(X, found, expected, preprocessor): + """Checks that a value error with the appropriate message is raised if + shape is invalid (not 2D with preprocessor or 3D with no preprocessor) + """ + X = np.array(X) + msg = ("{}D array expected when using {} preprocessor. 
Found {}D array " + "instead:\ninput={}.\n" + .format(expected, 'a' if preprocessor else 'no', found, X)) + with pytest.raises(ValueError) as raised_error: + check_tuples(X, preprocessor=preprocessor, ensure_min_samples=0) + assert str(raised_error.value) == msg + + +def test_check_tuples_invalid_n_features(X_no_prep): + """Checks that the right warning is printed if not enough features + Here we only test if no preprocessor (otherwise we don't ensure this) + """ + msg = ("Found array with 2 feature(s) (shape=(2, 2, 2)) while" + " a minimum of 3 is required.") + with pytest.raises(ValueError) as raised_error: + check_tuples(X_no_prep, preprocessor=False, ensure_min_features=3) + assert str(raised_error.value) == msg + + +@pytest.mark.parametrize('load_X, preprocessor', + [(X_prep, True), (X_no_prep, False)]) +def test_check_tuples_invalid_n_samples(load_X, preprocessor): + """Checks that the right warning is printed if n_samples is too small""" + X = load_X() + msg = ("Found array with 2 sample(s) (shape={}) while a minimum of 3 " + "is required.".format(X.shape)) + with pytest.raises(ValueError) as raised_error: + check_tuples(X, preprocessor=preprocessor, ensure_min_samples=3) + assert str(raised_error.value) == msg + + +@pytest.mark.parametrize('load_X, preprocessor', + [(X_prep, True), (X_no_prep, False)]) +def test_check_tuples_invalid_dtype_convertible(load_X, preprocessor): + """Checks that a warning is raised if a convertible input is converted to + float""" + X = load_X().astype(object) + msg = ("Data with input dtype object was converted to float64.") + with pytest.warns(DataConversionWarning) as raised_warning: + check_tuples(X, preprocessor=preprocessor, dtype=np.float64, + warn_on_dtype=True) + assert str(raised_warning[0].message) == msg + + +@pytest.mark.parametrize('preprocessor, X', + [(True, np.array([['a', 'b'], + ['e', 'b']])), + (False, np.array([[['b', 'v'], ['a', 'd']], + [['x', 'u'], ['c', 'a']]]))]) +def 
test_check_tuples_invalid_dtype_not_convertible(preprocessor, X): + """Checks that a value error is thrown if attempting to convert an + input not convertible to float + """ + with pytest.raises(ValueError): + check_tuples(X, preprocessor=preprocessor, dtype=np.float64) + + +@pytest.mark.parametrize('t', [2, None]) +def test_check_tuples_valid_t(t, X_prep, X_no_prep): + """For inputs that have the right matrix dimension (2D or 3D for instance), + checks that checking the number of tuples (pairs, quadruplets, etc) raises + no warning + """ + with pytest.warns(None) as record: + check_tuples(X_prep, preprocessor=True, t=t) + check_tuples(X_no_prep, preprocessor=False, t=t) + assert len(record) == 0 + + +@pytest.mark.parametrize('X', + [np.array([[2.5, 0.1, 2.6], + [1.6, 4.8, 9.1]]), + np.array([[2, 0, 2], + [1, 4, 9]]), + np.array([["img1.png", "img3.png"], + ["img2.png", "img4.png"]]), + [[2, 0, 2], + [1, 4, 9]], + [np.array([2, 0, 2]), + np.array([1, 4, 9])], + ((2, 0, 2), + (1, 4, 9)) + ]) +def test_check_tuples_valid_with_preprocessor(X): + """Test that valid inputs when using a preprocessor raises no warning""" + with pytest.warns(None) as record: + check_tuples(X, preprocessor=True) + assert len(record) == 0 + + +@pytest.mark.parametrize('X', + [np.array([[[2.5], [0.1], [2.6]], + [[1.6], [4.8], [9.1]], + [[5.6], [2.8], [6.1]]]), + np.array([[[2], [0], [2]], + [[1], [4], [9]], + [[1], [5], [3]]]), + [[[2], [0], [2]], + [[1], [4], [9]], + [[3], [4], [29]]], + (((2, 1), (0, 2), (2, 3)), + ((1, 2), (4, 4), (9, 3)), + ((3, 1), (4, 4), (29, 4)))]) +def test_check_tuples_valid_without_preprocessor(X): + """Test that valid inputs when using no preprocessor raises no warning""" + with pytest.warns(None) as record: + check_tuples(X, preprocessor=False) + assert len(record) == 0 + + +def test_check_tuples_behaviour_auto_dtype(X_no_prep): + """Checks that check_tuples allows by default every type if using a + preprocessor, and numeric types if using no preprocessor""" + 
X_prep = [['img1.png', 'img2.png'], ['img3.png', 'img5.png']] + with pytest.warns(None) as record: + check_tuples(X_prep, preprocessor=True) + assert len(record) == 0 + + with pytest.warns(None) as record: + check_tuples(X_no_prep) # numeric type + assert len(record) == 0 + + # not numeric type + X_no_prep = np.array([[['img1.png'], ['img2.png']], + [['img3.png'], ['img5.png']]]) + X_no_prep = X_no_prep.astype(object) + with pytest.raises(ValueError): + check_tuples(X_no_prep) + - X = np.array([[1, 2, 3], [2, 3, 5]]) - msg = ("Expected 3D array, got 2D array instead:\ntuples=[[1, 2, 3], " - "[2, 3, 5]].\nReshape your data either using " - "tuples.reshape(-1, 3, 1) if your data has a single feature or " - "tuples.reshape(1, 2, -1) if it contains a single tuple.") - with pytest.raises(ValueError, message=msg): +def test_check_tuples_invalid_complex_data(): + """Checks that the right error message is thrown if given complex data ( + this comes from sklearn's check_array's message)""" + X = np.array([[[1 + 2j, 3 + 4j], [5 + 7j, 5 + 7j]], + [[1 + 3j, 2 + 4j], [5 + 8j, 1 + 7j]]]) + msg = ("Complex data not supported\n" + "{}\n".format(X)) + with pytest.raises(ValueError) as raised_error: check_tuples(X) + assert str(raised_error.value) == msg From 9f05c245c7bbf57d0330dab796f086089fd63340 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Thu, 6 Sep 2018 12:08:57 +0200 Subject: [PATCH 037/120] TST: simplify tests by removing the test for messages with the estimator name, but incorporating it in all other tests through parametrization --- test/test_utils.py | 115 +++++++++++++++------------------------------ 1 file changed, 37 insertions(+), 78 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 1554cdd2..bac0c837 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -21,84 +21,30 @@ def X_no_prep(): return X -@pytest.mark.parametrize('estimator, expected', [(NCA(), " by NCA"), - ('NCA', " by NCA"), - (None, "")]) 
+@pytest.mark.parametrize('estimator, expected', + [(NCA(), " by NCA"), ('NCA', " by NCA"), (None, "")]) def test_make_context(estimator, expected): """test the make_name function""" assert make_context(estimator) == expected -@pytest.mark.parametrize('estimator, preprocessor, by_context, load_X', - [(NCA(), True, " by NCA", X_prep), - ('NCA', True, " by NCA", X_prep), - (NCA(), False, " by NCA", X_no_prep), - ('NCA', False, " by NCA", X_no_prep), - (None, True, "", X_prep), - (None, False, "", X_no_prep)]) -def test_check_tuples_name_name_in_messages(estimator, preprocessor, - by_context, load_X): - """Checks that exceptions/warnings include the name of the estimator""" - - X = load_X() - - # if t is different than expected - msg = ("Tuples of 1 element(s) expected{}. Got tuples of 2 element(s) " - "instead (shape={}):\ninput={}.\n").format(by_context, X.shape, X) - with pytest.raises(ValueError) as raised_error: - check_tuples(X, t=1, preprocessor=preprocessor, estimator=estimator) - assert str(raised_error.value) == msg - - # if shape not 2D or 3D - four_d_array = [[[[3]]]] - msg = ("{}D array expected{}{}. 
Found 4D array instead:\ninput=[[[[3]]]].\n" - .format(2 if preprocessor else 3, by_context, - ' when using {} preprocessor' - .format('a' if preprocessor else 'no'))) - with pytest.raises(ValueError) as raised_error: - check_tuples(four_d_array, preprocessor=preprocessor, estimator=estimator) - assert str(raised_error.value) == msg - - # if n_features too small - if not preprocessor: # n_features is checked only if using no preprocessor - msg = ("Found array with 2 feature(s) (shape=(2, 2, 2)) while" - " a minimum of 3 is required{}.").format(by_context) - with pytest.raises(ValueError) as raised_error: - check_tuples(X, preprocessor=preprocessor, estimator=estimator, - ensure_min_features=3) - assert str(raised_error.value) == msg - - # if n_samples too small - msg = ("Found array with 2 sample(s) (shape={}) while" - " a minimum of 3 is required{}.").format(X.shape, by_context) - with pytest.raises(ValueError) as raised_error: - check_tuples(X, preprocessor=preprocessor, estimator=estimator, - ensure_min_samples=3) - assert str(raised_error.value) == msg - - # if dtype different than required but convertible, and warn_on_dtype == True - X_object = X.astype(object) - msg = ("Data with input dtype object was converted to float64{}." - .format(by_context)) - with pytest.warns(DataConversionWarning) as raised_warning: - check_tuples(X_object, preprocessor=preprocessor, estimator=estimator, - dtype=float, warn_on_dtype=True) - assert str(raised_warning[0].message) == msg - - +@pytest.mark.parametrize('estimator, context', + [(NCA(), " by NCA"), ('NCA', " by NCA"), (None, "")]) @pytest.mark.parametrize('load_X, preprocessor', [(X_prep, True), (X_no_prep, False)]) -def test_check_tuples_invalid_t(load_X, preprocessor): +def test_check_tuples_invalid_t(estimator, context, load_X, preprocessor): """Checks that the exception are raised if t is not the one expected""" X = load_X() - expected_msg = ("Tuples of 3 element(s) expected. 
Got tuples of 2 " + expected_msg = ("Tuples of 3 element(s) expected{}. Got tuples of 2 " "element(s) instead (shape={}):\ninput={}.\n" - .format(X.shape, X)) + .format(context, X.shape, X)) with pytest.raises(ValueError) as raised_error: - check_tuples(X, t=3, preprocessor=preprocessor) + check_tuples(X, t=3, preprocessor=preprocessor, estimator=estimator) assert str(raised_error.value) == expected_msg +@pytest.mark.parametrize('estimator, context', + [(NCA(), " by NCA"), ('NCA', " by NCA"), (None, "")]) @pytest.mark.parametrize('X, found, expected, preprocessor', [(5, '0', '2', True), (5, '0', '3', False), @@ -108,52 +54,65 @@ def test_check_tuples_invalid_t(load_X, preprocessor): ([[[[5]]]], '4', '3', False), ([[1], [3]], '2', '3', False), ([[[1], [3]]], '3', '2', True)]) -def test_check_tuples_invalid_shape(X, found, expected, preprocessor): +def test_check_tuples_invalid_shape(estimator, context, X, found, expected, + preprocessor): """Checks that a value error with the appropriate message is raised if shape is invalid (not 2D with preprocessor or 3D with no preprocessor) """ X = np.array(X) - msg = ("{}D array expected when using {} preprocessor. Found {}D array " + msg = ("{}D array expected{} when using {} preprocessor. 
Found {}D array " "instead:\ninput={}.\n" - .format(expected, 'a' if preprocessor else 'no', found, X)) + .format(expected, context, 'a' if preprocessor else 'no', found, X)) with pytest.raises(ValueError) as raised_error: - check_tuples(X, preprocessor=preprocessor, ensure_min_samples=0) + check_tuples(X, preprocessor=preprocessor, ensure_min_samples=0, + estimator=estimator) assert str(raised_error.value) == msg -def test_check_tuples_invalid_n_features(X_no_prep): +@pytest.mark.parametrize('estimator, context', + [(NCA(), " by NCA"), ('NCA', " by NCA"), (None, "")]) +def test_check_tuples_invalid_n_features(estimator, context, X_no_prep): """Checks that the right warning is printed if not enough features Here we only test if no preprocessor (otherwise we don't ensure this) """ - msg = ("Found array with 2 feature(s) (shape=(2, 2, 2)) while" - " a minimum of 3 is required.") + msg = ("Found array with 2 feature(s) (shape={}) while" + " a minimum of 3 is required{}.".format(X_no_prep.shape, context)) with pytest.raises(ValueError) as raised_error: - check_tuples(X_no_prep, preprocessor=False, ensure_min_features=3) + check_tuples(X_no_prep, preprocessor=False, ensure_min_features=3, + estimator=estimator) assert str(raised_error.value) == msg +@pytest.mark.parametrize('estimator, context', + [(NCA(), " by NCA"), ('NCA', " by NCA"), (None, "")]) @pytest.mark.parametrize('load_X, preprocessor', [(X_prep, True), (X_no_prep, False)]) -def test_check_tuples_invalid_n_samples(load_X, preprocessor): +def test_check_tuples_invalid_n_samples(estimator, context, load_X, + preprocessor): """Checks that the right warning is printed if n_samples is too small""" X = load_X() msg = ("Found array with 2 sample(s) (shape={}) while a minimum of 3 " - "is required.".format(X.shape)) + "is required{}.".format(X.shape, context)) with pytest.raises(ValueError) as raised_error: - check_tuples(X, preprocessor=preprocessor, ensure_min_samples=3) + check_tuples(X, 
preprocessor=preprocessor, ensure_min_samples=3, + estimator=estimator) assert str(raised_error.value) == msg +@pytest.mark.parametrize('estimator, context', + [(NCA(), " by NCA"), ('NCA', " by NCA"), (None, "")]) @pytest.mark.parametrize('load_X, preprocessor', [(X_prep, True), (X_no_prep, False)]) -def test_check_tuples_invalid_dtype_convertible(load_X, preprocessor): +def test_check_tuples_invalid_dtype_convertible(estimator, context, load_X, + preprocessor): """Checks that a warning is raised if a convertible input is converted to float""" X = load_X().astype(object) - msg = ("Data with input dtype object was converted to float64.") + msg = ("Data with input dtype object was converted to float64{}." + .format(context)) with pytest.warns(DataConversionWarning) as raised_warning: check_tuples(X, preprocessor=preprocessor, dtype=np.float64, - warn_on_dtype=True) + warn_on_dtype=True, estimator=estimator) assert str(raised_warning[0].message) == msg From 12ce8ac86a173269e510fb2e7c31caba00e4d6b5 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Thu, 6 Sep 2018 15:34:57 +0200 Subject: [PATCH 038/120] STY: remove unnecessary parenthesis --- test/test_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index bac0c837..a5fb4644 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -5,7 +5,7 @@ from metric_learn._util import check_tuples, make_context -@pytest.fixture() +@pytest.fixture def X_prep(): """Basic array for testing when using a preprocessor""" X = np.array([[1, 2], @@ -13,7 +13,7 @@ def X_prep(): return X -@pytest.fixture() +@pytest.fixture def X_no_prep(): """Basic array for testing when using no preprocessor""" X = np.array([[[1., 2.3], [2.3, 5.3]], From 62a989cd98dab90c7f5966e7f30713036b8ea1e3 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Thu, 6 Sep 2018 16:23:34 +0200 Subject: [PATCH 039/120] FIX: put back else statement that probably was wrongfully merged --- 
metric_learn/base_metric.py | 1 + 1 file changed, 1 insertion(+) diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index c9f4129d..84fe919a 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -186,6 +186,7 @@ def transformer_from_metric(self, metric): return np.sqrt(metric) elif not np.isclose(np.linalg.det(metric), 0): return cholesky(metric).T + else: w, V = np.linalg.eigh(metric) return V.T * np.sqrt(np.maximum(0, w[:, None])) From 33c5d8b5b406a9506bc1daae6060da39ec611986 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Thu, 6 Sep 2018 16:46:22 +0200 Subject: [PATCH 040/120] TST: add tests for weakly supervised estimators and preprocessor that fetches indices --- test/test_weakly_supervised.py | 108 ++++++++++++++++++++------------- 1 file changed, 67 insertions(+), 41 deletions(-) diff --git a/test/test_weakly_supervised.py b/test/test_weakly_supervised.py index 8cae4bfc..185918c3 100644 --- a/test/test_weakly_supervised.py +++ b/test/test_weakly_supervised.py @@ -26,35 +26,47 @@ def build_data(): return X, pairs -def build_pairs(): +def build_pairs(preprocessor): # test that you can do cross validation on tuples of points with # a WeaklySupervisedMetricLearner X, pairs = build_data() - pairs, y = wrap_pairs(X, pairs) + if preprocessor is not None: + # if preprocessor, we build a 2D array of pairs of indices + _, y = wrap_pairs(X, pairs) + pairs = np.vstack([np.column_stack(pairs[:2]), np.column_stack(pairs[2:])]) + else: + # if not, we build a 3D array of pairs of samples + pairs, y = wrap_pairs(X, pairs) pairs, y = shuffle(pairs, y, random_state=RNG) (pairs_train, pairs_test, y_train, y_test) = train_test_split(pairs, y, random_state=RNG) - return (pairs, y, pairs_train, pairs_test, - y_train, y_test) + return (X, pairs, y, pairs_train, pairs_test, + y_train, y_test, preprocessor) -def build_quadruplets(): +def build_quadruplets(preprocessor): # test that you can do cross validation on a tuples of points 
with # a WeaklySupervisedMetricLearner X, pairs = build_data() c = np.column_stack(pairs) - quadruplets = X[c] + if preprocessor is not None: + # if preprocessor, we build a 2D array of quadruplets of indices + quadruplets = c + else: + # if not, we build a 3D array of quadruplets of samples + quadruplets = X[c] quadruplets = shuffle(quadruplets, random_state=RNG) y = y_train = y_test = None quadruplets_train, quadruplets_test = train_test_split(quadruplets, random_state=RNG) - return (quadruplets, y, quadruplets_train, quadruplets_test, - y_train, y_test) + return (X, quadruplets, y, quadruplets_train, quadruplets_test, + y_train, y_test, preprocessor) list_estimators = [(ITML(), build_pairs), (LSML(), build_quadruplets), - (MMC(), build_pairs), + (MMC(max_iter=2), build_pairs), # max_iter=2 for faster + # testing (SDML(), build_pairs) ] @@ -65,12 +77,14 @@ def build_quadruplets(): ] +@pytest.mark.parametrize('preprocessor', [None, build_data()[0]]) @pytest.mark.parametrize('estimator, build_dataset', list_estimators, ids=ids_estimators) -def test_cross_validation(estimator, build_dataset): - (tuples, y, tuples_train, tuples_test, - y_train, y_test) = build_dataset() +def test_cross_validation(estimator, build_dataset, preprocessor): + (X, tuples, y, tuples_train, tuples_test, + y_train, y_test, preprocessor) = build_dataset(preprocessor) estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) set_random_state(estimator) assert np.isfinite(cross_val_score(estimator, tuples, y)).all() @@ -86,12 +100,14 @@ def check_predict(estimator, tuples): assert len(y_predicted), len(tuples) +@pytest.mark.parametrize('preprocessor', [None, build_data()[0]]) @pytest.mark.parametrize('estimator, build_dataset', list_estimators, ids=ids_estimators) -def test_simple_estimator(estimator, build_dataset): - (tuples, y, tuples_train, tuples_test, - y_train, y_test) = build_dataset() +def test_simple_estimator(estimator, build_dataset, preprocessor): + (X, tuples, 
y, tuples_train, tuples_test, + y_train, y_test, preprocessor) = build_dataset(preprocessor) estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) set_random_state(estimator) estimator.fit(tuples_train, y_train) @@ -101,9 +117,11 @@ def test_simple_estimator(estimator, build_dataset): @pytest.mark.parametrize('estimator', [est[0] for est in list_estimators], ids=ids_estimators) -def test_no_attributes_set_in_init(estimator): - """Check setting during init. Taken from scikit-learn.""" +@pytest.mark.parametrize('preprocessor', [None, build_data()[0]]) +def test_no_attributes_set_in_init(estimator, preprocessor): + """Check setting during init. Adapted from scikit-learn.""" estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) if hasattr(type(estimator).__init__, "deprecated_original"): return @@ -129,25 +147,29 @@ def test_no_attributes_set_in_init(estimator): "attributes %s." % (type(estimator).__name__, sorted(invalid_attr))) +@pytest.mark.parametrize('preprocessor', [None, build_data()[0]]) @pytest.mark.parametrize('estimator, build_dataset', list_estimators, ids=ids_estimators) -def test_estimators_fit_returns_self(estimator, build_dataset): +def test_estimators_fit_returns_self(estimator, build_dataset, preprocessor): """Check if self is returned when calling fit""" - # From scikit-learn - (tuples, y, tuples_train, tuples_test, - y_train, y_test) = build_dataset() + # Adapted from scikit-learn + (X, tuples, y, tuples_train, tuples_test, + y_train, y_test, preprocessor) = build_dataset(preprocessor) estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) assert estimator.fit(tuples, y) is estimator +@pytest.mark.parametrize('preprocessor', [None, build_data()[0]]) @pytest.mark.parametrize('estimator, build_dataset', list_estimators, ids=ids_estimators) -def test_pipeline_consistency(estimator, build_dataset): - # From scikit learn +def test_pipeline_consistency(estimator, build_dataset, 
preprocessor): + # Adapted from scikit learn # check that make_pipeline(est) gives same score as est - (tuples, y, tuples_train, tuples_test, - y_train, y_test) = build_dataset() + (X, tuples, y, tuples_train, tuples_test, + y_train, y_test, preprocessor) = build_dataset(preprocessor) estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) pipeline = make_pipeline(estimator) estimator.fit(tuples, y) pipeline.fit(tuples, y) @@ -163,13 +185,15 @@ def test_pipeline_consistency(estimator, build_dataset): assert_allclose_dense_sparse(result, result_pipe) +@pytest.mark.parametrize('preprocessor', [None, build_data()[0]]) @pytest.mark.parametrize('estimator, build_dataset', list_estimators, ids=ids_estimators) -def test_dict_unchanged(estimator, build_dataset): - # From scikit-learn - (tuples, y, tuples_train, tuples_test, - y_train, y_test) = build_dataset() +def test_dict_unchanged(estimator, build_dataset, preprocessor): + # Adapted from scikit-learn + (X, tuples, y, tuples_train, tuples_test, + y_train, y_test, preprocessor) = build_dataset(preprocessor) estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) if hasattr(estimator, "num_dims"): estimator.num_dims = 1 estimator.fit(tuples, y) @@ -180,24 +204,26 @@ def test_dict_unchanged(estimator, build_dataset): assert estimator.__dict__ == dict_before, \ ("Estimator changes __dict__ during %s" % method) - for method in ["transform"]: - if hasattr(estimator, method): - dict_before = estimator.__dict__.copy() - # we transform only 2D arrays (dataset of points) - getattr(estimator, method)(tuples[:, 0, :]) - assert estimator.__dict__ == dict_before, \ - ("Estimator changes __dict__ during %s" - % method) + for method in ["transform"]: + if hasattr(estimator, method): + dict_before = estimator.__dict__.copy() + # we transform only 2D arrays (dataset of points) + getattr(estimator, method)(X) + assert estimator.__dict__ == dict_before, \ + ("Estimator changes __dict__ during 
%s" + % method) +@pytest.mark.parametrize('preprocessor', [None, build_data()[0]]) @pytest.mark.parametrize('estimator, build_dataset', list_estimators, ids=ids_estimators) -def test_dont_overwrite_parameters(estimator, build_dataset): - # From scikit-learn +def test_dont_overwrite_parameters(estimator, build_dataset, preprocessor): + # Adapted from scikit-learn # check that fit method only changes or sets private attributes - (tuples, y, tuples_train, tuples_test, - y_train, y_test) = build_dataset() + (X, tuples, y, tuples_train, tuples_test, + y_train, y_test, preprocessor) = build_dataset(preprocessor) estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) if hasattr(estimator, "num_dims"): estimator.num_dims = 1 dict_before_fit = estimator.__dict__.copy() From 5eba5fa3d5518d00049419c9a36679f560bb45e9 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Fri, 7 Sep 2018 10:13:47 +0200 Subject: [PATCH 041/120] TST: add tests for preprocessor --- test/test_weakly_supervised.py | 70 ++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/test/test_weakly_supervised.py b/test/test_weakly_supervised.py index 185918c3..f9e7486a 100644 --- a/test/test_weakly_supervised.py +++ b/test/test_weakly_supervised.py @@ -15,6 +15,9 @@ RNG = check_random_state(0) +# ---------------------- Test scikit-learn compatibility ---------------------- + + def build_data(): dataset = load_iris() X, y = shuffle(dataset.data, dataset.target, random_state=RNG) @@ -277,3 +280,70 @@ def _get_args(function, varargs=False): return args, varargs else: return args + + +# ----------------------------- Test preprocessor ----------------------------- + + +X = np.array([[0.89, 0.11, 1.48, 0.12], + [2.63, 1.08, 1.68, 0.46], + [1.00, 0.59, 0.62, 1.15]]) + + +class MockFileLoader: + """Preprocessor that takes a root file path at construction and simulates + fetching the file in the specific root folder when given the name of the + file""" + + def 
__init__(self, root): + self.root = root + self.folders = {'fake_root': {'img0.png': X[0], + 'img1.png': X[1], + 'img2.png': X[2] + }, + 'other_folder': {} # empty folder + } + + def __call__(self, path_list): + images = list() + for path in path_list: + images.append(self.folders[self.root][path]) + return np.array(images) + + +def mock_id_loader(list_of_indicators): + """A preprocessor as a function that takes indicators (strings) and + returns the corresponding samples""" + points = [] + for indicator in list_of_indicators: + points.append(X[int(indicator[2:])]) + return np.array(points) + + +tuples_list = [np.array([[0, 1], + [2, 1]]), + + np.array([['img0.png', 'img1.png'], + ['img2.png', 'img1.png']]), + + np.array([['id0', 'id1'], + ['id2', 'id1']]) + ] + +preprocessors = [X, MockFileLoader('fake_root'), mock_id_loader] + + +@pytest.fixture +def y_tuples(): + y = [-1, 1] + return y + + +@pytest.mark.parametrize('preprocessor, tuples', zip(preprocessors, + tuples_list)) +def test_preprocessor(preprocessor, tuples, y_tuples): + """Tests different ways to use the preprocessor argument: an array, + a class callable, and a function callable + """ + nca = ITML(preprocessor=preprocessor) + nca.fit(tuples, y_tuples) From 42b34e059de641cf94eb893aa539e5802edb0a08 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Thu, 13 Sep 2018 11:31:39 +0200 Subject: [PATCH 042/120] FIX: remove redundant metric and transformer function, wrongly merged --- metric_learn/base_metric.py | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index 84fe919a..59648431 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -43,29 +43,6 @@ def check_preprocessor(self): else: self.preprocessor_ = self.preprocessor - def metric(self): - """Computes the Mahalanobis matrix from the transformation matrix. - - .. 
math:: M = L^{\\top} L - - Returns - ------- - M : (d x d) matrix - """ - L = self.transformer() - return L.T.dot(L) - - def transformer(self): - """Computes the transformation matrix from the Mahalanobis matrix. - - L = cholesky(M).T - - Returns - ------- - L : upper triangular (d x d) matrix - """ - return cholesky(self.metric()).T - def format_input(self, input): if self.preprocessor is not None: return np.apply_along_axis(self.preprocessor_, 1, input) From a380cd320e0cdc900e34f5cd05d75f00da794ce1 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Thu, 13 Sep 2018 11:33:24 +0200 Subject: [PATCH 043/120] MAINT: rename format_input into preprocess_tuples and input into tuples --- metric_learn/base_metric.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index 59648431..8c0ddc08 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -43,11 +43,12 @@ def check_preprocessor(self): else: self.preprocessor_ = self.preprocessor - def format_input(self, input): + def preprocess_tuples(self, tuples): if self.preprocessor is not None: - return np.apply_along_axis(self.preprocessor_, 1, input) + return np.apply_along_axis(self.preprocessor_, 1, tuples) else: - return input + return tuples + class MetricTransformer(six.with_metaclass(ABCMeta)): From 54d1710cee6a8828fdce535278734d6cdb3d8b47 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Thu, 13 Sep 2018 14:45:56 +0200 Subject: [PATCH 044/120] MAINT: fixes and enhancements - fix the format_input previous incomplete refactoring - mututalize check_tuples for Weakly Supervised Algorithms - fix test of string representations --- metric_learn/base_metric.py | 31 ++++++++++++++++++--------- metric_learn/covariance.py | 6 ++++-- metric_learn/itml.py | 6 +++--- metric_learn/lfda.py | 6 +++++- metric_learn/lmnn.py | 5 ++++- metric_learn/lsml.py | 7 ++++--- metric_learn/mlkr.py | 5 ++++- metric_learn/mmc.py | 7 
++++--- metric_learn/nca.py | 7 ++++++- metric_learn/rca.py | 10 ++++++--- metric_learn/sdml.py | 7 ++++--- test/test_base_metric.py | 42 ++++++++++++++++++++++--------------- 12 files changed, 91 insertions(+), 48 deletions(-) diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index 8c0ddc08..71df8898 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -113,7 +113,7 @@ def score_pairs(self, pairs): """ pairs = check_tuples(pairs, preprocessor=self.preprocessor_ is not None) # TODO: add a check (and a test) to only be able to score if t is OK - pairs = self.format_input(pairs) + pairs = self.preprocess_tuples(pairs) pairwise_diffs = self.transform(pairs[:, 1, :] - pairs[:, 0, :]) # (for MahalanobisMixin, the embedding is linear so we can just embed the # difference) @@ -169,7 +169,18 @@ def transformer_from_metric(self, metric): return V.T * np.sqrt(np.maximum(0, w[:, None])) -class _PairsClassifierMixin(BaseMetricLearner): +class _WeaklySupervisedMixin(BaseMetricLearner): + + _t = None # number of points in a tuple, None by default + + def _check_tuples(self, tuples): + return check_tuples(tuples, preprocessor=self.preprocessor_ is not None, + estimator=self.__class__.__name__, t=self._t) + + +class _PairsClassifierMixin(_WeaklySupervisedMixin): + + _t = 2 # number of points in a tuple, 2 for pairs def predict(self, pairs): """Predicts the learned metric between input pairs. @@ -189,11 +200,11 @@ def predict(self, pairs): y_predicted : `numpy.ndarray` of floats, shape=(n_constraints,) The predicted learned metric value between samples in every pair. 
""" - pairs = check_tuples(pairs, preprocessor=self.preprocessor_ is not None) + pairs = self._check_tuples(pairs) return self.score_pairs(pairs) def decision_function(self, pairs): - pairs = check_tuples(pairs, preprocessor=self.preprocessor_ is not None) + pairs = self._check_tuples(pairs) return self.predict(pairs) def score(self, pairs, y): @@ -221,11 +232,13 @@ def score(self, pairs, y): score : float The ``roc_auc`` score. """ - pairs = check_tuples(pairs, preprocessor=self.preprocessor_ is not None) + pairs = self._check_tuples(pairs) return roc_auc_score(y, self.decision_function(pairs)) -class _QuadrupletsClassifierMixin(BaseMetricLearner): +class _QuadrupletsClassifierMixin(_WeaklySupervisedMixin): + + _t = 4 # number of points in a tuple, 4 for quadruplets def predict(self, quadruplets): """Predicts the ordering between sample distances in input quadruplets. @@ -246,8 +259,7 @@ def predict(self, quadruplets): prediction : `numpy.ndarray` of floats, shape=(n_constraints,) Predictions of the ordering of pairs, for each quadruplet. """ - quadruplets = check_tuples(quadruplets, - preprocessor=self.preprocessor_ is not None) + quadruplets = self._check_tuples(quadruplets) # we broadcast with ... because here we allow quadruplets to be # either a 3D array of points or 2D array of indices return (self.score_pairs(quadruplets[:, :2, ...]) - @@ -278,6 +290,5 @@ def score(self, quadruplets, y=None): score : float The quadruplets score. """ - quadruplets = check_tuples(quadruplets, - preprocessor=self.preprocessor_ is not None) + quadruplets = self._check_tuples(quadruplets) return -np.mean(self.predict(quadruplets)) diff --git a/metric_learn/covariance.py b/metric_learn/covariance.py index 4e8c1a0f..88d0838d 100644 --- a/metric_learn/covariance.py +++ b/metric_learn/covariance.py @@ -26,14 +26,16 @@ class Covariance(MahalanobisMixin, TransformerMixin): metric (See :meth:`transformer_from_metric`.) 
""" - def __init__(self): - pass + def __init__(self, preprocessor=None): + super(Covariance, self).__init__(preprocessor) def fit(self, X, y=None): """ X : data matrix, (n x d) y : unused """ + self.check_preprocessor() + self.X_ = check_array(X, ensure_min_samples=2) self.M_ = np.cov(self.X_, rowvar = False) if self.M_.ndim == 0: diff --git a/metric_learn/itml.py b/metric_learn/itml.py index c12b5a17..d4761c33 100644 --- a/metric_learn/itml.py +++ b/metric_learn/itml.py @@ -65,7 +65,7 @@ def _process_pairs(self, pairs, y, bounds): pairs = check_tuples(pairs, preprocessor=self.preprocessor_ is not None, t=2, estimator=self) # pairs classifiers and for quadruplets classifiers - pairs = self.format_input(pairs) + pairs = self.preprocess_tuples(pairs) # check to make sure that no two constrained vectors are identical pos_pairs, neg_pairs = pairs[y == 1], pairs[y == -1] @@ -190,7 +190,7 @@ class ITML_Supervised(_BaseITML, TransformerMixin): def __init__(self, gamma=1., max_iter=1000, convergence_threshold=1e-3, num_labeled=np.inf, num_constraints=None, bounds=None, A0=None, - verbose=False): + verbose=False, preprocessor=None): """Initialize the learner. Parameters @@ -212,7 +212,7 @@ def __init__(self, gamma=1., max_iter=1000, convergence_threshold=1e-3, """ _BaseITML.__init__(self, gamma=gamma, max_iter=max_iter, convergence_threshold=convergence_threshold, - A0=A0, verbose=verbose) + A0=A0, verbose=verbose, preprocessor=preprocessor) self.num_labeled = num_labeled self.num_constraints = num_constraints self.bounds = bounds diff --git a/metric_learn/lfda.py b/metric_learn/lfda.py index c06fca91..cc7daa24 100644 --- a/metric_learn/lfda.py +++ b/metric_learn/lfda.py @@ -32,7 +32,8 @@ class LFDA(MahalanobisMixin, TransformerMixin): The learned linear transformation ``L``. ''' - def __init__(self, num_dims=None, k=None, embedding_type='weighted'): + def __init__(self, num_dims=None, k=None, embedding_type='weighted', + preprocessor=None): ''' Initialize LFDA. 
@@ -56,6 +57,7 @@ def __init__(self, num_dims=None, k=None, embedding_type='weighted'): self.num_dims = num_dims self.embedding_type = embedding_type self.k = k + super(LFDA, self).__init__(preprocessor) def _process_inputs(self, X, y): unique_classes, y = np.unique(y, return_inverse=True) @@ -91,6 +93,8 @@ def fit(self, X, y): y : (n,) array-like Class labels, one per point of data. ''' + self.check_preprocessor() + X, y, num_classes, n, d, dim, k_ = self._process_inputs(X, y) tSb = np.zeros((d,d)) tSw = np.zeros((d,d)) diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py index 7ce4d051..f74c7469 100644 --- a/metric_learn/lmnn.py +++ b/metric_learn/lmnn.py @@ -24,7 +24,7 @@ class _base_LMNN(MahalanobisMixin, TransformerMixin): def __init__(self, k=3, min_iter=50, max_iter=1000, learn_rate=1e-7, regularization=0.5, convergence_tol=0.001, use_pca=True, - verbose=False): + verbose=False, preprocessor=None): """Initialize the LMNN object. Parameters @@ -43,6 +43,7 @@ def __init__(self, k=3, min_iter=50, max_iter=1000, learn_rate=1e-7, self.convergence_tol = convergence_tol self.use_pca = use_pca self.verbose = verbose + super(_base_LMNN, self).__init__(preprocessor) # slower Python version @@ -64,6 +65,8 @@ def _process_inputs(self, X, labels): ' (smallest class has %d)' % required_k) def fit(self, X, y): + self.check_preprocessor() + k = self.k reg = self.regularization learn_rate = self.learn_rate diff --git a/metric_learn/lsml.py b/metric_learn/lsml.py index f8b84f1f..bff07e86 100644 --- a/metric_learn/lsml.py +++ b/metric_learn/lsml.py @@ -51,7 +51,7 @@ def _prepare_quadruplets(self, quadruplets, weights): quadruplets = check_tuples(quadruplets, preprocessor=self.preprocessor_ is not None, t=4, estimator=self) - quadruplets = self.format_input(quadruplets) + quadruplets = self.preprocess_tuples(quadruplets) # check to make sure that no two constrained vectors are identical self.vab_ = quadruplets[:, 0, :] - quadruplets[:, 1, :] @@ -182,7 +182,8 @@ class 
LSML_Supervised(_BaseLSML, TransformerMixin): """ def __init__(self, tol=1e-3, max_iter=1000, prior=None, num_labeled=np.inf, - num_constraints=None, weights=None, verbose=False): + num_constraints=None, weights=None, verbose=False, + preprocessor=None): """Initialize the learner. Parameters @@ -201,7 +202,7 @@ def __init__(self, tol=1e-3, max_iter=1000, prior=None, num_labeled=np.inf, if True, prints information while learning """ _BaseLSML.__init__(self, tol=tol, max_iter=max_iter, prior=prior, - verbose=verbose) + verbose=verbose, preprocessor=preprocessor) self.num_labeled = num_labeled self.num_constraints = num_constraints self.weights = weights diff --git a/metric_learn/mlkr.py b/metric_learn/mlkr.py index 9f774322..1a752365 100644 --- a/metric_learn/mlkr.py +++ b/metric_learn/mlkr.py @@ -30,7 +30,7 @@ class MLKR(MahalanobisMixin, TransformerMixin): """ def __init__(self, num_dims=None, A0=None, epsilon=0.01, alpha=0.0001, - max_iter=1000): + max_iter=1000, preprocessor=None): """ Initialize MLKR. @@ -56,6 +56,7 @@ def __init__(self, num_dims=None, A0=None, epsilon=0.01, alpha=0.0001, self.epsilon = epsilon self.alpha = alpha self.max_iter = max_iter + super(MLKR, self).__init__(preprocessor) def _process_inputs(self, X, y): self.X_, y = check_X_y(X, y) @@ -86,6 +87,8 @@ def fit(self, X, y): X : (n x d) array of samples y : (n) data labels """ + self.check_preprocessor() + X, y, A = self._process_inputs(X, y) # note: this line takes (n*n*d) memory! 
diff --git a/metric_learn/mmc.py b/metric_learn/mmc.py index 1271537b..21341c8b 100644 --- a/metric_learn/mmc.py +++ b/metric_learn/mmc.py @@ -77,7 +77,7 @@ def _process_pairs(self, pairs, y): ensure_2d=False, allow_nd=True) pairs = check_tuples(pairs, preprocessor=self.preprocessor_ is not None, t=2, estimator=self) - pairs = self.format_input(pairs) + pairs = self.preprocess_tuples(pairs) # check to make sure that no two constrained vectors are identical pos_pairs, neg_pairs = pairs[y == 1], pairs[y == -1] @@ -409,7 +409,8 @@ class MMC_Supervised(_BaseMMC, TransformerMixin): def __init__(self, max_iter=100, max_proj=10000, convergence_threshold=1e-6, num_labeled=np.inf, num_constraints=None, - A0=None, diagonal=False, diagonal_c=1.0, verbose=False): + A0=None, diagonal=False, diagonal_c=1.0, verbose=False, + preprocessor=None): """Initialize the learner. Parameters @@ -436,7 +437,7 @@ def __init__(self, max_iter=100, max_proj=10000, convergence_threshold=1e-6, _BaseMMC.__init__(self, max_iter=max_iter, max_proj=max_proj, convergence_threshold=convergence_threshold, A0=A0, diagonal=diagonal, diagonal_c=diagonal_c, - verbose=verbose) + verbose=verbose, preprocessor=preprocessor) self.num_labeled = num_labeled self.num_constraints = num_constraints diff --git a/metric_learn/nca.py b/metric_learn/nca.py index 19e016ec..e50497d5 100644 --- a/metric_learn/nca.py +++ b/metric_learn/nca.py @@ -23,16 +23,21 @@ class NCA(MahalanobisMixin, TransformerMixin): The learned linear transformation ``L``. 
""" - def __init__(self, num_dims=None, max_iter=100, learning_rate=0.01): + def __init__(self, num_dims=None, max_iter=100, learning_rate=0.01, + preprocessor=None): self.num_dims = num_dims self.max_iter = max_iter self.learning_rate = learning_rate + super(NCA, self).__init__(preprocessor) + def fit(self, X, y): """ X: data matrix, (n x d) y: scalar labels, (n) """ + self.check_preprocessor() + X, labels = check_X_y(X, y) n, d = X.shape num_dims = self.num_dims diff --git a/metric_learn/rca.py b/metric_learn/rca.py index 170e21f8..34a368fd 100644 --- a/metric_learn/rca.py +++ b/metric_learn/rca.py @@ -45,7 +45,7 @@ class RCA(MahalanobisMixin, TransformerMixin): The learned linear transformation ``L``. """ - def __init__(self, num_dims=None, pca_comps=None): + def __init__(self, num_dims=None, pca_comps=None, preprocessor=None): """Initialize the learner. Parameters @@ -62,6 +62,7 @@ def __init__(self, num_dims=None, pca_comps=None): """ self.num_dims = num_dims self.pca_comps = pca_comps + super(RCA, self).__init__(preprocessor) def _process_data(self, X): self.X_ = X = check_array(X) @@ -108,6 +109,8 @@ def fit(self, data, chunks): When ``chunks[i] == -1``, point i doesn't belong to any chunklet. When ``chunks[i] == j``, point i belongs to chunklet j. """ + self.check_preprocessor() + data, M_pca = self._process_data(data) chunks = np.asanyarray(chunks, dtype=int) @@ -150,7 +153,7 @@ class RCA_Supervised(RCA): """ def __init__(self, num_dims=None, pca_comps=None, num_chunks=100, - chunk_size=2): + chunk_size=2, preprocessor=None): """Initialize the learner. 
Parameters @@ -160,7 +163,8 @@ def __init__(self, num_dims=None, pca_comps=None, num_chunks=100, num_chunks: int, optional chunk_size: int, optional """ - RCA.__init__(self, num_dims=num_dims, pca_comps=pca_comps) + RCA.__init__(self, num_dims=num_dims, pca_comps=pca_comps, + preprocessor=preprocessor) self.num_chunks = num_chunks self.chunk_size = chunk_size diff --git a/metric_learn/sdml.py b/metric_learn/sdml.py index 061ba070..558adb65 100644 --- a/metric_learn/sdml.py +++ b/metric_learn/sdml.py @@ -56,7 +56,7 @@ def _prepare_pairs(self, pairs, y): self.check_preprocessor() pairs = check_tuples(pairs, preprocessor=self.preprocessor_ is not None, t=2, estimator=self) - pairs = self.format_input(pairs) + pairs = self.preprocess_tuples(pairs) # set up prior M if self.use_cov: @@ -121,7 +121,8 @@ class SDML_Supervised(_BaseSDML, TransformerMixin): """ def __init__(self, balance_param=0.5, sparsity_param=0.01, use_cov=True, - num_labeled=np.inf, num_constraints=None, verbose=False): + num_labeled=np.inf, num_constraints=None, verbose=False, + preprocessor=None): """ Parameters ---------- @@ -140,7 +141,7 @@ def __init__(self, balance_param=0.5, sparsity_param=0.01, use_cov=True, """ _BaseSDML.__init__(self, balance_param=balance_param, sparsity_param=sparsity_param, use_cov=use_cov, - verbose=verbose) + verbose=verbose, preprocessor=preprocessor) self.num_labeled = num_labeled self.num_constraints = num_constraints diff --git a/test/test_base_metric.py b/test/test_base_metric.py index 31db4e6f..8964a2c3 100644 --- a/test/test_base_metric.py +++ b/test/test_base_metric.py @@ -5,73 +5,81 @@ class TestStringRepr(unittest.TestCase): def test_covariance(self): - self.assertEqual(str(metric_learn.Covariance()), "Covariance()") + self.assertEqual(str(metric_learn.Covariance()), + "Covariance(preprocessor=None)") def test_lmnn(self): self.assertRegexpMatches( str(metric_learn.LMNN()), + r"(python_)?LMNN\(convergence_tol=0.001, k=3, learn_rate=1e-07, " - r"max_iter=1000,\n 
min_iter=50, regularization=0.5, " - r"use_pca=True, verbose=False\)") + r"max_iter=1000,\n min_iter=50, preprocessor=None, " + r"regularization=0.5, use_pca=True,\n verbose=False\)") def test_nca(self): self.assertEqual(str(metric_learn.NCA()), - "NCA(learning_rate=0.01, max_iter=100, num_dims=None)") + "NCA(learning_rate=0.01, max_iter=100, num_dims=None, " + "preprocessor=None)") def test_lfda(self): self.assertEqual(str(metric_learn.LFDA()), - "LFDA(embedding_type='weighted', k=None, num_dims=None)") + "LFDA(embedding_type='weighted', k=None, num_dims=None, " + "preprocessor=None)") def test_itml(self): self.assertEqual(str(metric_learn.ITML()), """ ITML(A0=None, convergence_threshold=0.001, gamma=1.0, max_iter=1000, - verbose=False) + preprocessor=None, verbose=False) """.strip('\n')) self.assertEqual(str(metric_learn.ITML_Supervised()), """ ITML_Supervised(A0=None, bounds=None, convergence_threshold=0.001, gamma=1.0, max_iter=1000, num_constraints=None, num_labeled=inf, - verbose=False) + preprocessor=None, verbose=False) """.strip('\n')) def test_lsml(self): self.assertEqual( str(metric_learn.LSML()), - "LSML(max_iter=1000, prior=None, tol=0.001, verbose=False)") + "LSML(max_iter=1000, preprocessor=None, prior=None, tol=0.001, " + "verbose=False)") self.assertEqual(str(metric_learn.LSML_Supervised()), """ LSML_Supervised(max_iter=1000, num_constraints=None, num_labeled=inf, - prior=None, tol=0.001, verbose=False, weights=None) + preprocessor=None, prior=None, tol=0.001, verbose=False, + weights=None) """.strip('\n')) def test_sdml(self): self.assertEqual(str(metric_learn.SDML()), - "SDML(balance_param=0.5, sparsity_param=0.01, " - "use_cov=True, verbose=False)") + "SDML(balance_param=0.5, preprocessor=None, " + "sparsity_param=0.01, use_cov=True,\n verbose=False)") self.assertEqual(str(metric_learn.SDML_Supervised()), """ SDML_Supervised(balance_param=0.5, num_constraints=None, num_labeled=inf, - sparsity_param=0.01, use_cov=True, verbose=False) + 
preprocessor=None, sparsity_param=0.01, use_cov=True, + verbose=False) """.strip('\n')) def test_rca(self): self.assertEqual(str(metric_learn.RCA()), - "RCA(num_dims=None, pca_comps=None)") + "RCA(num_dims=None, pca_comps=None, preprocessor=None)") self.assertEqual(str(metric_learn.RCA_Supervised()), "RCA_Supervised(chunk_size=2, num_chunks=100, " - "num_dims=None, pca_comps=None)") + "num_dims=None, pca_comps=None,\n " + "preprocessor=None)") def test_mlkr(self): self.assertEqual(str(metric_learn.MLKR()), "MLKR(A0=None, alpha=0.0001, epsilon=0.01, " - "max_iter=1000, num_dims=None)") + "max_iter=1000, num_dims=None,\n preprocessor=None)") def test_mmc(self): self.assertEqual(str(metric_learn.MMC()), """ MMC(A0=None, convergence_threshold=0.001, diagonal=False, diagonal_c=1.0, - max_iter=100, max_proj=10000, verbose=False) + max_iter=100, max_proj=10000, preprocessor=None, verbose=False) """.strip('\n')) self.assertEqual(str(metric_learn.MMC_Supervised()), """ MMC_Supervised(A0=None, convergence_threshold=1e-06, diagonal=False, diagonal_c=1.0, max_iter=100, max_proj=10000, num_constraints=None, - num_labeled=inf, verbose=False) + num_labeled=inf, preprocessor=None, verbose=False) """.strip('\n')) if __name__ == '__main__': From 3b716f0a2f6da94ab42bac52789d07a4b7de57fe Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Thu, 13 Sep 2018 16:09:47 +0200 Subject: [PATCH 045/120] MAINT: mutualize check_tuples --- metric_learn/base_metric.py | 7 +++++-- metric_learn/itml.py | 3 +-- metric_learn/lsml.py | 4 +--- metric_learn/mmc.py | 3 +-- metric_learn/sdml.py | 3 +-- 5 files changed, 9 insertions(+), 11 deletions(-) diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index 71df8898..1c921261 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -49,6 +49,10 @@ def preprocess_tuples(self, tuples): else: return tuples + def _check_tuples(self, tuples, t=None): + return check_tuples(tuples, preprocessor=self.preprocessor_ is 
not None, + estimator=self.__class__.__name__, t=t) + class MetricTransformer(six.with_metaclass(ABCMeta)): @@ -174,8 +178,7 @@ class _WeaklySupervisedMixin(BaseMetricLearner): _t = None # number of points in a tuple, None by default def _check_tuples(self, tuples): - return check_tuples(tuples, preprocessor=self.preprocessor_ is not None, - estimator=self.__class__.__name__, t=self._t) + return super(_WeaklySupervisedMixin, self)._check_tuples(tuples, t=self._t) class _PairsClassifierMixin(_WeaklySupervisedMixin): diff --git a/metric_learn/itml.py b/metric_learn/itml.py index d4761c33..4ff6799c 100644 --- a/metric_learn/itml.py +++ b/metric_learn/itml.py @@ -62,8 +62,7 @@ def _process_pairs(self, pairs, y, bounds): pairs, y = check_X_y(pairs, y, accept_sparse=False, ensure_2d=False, allow_nd=True) self.check_preprocessor() - pairs = check_tuples(pairs, preprocessor=self.preprocessor_ is not None, - t=2, estimator=self) + pairs = self._check_tuples(pairs) # pairs classifiers and for quadruplets classifiers pairs = self.preprocess_tuples(pairs) diff --git a/metric_learn/lsml.py b/metric_learn/lsml.py index bff07e86..3c14e504 100644 --- a/metric_learn/lsml.py +++ b/metric_learn/lsml.py @@ -48,9 +48,7 @@ def _prepare_quadruplets(self, quadruplets, weights): quadruplets = check_array(quadruplets, accept_sparse=False, ensure_2d=False, allow_nd=True) self.check_preprocessor() - quadruplets = check_tuples(quadruplets, - preprocessor=self.preprocessor_ is not None, - t=4, estimator=self) + quadruplets = self._check_tuples(quadruplets) quadruplets = self.preprocess_tuples(quadruplets) # check to make sure that no two constrained vectors are identical diff --git a/metric_learn/mmc.py b/metric_learn/mmc.py index 21341c8b..2deba45a 100644 --- a/metric_learn/mmc.py +++ b/metric_learn/mmc.py @@ -75,8 +75,7 @@ def _process_pairs(self, pairs, y): # check_tuples_y in the future pairs, y = check_X_y(pairs, y, accept_sparse=False, ensure_2d=False, allow_nd=True) - pairs = 
check_tuples(pairs, preprocessor=self.preprocessor_ is not None, - t=2, estimator=self) + pairs = self._check_tuples(pairs) pairs = self.preprocess_tuples(pairs) # check to make sure that no two constrained vectors are identical diff --git a/metric_learn/sdml.py b/metric_learn/sdml.py index 558adb65..b89760e6 100644 --- a/metric_learn/sdml.py +++ b/metric_learn/sdml.py @@ -54,8 +54,7 @@ def _prepare_pairs(self, pairs, y): pairs, y = check_X_y(pairs, y, accept_sparse=False, ensure_2d=False, allow_nd=True) self.check_preprocessor() - pairs = check_tuples(pairs, preprocessor=self.preprocessor_ is not None, - t=2, estimator=self) + pairs = self._check_tuples(pairs) pairs = self.preprocess_tuples(pairs) # set up prior M From bac835e71e701e96b66c423eee5e41933d97f732 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Fri, 14 Sep 2018 11:57:43 +0200 Subject: [PATCH 046/120] MAINT: refactor SimplePreprocessor into ArrayIndexer --- metric_learn/_util.py | 2 +- metric_learn/base_metric.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/metric_learn/_util.py b/metric_learn/_util.py index 017c6592..74914ab3 100644 --- a/metric_learn/_util.py +++ b/metric_learn/_util.py @@ -149,7 +149,7 @@ def check_t(tuples, t, context): raise ValueError(msg_t) -class SimplePreprocessor(): +class ArrayIndexer(): def __init__(self, X): self.X = X diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index 1c921261..17cb66a0 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -5,7 +5,7 @@ import numpy as np from abc import ABCMeta, abstractmethod import six -from ._util import check_tuples, SimplePreprocessor +from ._util import check_tuples, ArrayIndexer class BaseMetricLearner(six.with_metaclass(ABCMeta, BaseEstimator)): @@ -39,7 +39,7 @@ def score_pairs(self, pairs): def check_preprocessor(self): if _is_arraylike(self.preprocessor): - self.preprocessor_ = SimplePreprocessor(self.preprocessor) + self.preprocessor_ = 
ArrayIndexer(self.preprocessor) else: self.preprocessor_ = self.preprocessor From 2b0f495275abf8e6901875560f3e6b782f31a786 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Fri, 21 Sep 2018 16:30:54 +0200 Subject: [PATCH 047/120] MAINT: improve check_tuples and tests --- metric_learn/_util.py | 413 ++++++++++++++++++- metric_learn/base_metric.py | 60 ++- metric_learn/covariance.py | 7 +- metric_learn/itml.py | 25 +- metric_learn/lfda.py | 10 +- metric_learn/lmnn.py | 21 +- metric_learn/lsml.py | 20 +- metric_learn/mlkr.py | 11 +- metric_learn/mmc.py | 23 +- metric_learn/nca.py | 8 +- metric_learn/rca.py | 19 +- metric_learn/sdml.py | 23 +- test/metric_learn_test.py | 6 +- test/test_fit_transform.py | 6 +- test/test_sklearn_compat.py | 6 +- test/test_utils.py | 710 +++++++++++++++++++++++++++++---- test/test_weakly_supervised.py | 283 +++++++------ 17 files changed, 1344 insertions(+), 307 deletions(-) diff --git a/metric_learn/_util.py b/metric_learn/_util.py index 74914ab3..f547c9c4 100644 --- a/metric_learn/_util.py +++ b/metric_learn/_util.py @@ -1,7 +1,8 @@ import numpy as np import six -from sklearn.utils import check_array - +from sklearn.utils import check_array, column_or_1d +from sklearn.utils.validation import (_assert_all_finite, + check_consistent_length) # hack around lack of axis kwarg in older numpy versions try: @@ -20,6 +21,17 @@ def check_tuples(tuples, preprocessor=False, t=None, dtype="auto", warn_on_dtype=False, estimator=None): """Check that `tuples` is a valid array of tuples. + Depending on whether a preprocessor is available or not, `tuples` should be: + - a 3D array of formed tuples or a 2D array of tuples of indicators if a + preprocessor is available + - a 3D array of formed tuples if no preprocessor is available + + The number of elements in a tuple (e.g. 2 for pairs) should be the right + one, specified by the parameter `t`. 
+ `check_tuples` will then convert the tuples to the right format as + `sklearn.utils.validation.check_array` would do. See + `sklearn.utils.validation.check_array` for more details. + Parameters ---------- tuples : object @@ -45,17 +57,17 @@ def check_tuples(tuples, preprocessor=False, t=None, dtype="auto", be triggered by a conversion. force_all_finite : boolean or 'allow-nan', (default=True) - Whether to raise an error on np.inf and np.nan in X. This parameter - does not influence whether y can have np.inf or np.nan values. - The possibilities are: + Whether to raise an error on np.inf and np.nan in `tuples`. + This parameter does not influence whether y can have np.inf or np.nan + values. The possibilities are: - - True: Force all values of X to be finite. - - False: accept both np.inf and np.nan in X. - - 'allow-nan': accept only np.nan values in X. Values cannot be - infinite. + - True: Force all values of `tuples` to be finite. + - False: accept both np.inf and np.nan in `tuples`. + - 'allow-nan': accept only np.nan values in `tuples`. Values + cannot be infinite. ensure_min_samples : int (default=1) - Make sure that X has a minimum number of samples in its first + Make sure that `tuples` has a minimum number of samples in its first axis (rows for a 2D array). 
ensure_min_features : int (default=1) @@ -98,7 +110,7 @@ def check_tuples(tuples, preprocessor=False, t=None, dtype="auto", if tuples.ndim == 2 and preprocessor: # in this case there is left to check # if t is OK check_t(tuples, t, context) - elif tuples.ndim == 3 and not preprocessor: + elif tuples.ndim == 3: # if the dimension is 3 we still have to check that the num_features is OK if ensure_min_features > 0: n_features = tuples.shape[2] @@ -110,16 +122,371 @@ def check_tuples(tuples, preprocessor=False, t=None, dtype="auto", # then we should also check that t is OK check_t(tuples, t, context) else: - expected_shape = 2 if preprocessor else 3 - with_prep = (' when using {} preprocessor'.format('a' if preprocessor - else 'no')) - raise ValueError("{}D array expected{}{}. Found {}D array " - "instead:\ninput={}.\n" + expected_shape = ('2D array of indicators or 3D array of formed tuples' + if preprocessor else '3D array of formed tuples') + with_prep = ' when using a preprocessor' if preprocessor else '' + should_use_prep = ' and/or use a preprocessor' if not preprocessor else '' + raise ValueError("{} expected{}{}. Found {}D array " + "instead:\ninput={}. Reshape your data{}.\n" .format(expected_shape, context, with_prep, - tuples.ndim, tuples)) + tuples.ndim, tuples, should_use_prep)) return tuples +def check_tuples_y(tuples, y, preprocessor=False, t=None, dtype="auto", + order=None, copy=False, force_all_finite=True, + multi_output=False, ensure_min_samples=1, + ensure_min_features=1, y_numeric=False, + warn_on_dtype=False, estimator=None): + """Input validation for standard estimators. + + Adapted from `sklearn.utils.validation.check_X_y`. + Checks tuples with `check_tuples`, and checks that the size of `y` and + `tuples` are consistent. In addition, standard input checks are only + applied to y, such as checking that y does not have np.nan or np.inf + targets. For multi-label y, set multi_output=True to allow 2d and sparse y. 
+ + Parameters + ---------- + tuples : 3D array of formed tuples or 2D array of tuples indicators + Input tuples. + + y : nd-array, list or sparse matrix + Labels. + + preprocessor : boolean + Whether a preprocessor is available or not (the input format depends + on that) (See `check_tuples` for more information) + + dtype : string, type, list of types or None (default="numeric") + Data type of result. If None, the dtype of the input is preserved. + If "numeric", dtype is preserved unless array.dtype is object. + If dtype is a list of types, conversion on the first type is only + performed if the dtype of the input is not in the list. + + order : 'F', 'C' or None (default=None) + Whether an array will be forced to be fortran or c-style. + + copy : boolean (default=False) + Whether a forced copy will be triggered. If copy=False, a copy might + be triggered by a conversion. + + force_all_finite : boolean (default=True) + Whether to raise an error on np.inf and np.nan in X. This parameter + does not influence whether y can have np.inf or np.nan values. + + multi_output : boolean (default=False) + Whether to allow 2-d y (array or sparse matrix). If false, y will be + validated as a vector. y cannot have np.nan or np.inf values if + multi_output=True. + + ensure_min_samples : int (default=1) + Make sure that X has a minimum number of samples in its first + axis (rows for a 2D array). + + ensure_min_features : int (default=1) + Make sure that the tuples has some minimum number of features + The default value of 1 rejects empty datasets. + Setting to 0 disables this check. + + y_numeric : boolean (default=False) + Whether to ensure that y has a numeric type. If dtype of y is object, + it is converted to float64. Should only be used for regression + algorithms. + + warn_on_dtype : boolean (default=False) + Raise DataConversionWarning if the dtype of the input data structure + does not match the requested dtype, causing a memory copy. 
+ + estimator : str or estimator instance (default=None) + If passed, include the name of the estimator in warning messages. + + Returns + ------- + tuples_converted : object + The converted and validated tuples. + + y_converted : object + The converted and validated y. + """ + tuples = check_tuples(tuples, + preprocessor=preprocessor, + t=t, + dtype=dtype, + order=order, copy=copy, + force_all_finite=force_all_finite, + ensure_min_samples=ensure_min_samples, + ensure_min_features=ensure_min_features, + warn_on_dtype=warn_on_dtype, + estimator=estimator) + if multi_output: + y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False, + dtype=None) + else: + y = column_or_1d(y, warn=True) + _assert_all_finite(y) + if y_numeric and y.dtype.kind == 'O': + y = y.astype(np.float64) + + check_consistent_length(tuples, y) + + return tuples, y + + +def check_points(points, preprocessor=False, accept_sparse=False, + dtype="auto", order=None, copy=False, force_all_finite=True, + ensure_min_samples=1, ensure_min_features=1, + warn_on_dtype=False, estimator=None): + """Checks that `points` is a valid dataset of points + + Depending on whether a preprocessor is available or not, `points` should + be: + - a 2D array of formed points or a 1D array of indicators if a + preprocessor is available + - a 2D array of formed points if no preprocessor is available + + `check_points` will then convert the points to the right format as + `sklearn.utils.validation.check_array` would do. See + `sklearn.utils.validation.check_array` for more details. + + Parameters + ---------- + points : object + Input object to check / convert. + + accept_sparse : string, boolean or list/tuple of strings (default=False) + String[s] representing allowed sparse matrix formats, such as 'csc', + 'csr', etc. If the input is sparse but not in the allowed format, + it will be converted to the first listed format. True allows the input + to be any format.
False means that a sparse matrix input will + raise an error. + + dtype : string, type, list of types or None (default="numeric") + Data type of result. If None, the dtype of the input is preserved. + If "numeric", dtype is preserved unless points.dtype is object. + If dtype is a list of types, conversion on the first type is only + performed if the dtype of the input is not in the list. + + order : 'F', 'C' or None (default=None) + Whether an array will be forced to be fortran or c-style. + When order is None (default), then if copy=False, nothing is ensured + about the memory layout of the output array; otherwise (copy=True) + the memory layout of the returned array is kept as close as possible + to the original array. + + copy : boolean (default=False) + Whether a forced copy will be triggered. If copy=False, a copy might + be triggered by a conversion. + + force_all_finite : boolean (default=True) + Whether to raise an error on np.inf and np.nan in `points`. + + ensure_min_samples : int (default=1) + Make sure that the array has a minimum number of samples in its first + axis (rows for a 2D array). Setting to 0 disables this check. + + ensure_min_features : int (default=1) + Make sure that the 2D array has some minimum number of features + (columns). The default value of 1 rejects empty datasets. + This check is only enforced when the input data has effectively 2 + dimensions or is originally 1D and ``ensure_2d`` is True. Setting to 0 + disables this check. + + warn_on_dtype : boolean (default=False) + Raise DataConversionWarning if the dtype of the input data structure + does not match the requested dtype, causing a memory copy. + + estimator : str or estimator instance (default=None) + If passed, include the name of the estimator in warning messages. + + Returns + ------- + points_converted : object + The converted and validated array of points. 
+ + """ + if dtype == "auto": + dtype = 'numeric' if preprocessor is not None else None + + context = make_context(estimator) + points = check_array(points, dtype=dtype, accept_sparse=accept_sparse, + copy=copy, + force_all_finite=force_all_finite, + order=order, + ensure_2d=False, # input can be 1D + allow_nd=True, # true, to throw custom error message + ensure_min_samples=ensure_min_samples, + ensure_min_features=ensure_min_features, + estimator=estimator, + warn_on_dtype=warn_on_dtype) + if (points.ndim == 1 and preprocessor) or points.ndim == 2: + return points + else: + expected_shape = ('1D array of indicators or 2D array of formed points' + if preprocessor else '2D array of formed points') + with_prep = ' when using a preprocessor' if preprocessor else '' + should_use_prep = ' and/or use a preprocessor' if not preprocessor else '' + raise ValueError("{} expected{}{}. Found {}D array " + "instead:\ninput={}. Reshape your data{}.\n" + .format(expected_shape, context, with_prep, + points.ndim, points, should_use_prep)) + return points + + +def check_points_y(points, y, preprocessor=False, accept_sparse=False, + dtype="auto", + order=None, copy=False, force_all_finite=True, + multi_output=False, ensure_min_samples=1, + ensure_min_features=1, y_numeric=False, + warn_on_dtype=False, estimator=None): + """Input validation for standard estimators. + + Checks `points` and `y` for consistent length, enforces `points` is a 2d + array of formed points or 1d array of indicators of points, and y is 1d. + Standard input checks are only applied to y, such as checking that y + does not have np.nan or np.inf targets. For multi-label y, set + multi_output=True to allow 2d and sparse y. If the dtype of `points` is + object, attempt converting to float, raising on failure. + Adapted from :func:`sklearn.utils.validation.check_X_y`. + + Parameters + ---------- + points : nd-array, list or sparse matrix + Input data. + + y : nd-array, list or sparse matrix + Labels. 
+ + accept_sparse : string, boolean or list of string (default=False) + String[s] representing allowed sparse matrix formats, such as 'csc', + 'csr', etc. If the input is sparse but not in the allowed format, + it will be converted to the first listed format. True allows the input + to be any format. False means that a sparse matrix input will + raise an error. + + .. deprecated:: 0.19 + Passing 'None' to parameter ``accept_sparse`` in methods is + deprecated in version 0.19 "and will be removed in 0.21. Use + ``accept_sparse=False`` instead. + + dtype : string, type, list of types or None (default="numeric") + Data type of result. If None, the dtype of the input is preserved. + If "numeric", dtype is preserved unless array.dtype is object. + If dtype is a list of types, conversion on the first type is only + performed if the dtype of the input is not in the list. + + order : 'F', 'C' or None (default=None) + Whether an array will be forced to be fortran or c-style. + + copy : boolean (default=False) + Whether a forced copy will be triggered. If copy=False, a copy might + be triggered by a conversion. + + force_all_finite : boolean (default=True) + Whether to raise an error on np.inf and np.nan in `points`. + This parameter does not influence whether y can have np.inf or np.nan + values. + + ensure_2d : boolean (default=True) + Whether to make `points` at least 2d. + + allow_nd : boolean (default=False) + Whether to allow points.ndim > 2. + + multi_output : boolean (default=False) + Whether to allow 2-d y (array or sparse matrix). If false, y will be + validated as a vector. y cannot have np.nan or np.inf values if + multi_output=True. + + ensure_min_samples : int (default=1) + Make sure that `points` has a minimum number of samples in its first + axis (rows for a 2D array). + + ensure_min_features : int (default=1) + Make sure that the 2D array has some minimum number of features + (columns). The default value of 1 rejects empty datasets. 
+ This check is only enforced when `points` has effectively 2 dimensions or + is originally 1D and ``ensure_2d`` is True. Setting to 0 disables + this check. + + y_numeric : boolean (default=False) + Whether to ensure that y has a numeric type. If dtype of y is object, + it is converted to float64. Should only be used for regression + algorithms. + + warn_on_dtype : boolean (default=False) + Raise DataConversionWarning if the dtype of the input data structure + does not match the requested dtype, causing a memory copy. + + estimator : str or estimator instance (default=None) + If passed, include the name of the estimator in warning messages. + + Returns + ------- + points_converted : object + The converted and validated points. + + y_converted : object + The converted and validated y. + """ + points = check_points(points, preprocessor=preprocessor, + accept_sparse=accept_sparse, + dtype=dtype, + order=order, copy=copy, + force_all_finite=force_all_finite, + ensure_min_samples=ensure_min_samples, + ensure_min_features=ensure_min_features, + warn_on_dtype=warn_on_dtype, + estimator=estimator) + if multi_output: + y = check_points(y, 'csr', force_all_finite=True, dtype=None, + preprocessor=preprocessor) + else: + y = column_or_1d(y, warn=True) + _assert_all_finite(y) + if y_numeric and y.dtype.kind == 'O': + y = y.astype(np.float64) + + check_consistent_length(points, y) + + return points, y + + +def preprocess_tuples(tuples, preprocessor, estimator=None): + """form tuples if there is a preprocessor else keep them as such (assumes + that check_tuples has already been called)""" + if estimator is not None: + estimator_name = make_name(estimator) + (' after the preprocessor ' + 'has been applied') + else: + estimator_name = ('objects that will use preprocessed tuples') + + if preprocessor is not None and tuples.ndim == 2: + print("Preprocessing tuples...") + tuples = np.column_stack([preprocessor(tuples[:, i])[:, np.newaxis] for + i in range(tuples.shape[1])]) + tuples 
= check_tuples(tuples, preprocessor=False, estimator=estimator_name) + # normally we shouldn't need to enforce the t, since a preprocessor shouldn't + # be able to transform a t tuples array into a t' tuples array + return tuples + + +def preprocess_points(points, preprocessor, estimator=None): + """form points if there is a preprocessor else keep them as such (assumes + that check_points has already been called)""" + if estimator is not None: + estimator_name = make_name(estimator) + (' after the preprocessor ' + 'has been applied') + else: + estimator_name = ('objects that will use preprocessed points') + + if preprocessor is not None and points.ndim == 1: + print("Preprocessing points...") + points = preprocessor(points) + points = check_points(points, preprocessor=False, estimator=estimator_name) + return points + + def make_context(estimator): """Helper function to create a string with the estimator name. Taken from check_array function in scikit-learn. @@ -128,6 +495,15 @@ def make_context(estimator): 'NCA': ' by NCA' None: '' """ + estimator_name = make_name(estimator) + context = (' by ' + estimator_name) if estimator_name is not None else '' + return context + + +def make_name(estimator): + """Helper function that returns the name of estimator or the given string + if a string is given + """ if estimator is not None: if isinstance(estimator, six.string_types): estimator_name = estimator @@ -135,8 +511,7 @@ def make_context(estimator): estimator_name = estimator.__class__.__name__ else: estimator_name = None - context = ' by ' + estimator_name if estimator_name is not None else '' - return context + return estimator_name def check_t(tuples, t, context): diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index 17cb66a0..734fe2b8 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -1,11 +1,12 @@ from numpy.linalg import cholesky from sklearn.base import BaseEstimator -from sklearn.utils.validation import 
check_array, _is_arraylike +from sklearn.utils.validation import _is_arraylike from sklearn.metrics import roc_auc_score import numpy as np from abc import ABCMeta, abstractmethod import six -from ._util import check_tuples, ArrayIndexer +from ._util import (check_tuples, ArrayIndexer, preprocess_tuples, + check_points, preprocess_points) class BaseMetricLearner(six.with_metaclass(ABCMeta, BaseEstimator)): @@ -43,16 +44,6 @@ def check_preprocessor(self): else: self.preprocessor_ = self.preprocessor - def preprocess_tuples(self, tuples): - if self.preprocessor is not None: - return np.apply_along_axis(self.preprocessor_, 1, tuples) - else: - return tuples - - def _check_tuples(self, tuples, t=None): - return check_tuples(tuples, preprocessor=self.preprocessor_ is not None, - estimator=self.__class__.__name__, t=t) - class MetricTransformer(six.with_metaclass(ABCMeta)): @@ -115,9 +106,10 @@ def score_pairs(self, pairs): scores: `numpy.ndarray` of shape=(n_pairs,) The learned Mahalanobis distance for every pair. """ - pairs = check_tuples(pairs, preprocessor=self.preprocessor_ is not None) - # TODO: add a check (and a test) to only be able to score if t is OK - pairs = self.preprocess_tuples(pairs) + pairs = check_tuples(pairs, preprocessor=self.preprocessor is not None, + estimator=self, t=2) + pairs = preprocess_tuples(pairs, preprocessor=self.preprocessor_, + estimator=self) pairwise_diffs = self.transform(pairs[:, 1, :] - pairs[:, 0, :]) # (for MahalanobisMixin, the embedding is linear so we can just embed the # difference) @@ -140,7 +132,11 @@ def transform(self, X): X_embedded : `numpy.ndarray`, shape=(n_samples, num_dims) The embedded data points. 
""" - X_checked = check_array(X, accept_sparse=True) + X_checked = check_points(X, estimator=self, + preprocessor=self.preprocessor is not None, + accept_sparse=True) + X_checked = preprocess_points(X_checked, preprocessor=self.preprocessor_, + estimator=self) return X_checked.dot(self.transformer_.T) def metric(self): @@ -173,15 +169,7 @@ def transformer_from_metric(self, metric): return V.T * np.sqrt(np.maximum(0, w[:, None])) -class _WeaklySupervisedMixin(BaseMetricLearner): - - _t = None # number of points in a tuple, None by default - - def _check_tuples(self, tuples): - return super(_WeaklySupervisedMixin, self)._check_tuples(tuples, t=self._t) - - -class _PairsClassifierMixin(_WeaklySupervisedMixin): +class _PairsClassifierMixin(BaseMetricLearner): _t = 2 # number of points in a tuple, 2 for pairs @@ -203,11 +191,14 @@ def predict(self, pairs): y_predicted : `numpy.ndarray` of floats, shape=(n_constraints,) The predicted learned metric value between samples in every pair. """ - pairs = self._check_tuples(pairs) + pairs = check_tuples(pairs, preprocessor=self.preprocessor is not None, + estimator=self, t=self._t) + # no need to preprocess_tuples since it is done in score_pairs return self.score_pairs(pairs) def decision_function(self, pairs): - pairs = self._check_tuples(pairs) + # no need to check_tuples and preprocess_tuples since it is done in + # predict->score_pairs return self.predict(pairs) def score(self, pairs, y): @@ -235,11 +226,12 @@ def score(self, pairs, y): score : float The ``roc_auc`` score. 
""" - pairs = self._check_tuples(pairs) + # no need to check_tuples and preprocess_tuples since it is done in + # predict->score_pairs return roc_auc_score(y, self.decision_function(pairs)) -class _QuadrupletsClassifierMixin(_WeaklySupervisedMixin): +class _QuadrupletsClassifierMixin(BaseMetricLearner): _t = 4 # number of points in a tuple, 4 for quadruplets @@ -262,13 +254,18 @@ def predict(self, quadruplets): prediction : `numpy.ndarray` of floats, shape=(n_constraints,) Predictions of the ordering of pairs, for each quadruplet. """ - quadruplets = self._check_tuples(quadruplets) + quadruplets = check_tuples(quadruplets, + preprocessor=self.preprocessor is not None, + estimator=self, t=self._t) + # no need to preprocess_tuples since it is done in score_pairs # we broadcast with ... because here we allow quadruplets to be # either a 3D array of points or 2D array of indices return (self.score_pairs(quadruplets[:, :2, ...]) - self.score_pairs(quadruplets[:, 2:, ...])) def decision_function(self, quadruplets): + # no need to check_tuples and preprocess_tuples since it is done in + # predict->score_pairs return self.predict(quadruplets) def score(self, quadruplets, y=None): @@ -293,5 +290,6 @@ def score(self, quadruplets, y=None): score : float The quadruplets score. 
""" - quadruplets = self._check_tuples(quadruplets) + # no need to check_tuples and preprocess_tuples since it is done in + # predict->score_pairs return -np.mean(self.predict(quadruplets)) diff --git a/metric_learn/covariance.py b/metric_learn/covariance.py index 88d0838d..ac42161e 100644 --- a/metric_learn/covariance.py +++ b/metric_learn/covariance.py @@ -13,6 +13,7 @@ from sklearn.utils.validation import check_array from sklearn.base import TransformerMixin +from metric_learn._util import preprocess_points, check_points from .base_metric import MahalanobisMixin @@ -35,8 +36,10 @@ def fit(self, X, y=None): y : unused """ self.check_preprocessor() - - self.X_ = check_array(X, ensure_min_samples=2) + self.X_ = check_points(X, ensure_min_samples=2, estimator=self, + preprocessor=self.preprocessor is not None) + self.X_ = preprocess_points(self.X_, preprocessor=self.preprocessor_, + estimator=self) self.M_ = np.cov(self.X_, rowvar = False) if self.M_.ndim == 0: self.M_ = 1./self.M_ diff --git a/metric_learn/itml.py b/metric_learn/itml.py index 4ff6799c..7d6688db 100644 --- a/metric_learn/itml.py +++ b/metric_learn/itml.py @@ -17,15 +17,19 @@ import numpy as np from six.moves import xrange from sklearn.metrics import pairwise_distances -from sklearn.utils.validation import check_array, check_X_y +from sklearn.utils.validation import check_array from sklearn.base import TransformerMixin from .base_metric import _PairsClassifierMixin, MahalanobisMixin from .constraints import Constraints, wrap_pairs -from ._util import vector_norm, check_tuples +from ._util import (vector_norm, check_points_y, check_tuples_y, + preprocess_tuples, preprocess_points) class _BaseITML(MahalanobisMixin): """Information Theoretic Metric Learning (ITML)""" + + _t = 2 # constraints are pairs + def __init__(self, gamma=1., max_iter=1000, convergence_threshold=1e-3, A0=None, verbose=False, preprocessor=None): """Initialize ITML. 
@@ -57,14 +61,11 @@ def __init__(self, gamma=1., max_iter=1000, convergence_threshold=1e-3, super(_BaseITML, self).__init__(preprocessor) def _process_pairs(self, pairs, y, bounds): - # for now we check_X_y and check_tuples but we should only - # check_tuples_y in the future - pairs, y = check_X_y(pairs, y, accept_sparse=False, - ensure_2d=False, allow_nd=True) self.check_preprocessor() - pairs = self._check_tuples(pairs) - # pairs classifiers and for quadruplets classifiers - pairs = self.preprocess_tuples(pairs) + pairs, y = check_tuples_y(pairs, y, estimator=self, t=self._t, + preprocessor=self.preprocessor is not None) + pairs = preprocess_tuples(pairs, preprocessor=self.preprocessor_, + estimator=self) # check to make sure that no two constrained vectors are identical pos_pairs, neg_pairs = pairs[y == 1], pairs[y == -1] @@ -231,7 +232,11 @@ def fit(self, X, y, random_state=np.random): random_state : numpy.random.RandomState, optional If provided, controls random number generation. """ - X, y = check_X_y(X, y) + self.check_preprocessor() + X, y = check_points_y(X, y, preprocessor=self.preprocessor is not None, + estimator=self) + X = preprocess_points(X, preprocessor=self.preprocessor_, + estimator=self) num_constraints = self.num_constraints if num_constraints is None: num_classes = len(np.unique(y)) diff --git a/metric_learn/lfda.py b/metric_learn/lfda.py index cc7daa24..18ffa057 100644 --- a/metric_learn/lfda.py +++ b/metric_learn/lfda.py @@ -16,8 +16,9 @@ import warnings from six.moves import xrange from sklearn.metrics import pairwise_distances -from sklearn.utils.validation import check_X_y from sklearn.base import TransformerMixin + +from metric_learn._util import check_points_y, preprocess_points from .base_metric import MahalanobisMixin @@ -61,7 +62,11 @@ def __init__(self, num_dims=None, k=None, embedding_type='weighted', def _process_inputs(self, X, y): unique_classes, y = np.unique(y, return_inverse=True) - self.X_, y = check_X_y(X, y) + 
self.check_preprocessor() + self.X_, y = check_points_y(X, y, estimator=self, + preprocessor=self.preprocessor is not None) + self.X_ = preprocess_points(self.X_, estimator=self, + preprocessor=self.preprocessor_) n, d = self.X_.shape num_classes = len(unique_classes) @@ -93,7 +98,6 @@ def fit(self, X, y): y : (n,) array-like Class labels, one per point of data. ''' - self.check_preprocessor() X, y, num_classes, n, d, dim, k_ = self._process_inputs(X, y) tSb = np.zeros((d,d)) diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py index f74c7469..470685b5 100644 --- a/metric_learn/lmnn.py +++ b/metric_learn/lmnn.py @@ -14,9 +14,9 @@ import warnings from collections import Counter from six.moves import xrange -from sklearn.utils.validation import check_X_y, check_array from sklearn.metrics import euclidean_distances from sklearn.base import TransformerMixin +from metric_learn._util import preprocess_points, check_points_y from .base_metric import MahalanobisMixin @@ -50,7 +50,15 @@ def __init__(self, k=3, min_iter=50, max_iter=1000, learn_rate=1e-7, class python_LMNN(_base_LMNN): def _process_inputs(self, X, labels): - self.X_ = check_array(X, dtype=float) + self.check_preprocessor() + self.X_, labels = check_points_y(X, labels, + preprocessor=self.preprocessor is not + None, + estimator=self) + self.X_ = preprocess_points(self.X_, estimator=self, + preprocessor=self.preprocessor_) + self.X_ = self.X_.astype(float) # todo: remove the conversion here and + # integrate it into check_points_y num_pts, num_dims = self.X_.shape unique_labels, self.label_inds_ = np.unique(labels, return_inverse=True) if len(self.label_inds_) != num_pts: @@ -65,7 +73,6 @@ def _process_inputs(self, X, labels): ' (smallest class has %d)' % required_k) def fit(self, X, y): - self.check_preprocessor() k = self.k reg = self.regularization @@ -255,7 +262,13 @@ class LMNN(_base_LMNN): """ def fit(self, X, y): - self.X_, y = check_X_y(X, y, dtype=float) + self.check_preprocessor() + self.X_, y = 
check_points_y(X, y, estimator=self, + preprocessor=self.preprocessor is not None) + self.X_ = self.X_.astype(float) # todo: remove the conversion here and + # integrate it into check_points_y + self.X_ = preprocess_points(self.X_, estimator=self, + preprocessor=self.preprocessor_) labels = MulticlassLabels(y) self._lmnn = shogun_LMNN(RealFeatures(self.X_.T), labels, self.k) self._lmnn.set_maxiter(self.max_iter) diff --git a/metric_learn/lsml.py b/metric_learn/lsml.py index 3c14e504..536284d1 100644 --- a/metric_learn/lsml.py +++ b/metric_learn/lsml.py @@ -12,14 +12,17 @@ import scipy.linalg from six.moves import xrange from sklearn.base import TransformerMixin -from sklearn.utils.validation import check_array, check_X_y -from ._util import check_tuples +from ._util import (check_tuples, check_points_y, preprocess_points, + preprocess_tuples) from .base_metric import _QuadrupletsClassifierMixin, MahalanobisMixin from .constraints import Constraints class _BaseLSML(MahalanobisMixin): + + _t = 4 # constraints are quadruplets + def __init__(self, tol=1e-3, max_iter=1000, prior=None, verbose=False, preprocessor=None): """Initialize LSML. 
@@ -45,11 +48,11 @@ def __init__(self, tol=1e-3, max_iter=1000, prior=None, verbose=False, def _prepare_quadruplets(self, quadruplets, weights): # for now we check_array and check_tuples but we should only # check_tuples in the future (with enhanced check_tuples) - quadruplets = check_array(quadruplets, accept_sparse=False, - ensure_2d=False, allow_nd=True) self.check_preprocessor() - quadruplets = self._check_tuples(quadruplets) - quadruplets = self.preprocess_tuples(quadruplets) + quadruplets = check_tuples(quadruplets, estimator=self, t=self._t, + preprocessor=self.preprocessor is not None) + quadruplets = preprocess_tuples(quadruplets, estimator=self, + preprocessor=self.preprocessor_) # check to make sure that no two constrained vectors are identical self.vab_ = quadruplets[:, 0, :] - quadruplets[:, 1, :] @@ -219,7 +222,10 @@ def fit(self, X, y, random_state=np.random): random_state : numpy.random.RandomState, optional If provided, controls random number generation. """ - X, y = check_X_y(X, y) + self.check_preprocessor() + X, y = check_points_y(X, y, preprocessor=self.preprocessor is not None, + estimator=self) + X = preprocess_points(X, estimator=self, preprocessor=self.preprocessor_) num_constraints = self.num_constraints if num_constraints is None: num_classes = len(np.unique(y)) diff --git a/metric_learn/mlkr.py b/metric_learn/mlkr.py index 1a752365..68dfc285 100644 --- a/metric_learn/mlkr.py +++ b/metric_learn/mlkr.py @@ -13,8 +13,8 @@ from sklearn.base import TransformerMixin from sklearn.decomposition import PCA -from sklearn.utils.validation import check_X_y +from metric_learn._util import check_points_y, preprocess_points from .base_metric import MahalanobisMixin EPS = np.finfo(float).eps @@ -59,7 +59,11 @@ def __init__(self, num_dims=None, A0=None, epsilon=0.01, alpha=0.0001, super(MLKR, self).__init__(preprocessor) def _process_inputs(self, X, y): - self.X_, y = check_X_y(X, y) + self.check_preprocessor() + self.X_, y = check_points_y(X, y, 
y_numeric=True, estimator=self, + preprocessor=self.preprocessor is not None) + self.X_ = preprocess_points(self.X_, estimator=self, + preprocessor=self.preprocessor_) n, d = self.X_.shape if y.shape[0] != n: raise ValueError('Data and label lengths mismatch: %d != %d' @@ -72,7 +76,7 @@ def _process_inputs(self, X, y): if A is None: # initialize to PCA transformation matrix # note: not the same as n_components=m ! - A = PCA().fit(X).components_.T[:m] + A = PCA().fit(self.X_).components_.T[:m] elif A.shape != (m, d): raise ValueError('A0 needs shape (%d,%d) but got %s' % ( m, d, A.shape)) @@ -87,7 +91,6 @@ def fit(self, X, y): X : (n x d) array of samples y : (n) data labels """ - self.check_preprocessor() X, y, A = self._process_inputs(X, y) diff --git a/metric_learn/mmc.py b/metric_learn/mmc.py index 2deba45a..af87d5fe 100644 --- a/metric_learn/mmc.py +++ b/metric_learn/mmc.py @@ -20,15 +20,19 @@ import numpy as np from six.moves import xrange from sklearn.base import TransformerMixin -from sklearn.utils.validation import check_array, check_X_y +from sklearn.utils.validation import check_array from .base_metric import _PairsClassifierMixin, MahalanobisMixin from .constraints import Constraints, wrap_pairs -from ._util import vector_norm, check_tuples +from ._util import (vector_norm, check_points_y, preprocess_points, + check_tuples_y, preprocess_tuples) class _BaseMMC(MahalanobisMixin): """Mahalanobis Metric for Clustering (MMC)""" + + _t = 2 # constraints are pairs + def __init__(self, max_iter=100, max_proj=10000, convergence_threshold=1e-3, A0=None, diagonal=False, diagonal_c=1.0, verbose=False, preprocessor=None): @@ -71,12 +75,10 @@ def _fit(self, pairs, y): return self._fit_full(pairs, y) def _process_pairs(self, pairs, y): - # for now we check_X_y and check_tuples but we should only - # check_tuples_y in the future - pairs, y = check_X_y(pairs, y, accept_sparse=False, - ensure_2d=False, allow_nd=True) - pairs = self._check_tuples(pairs) - pairs = 
self.preprocess_tuples(pairs) + pairs, y = check_tuples_y(pairs, y, estimator=self, t=self._t, + preprocessor=self.preprocessor is not None) + pairs = preprocess_tuples(pairs, preprocessor=self.preprocessor_, + estimator=self) # check to make sure that no two constrained vectors are identical pos_pairs, neg_pairs = pairs[y == 1], pairs[y == -1] @@ -452,7 +454,10 @@ def fit(self, X, y, random_state=np.random): random_state : numpy.random.RandomState, optional If provided, controls random number generation. """ - X, y = check_X_y(X, y) + self.check_preprocessor() + X, y = check_points_y(X, y, preprocessor=self.preprocessor is not None, + estimator=self) + X = preprocess_points(X, preprocessor=self.preprocessor_, estimator=self) num_constraints = self.num_constraints if num_constraints is None: num_classes = len(np.unique(y)) diff --git a/metric_learn/nca.py b/metric_learn/nca.py index e50497d5..42d6935e 100644 --- a/metric_learn/nca.py +++ b/metric_learn/nca.py @@ -7,8 +7,8 @@ import numpy as np from six.moves import xrange from sklearn.base import TransformerMixin -from sklearn.utils.validation import check_X_y +from metric_learn._util import check_points_y, preprocess_points from .base_metric import MahalanobisMixin EPS = np.finfo(float).eps @@ -30,7 +30,6 @@ def __init__(self, num_dims=None, max_iter=100, learning_rate=0.01, self.learning_rate = learning_rate super(NCA, self).__init__(preprocessor) - def fit(self, X, y): """ X: data matrix, (n x d) @@ -38,7 +37,10 @@ def fit(self, X, y): """ self.check_preprocessor() - X, labels = check_X_y(X, y) + X, labels = check_points_y(X, y, estimator=self, + preprocessor=self.preprocessor is not None) + X = preprocess_points(X, estimator=self, + preprocessor=self.preprocessor_) n, d = X.shape num_dims = self.num_dims if num_dims is None: diff --git a/metric_learn/rca.py b/metric_learn/rca.py index 34a368fd..2d5e8ea5 100644 --- a/metric_learn/rca.py +++ b/metric_learn/rca.py @@ -17,8 +17,8 @@ from six.moves import xrange 
from sklearn import decomposition from sklearn.base import TransformerMixin -from sklearn.utils.validation import check_array +from metric_learn._util import check_points_y, check_points, preprocess_points from .base_metric import MahalanobisMixin from .constraints import Constraints @@ -65,7 +65,10 @@ def __init__(self, num_dims=None, pca_comps=None, preprocessor=None): super(RCA, self).__init__(preprocessor) def _process_data(self, X): - self.X_ = X = check_array(X) + self.check_preprocessor() + X = check_points(X, preprocessor=self.preprocessor is not None, + estimator=self) + X = preprocess_points(X, preprocessor=self.preprocessor_, estimator=self) # PCA projection to remove noise and redundant information. if self.pca_comps is not None: @@ -78,8 +81,8 @@ def _process_data(self, X): return X, M_pca - def _check_dimension(self, rank): - d = self.X_.shape[1] + def _check_dimension(self, rank, X): + d = X.shape[1] if rank < d: warnings.warn('The inner covariance matrix is not invertible, ' 'so the transformation matrix may contain Nan values. ' @@ -109,7 +112,6 @@ def fit(self, data, chunks): When ``chunks[i] == -1``, point i doesn't belong to any chunklet. When ``chunks[i] == j``, point i belongs to chunklet j. """ - self.check_preprocessor() data, M_pca = self._process_data(data) @@ -117,7 +119,7 @@ def fit(self, data, chunks): chunk_mask, chunked_data = _chunk_mean_centering(data, chunks) inner_cov = np.cov(chunked_data, rowvar=0, bias=1) - dim = self._check_dimension(np.linalg.matrix_rank(inner_cov)) + dim = self._check_dimension(np.linalg.matrix_rank(inner_cov), data) # Fisher Linear Discriminant projection if dim < data.shape[1]: @@ -179,6 +181,11 @@ def fit(self, X, y, random_state=np.random): y : (n) data labels random_state : a random.seed object to fix the random_state if needed. 
""" + self.check_preprocessor() + X, y = check_points_y(X, y, estimator=self, + preprocessor=self.preprocessor is not None) + X = preprocess_points(X, preprocessor=self.preprocessor_, + estimator=self) chunks = Constraints(y).chunks(num_chunks=self.num_chunks, chunk_size=self.chunk_size, random_state=random_state) diff --git a/metric_learn/sdml.py b/metric_learn/sdml.py index b89760e6..7acfa12d 100644 --- a/metric_learn/sdml.py +++ b/metric_learn/sdml.py @@ -13,14 +13,18 @@ from sklearn.base import TransformerMixin from sklearn.covariance import graph_lasso from sklearn.utils.extmath import pinvh -from sklearn.utils.validation import check_array, check_X_y +from metric_learn._util import (preprocess_points, preprocess_tuples, + check_tuples_y) from .base_metric import MahalanobisMixin, _PairsClassifierMixin from .constraints import Constraints, wrap_pairs -from ._util import check_tuples +from ._util import check_points_y class _BaseSDML(MahalanobisMixin): + + _t = 2 # constraints are pairs + def __init__(self, balance_param=0.5, sparsity_param=0.01, use_cov=True, verbose=False, preprocessor=None): """ @@ -49,13 +53,11 @@ def __init__(self, balance_param=0.5, sparsity_param=0.01, use_cov=True, super(_BaseSDML, self).__init__(preprocessor) def _prepare_pairs(self, pairs, y): - # for now we check_X_y and check_tuples but we should only - # check_tuples_y in the future - pairs, y = check_X_y(pairs, y, accept_sparse=False, - ensure_2d=False, allow_nd=True) self.check_preprocessor() - pairs = self._check_tuples(pairs) - pairs = self.preprocess_tuples(pairs) + pairs, y = check_tuples_y(pairs, y, estimator=self, t=self._t, + preprocessor=self.preprocessor is not None) + pairs = preprocess_tuples(pairs, preprocessor=self.preprocessor_, + estimator=self) # set up prior M if self.use_cov: @@ -162,7 +164,10 @@ def fit(self, X, y, random_state=np.random): self : object Returns the instance. 
""" - y = check_array(y, ensure_2d=False) + self.check_preprocessor() + X, y = check_points_y(X, y, estimator=self, + preprocessor=self.preprocessor is not None) + X = preprocess_points(X, estimator=self, preprocessor=self.preprocessor_) num_constraints = self.num_constraints if num_constraints is None: num_classes = len(np.unique(y)) diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py index 1671c8ef..aad076a0 100644 --- a/test/metric_learn_test.py +++ b/test/metric_learn_test.py @@ -5,9 +5,9 @@ from sklearn.datasets import load_iris from numpy.testing import assert_array_almost_equal -from metric_learn import ( - LMNN, NCA, LFDA, Covariance, MLKR, MMC, - LSML_Supervised, ITML_Supervised, SDML_Supervised, RCA_Supervised, MMC_Supervised) +from metric_learn import (LMNN, NCA, LFDA, Covariance, MLKR, MMC, + LSML_Supervised, ITML_Supervised, SDML_Supervised, + RCA_Supervised, MMC_Supervised) # Import this specially for testing. from metric_learn.constraints import wrap_pairs from metric_learn.lmnn import python_LMNN diff --git a/test/test_fit_transform.py b/test/test_fit_transform.py index f898a0fe..41a7dfd2 100644 --- a/test/test_fit_transform.py +++ b/test/test_fit_transform.py @@ -3,9 +3,9 @@ from sklearn.datasets import load_iris from numpy.testing import assert_array_almost_equal -from metric_learn import ( - LMNN, NCA, LFDA, Covariance, MLKR, - LSML_Supervised, ITML_Supervised, SDML_Supervised, RCA_Supervised, MMC_Supervised) +from metric_learn import (LMNN, NCA, LFDA, Covariance, MLKR, + LSML_Supervised, ITML_Supervised, SDML_Supervised, + RCA_Supervised, MMC_Supervised) class TestFitTransform(unittest.TestCase): diff --git a/test/test_sklearn_compat.py b/test/test_sklearn_compat.py index f1e1a09d..cf857b25 100644 --- a/test/test_sklearn_compat.py +++ b/test/test_sklearn_compat.py @@ -2,9 +2,9 @@ import unittest from sklearn.utils.estimator_checks import check_estimator -from metric_learn import ( - LMNN, NCA, LFDA, Covariance, MLKR, - 
LSML_Supervised, ITML_Supervised, SDML_Supervised, RCA_Supervised, MMC_Supervised) +from metric_learn import (LMNN, NCA, LFDA, Covariance, MLKR, + LSML_Supervised, ITML_Supervised, SDML_Supervised, + RCA_Supervised, MMC_Supervised) # Wrap the _Supervised methods with a deterministic wrapper for testing. diff --git a/test/test_utils.py b/test/test_utils.py index a5fb4644..84c224f5 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1,147 +1,175 @@ import pytest +from collections import namedtuple import numpy as np +from sklearn.cross_validation import train_test_split from sklearn.exceptions import DataConversionWarning -from metric_learn import NCA -from metric_learn._util import check_tuples, make_context +from sklearn.utils import check_random_state, shuffle +from sklearn.utils.testing import set_random_state +from sklearn.base import clone +from metric_learn._util import (check_tuples, make_context, check_points, + preprocess_tuples, make_name, + preprocess_points) +from metric_learn import (ITML, LSML, MMC, RCA, SDML, Covariance, LFDA, + LMNN, MLKR, NCA, ITML_Supervised, LSML_Supervised, + MMC_Supervised, RCA_Supervised, SDML_Supervised) +from sklearn.datasets import make_regression, make_blobs +# ---------------------------- test check_tuples ---------------------------- + @pytest.fixture -def X_prep(): - """Basic array for testing when using a preprocessor""" - X = np.array([[1, 2], - [2, 3]]) - return X +def tuples_prep(): + """Basic array for testing when using a preprocessor""" + tuples = np.array([[1, 2], + [2, 3]]) + return tuples @pytest.fixture -def X_no_prep(): - """Basic array for testing when using no preprocessor""" - X = np.array([[[1., 2.3], [2.3, 5.3]], - [[2.3, 4.3], [0.2, 0.4]]]) - return X +def tuples_no_prep(): + """Basic array for testing when using no preprocessor""" + tuples = np.array([[[1., 2.3], [2.3, 5.3]], + [[2.3, 4.3], [0.2, 0.4]]]) + return tuples @pytest.mark.parametrize('estimator, expected', [(NCA(), " by NCA"), 
('NCA', " by NCA"), (None, "")]) def test_make_context(estimator, expected): - """test the make_name function""" - assert make_context(estimator) == expected + """test the make_name function""" + assert make_context(estimator) == expected + + +@pytest.mark.parametrize('estimator, expected', + [(NCA(), "NCA"), ('NCA', "NCA"), (None, None)]) +def test_make_name(estimator, expected): + """test the make_name function""" + assert make_name(estimator) == expected @pytest.mark.parametrize('estimator, context', [(NCA(), " by NCA"), ('NCA', " by NCA"), (None, "")]) -@pytest.mark.parametrize('load_X, preprocessor', - [(X_prep, True), (X_no_prep, False)]) -def test_check_tuples_invalid_t(estimator, context, load_X, preprocessor): +@pytest.mark.parametrize('load_tuples, preprocessor', + [(tuples_prep, True), (tuples_no_prep, False), + (tuples_no_prep, True)]) +def test_check_tuples_invalid_t(estimator, context, load_tuples, preprocessor): """Checks that the exception are raised if t is not the one expected""" - X = load_X() + tuples = load_tuples() expected_msg = ("Tuples of 3 element(s) expected{}. 
Got tuples of 2 " "element(s) instead (shape={}):\ninput={}.\n" - .format(context, X.shape, X)) + .format(context, tuples.shape, tuples)) with pytest.raises(ValueError) as raised_error: - check_tuples(X, t=3, preprocessor=preprocessor, estimator=estimator) + check_tuples(tuples, t=3, preprocessor=preprocessor, estimator=estimator) assert str(raised_error.value) == expected_msg @pytest.mark.parametrize('estimator, context', [(NCA(), " by NCA"), ('NCA', " by NCA"), (None, "")]) -@pytest.mark.parametrize('X, found, expected, preprocessor', - [(5, '0', '2', True), - (5, '0', '3', False), - ([1, 2], '1', '2', True), - ([1, 2], '1', '3', False), - ([[[[5]]]], '4', '2', True), - ([[[[5]]]], '4', '3', False), - ([[1], [3]], '2', '3', False), - ([[[1], [3]]], '3', '2', True)]) -def test_check_tuples_invalid_shape(estimator, context, X, found, expected, - preprocessor): +@pytest.mark.parametrize('tuples, found, expected, preprocessor', + [(5, '0', '2D array of indicators or 3D array of ' + 'formed tuples', True), + (5, '0', '3D array of formed tuples', False), + ([1, 2], '1', '2D array of indicators or 3D array ' + 'of formed tuples', True), + ([1, 2], '1', '3D array of formed tuples', False), + ([[[[5]]]], '4', '2D array of indicators or 3D array' + ' of formed tuples', True), + ([[[[5]]]], '4', '3D array of formed tuples', False), + ([[1], [3]], '2', '3D array of formed ' + 'tuples', False)]) +def test_check_tuples_invalid_shape(estimator, context, tuples, found, + expected, preprocessor): """Checks that a value error with the appropriate message is raised if shape is invalid (not 2D with preprocessor or 3D with no preprocessor) """ - X = np.array(X) - msg = ("{}D array expected{} when using {} preprocessor. Found {}D array " - "instead:\ninput={}.\n" - .format(expected, context, 'a' if preprocessor else 'no', found, X)) + tuples = np.array(tuples) + msg = ("{} expected{}{}. Found {}D array instead:\ninput={}. 
Reshape your " + "data{}.\n" + .format(expected, context, ' when using a preprocessor' + if preprocessor else '', found, tuples, + ' and/or use a preprocessor' if not preprocessor else '')) with pytest.raises(ValueError) as raised_error: - check_tuples(X, preprocessor=preprocessor, ensure_min_samples=0, + check_tuples(tuples, preprocessor=preprocessor, ensure_min_samples=0, estimator=estimator) assert str(raised_error.value) == msg @pytest.mark.parametrize('estimator, context', [(NCA(), " by NCA"), ('NCA', " by NCA"), (None, "")]) -def test_check_tuples_invalid_n_features(estimator, context, X_no_prep): +def test_check_tuples_invalid_n_features(estimator, context, tuples_no_prep): """Checks that the right warning is printed if not enough features Here we only test if no preprocessor (otherwise we don't ensure this) """ msg = ("Found array with 2 feature(s) (shape={}) while" - " a minimum of 3 is required{}.".format(X_no_prep.shape, context)) + " a minimum of 3 is required{}.".format(tuples_no_prep.shape, + context)) with pytest.raises(ValueError) as raised_error: - check_tuples(X_no_prep, preprocessor=False, ensure_min_features=3, + check_tuples(tuples_no_prep, preprocessor=False, ensure_min_features=3, estimator=estimator) assert str(raised_error.value) == msg @pytest.mark.parametrize('estimator, context', [(NCA(), " by NCA"), ('NCA', " by NCA"), (None, "")]) -@pytest.mark.parametrize('load_X, preprocessor', - [(X_prep, True), (X_no_prep, False)]) -def test_check_tuples_invalid_n_samples(estimator, context, load_X, +@pytest.mark.parametrize('load_tuples, preprocessor', + [(tuples_prep, True), (tuples_no_prep, False), + (tuples_no_prep, True)]) +def test_check_tuples_invalid_n_samples(estimator, context, load_tuples, preprocessor): """Checks that the right warning is printed if n_samples is too small""" - X = load_X() + tuples = load_tuples() msg = ("Found array with 2 sample(s) (shape={}) while a minimum of 3 " - "is required{}.".format(X.shape, context)) + "is 
required{}.".format(tuples.shape, context)) with pytest.raises(ValueError) as raised_error: - check_tuples(X, preprocessor=preprocessor, ensure_min_samples=3, + check_tuples(tuples, preprocessor=preprocessor, ensure_min_samples=3, estimator=estimator) assert str(raised_error.value) == msg @pytest.mark.parametrize('estimator, context', [(NCA(), " by NCA"), ('NCA', " by NCA"), (None, "")]) -@pytest.mark.parametrize('load_X, preprocessor', - [(X_prep, True), (X_no_prep, False)]) -def test_check_tuples_invalid_dtype_convertible(estimator, context, load_X, - preprocessor): +@pytest.mark.parametrize('load_tuples, preprocessor', + [(tuples_prep, True), (tuples_no_prep, False), + (tuples_no_prep, True)]) +def test_check_tuples_invalid_dtype_convertible(estimator, context, + load_tuples, preprocessor): """Checks that a warning is raised if a convertible input is converted to float""" - X = load_X().astype(object) + tuples = load_tuples().astype(object) msg = ("Data with input dtype object was converted to float64{}." 
.format(context)) with pytest.warns(DataConversionWarning) as raised_warning: - check_tuples(X, preprocessor=preprocessor, dtype=np.float64, + check_tuples(tuples, preprocessor=preprocessor, dtype=np.float64, warn_on_dtype=True, estimator=estimator) assert str(raised_warning[0].message) == msg -@pytest.mark.parametrize('preprocessor, X', +@pytest.mark.parametrize('preprocessor, tuples', [(True, np.array([['a', 'b'], ['e', 'b']])), (False, np.array([[['b', 'v'], ['a', 'd']], [['x', 'u'], ['c', 'a']]]))]) -def test_check_tuples_invalid_dtype_not_convertible(preprocessor, X): +def test_check_tuples_invalid_dtype_not_convertible(preprocessor, tuples): """Checks that a value error is thrown if attempting to convert an input not convertible to float """ with pytest.raises(ValueError): - check_tuples(X, preprocessor=preprocessor, dtype=np.float64) + check_tuples(tuples, preprocessor=preprocessor, dtype=np.float64) @pytest.mark.parametrize('t', [2, None]) -def test_check_tuples_valid_t(t, X_prep, X_no_prep): +def test_check_tuples_valid_t(t, tuples_prep, tuples_no_prep): """For inputs that have the right matrix dimension (2D or 3D for instance), checks that checking the number of tuples (pairs, quadruplets, etc) raises no warning """ with pytest.warns(None) as record: - check_tuples(X_prep, preprocessor=True, t=t) - check_tuples(X_no_prep, preprocessor=False, t=t) + check_tuples(tuples_prep, preprocessor=True, t=t) + check_tuples(tuples_no_prep, preprocessor=False, t=t) assert len(record) == 0 -@pytest.mark.parametrize('X', +@pytest.mark.parametrize('tuples', [np.array([[2.5, 0.1, 2.6], [1.6, 4.8, 9.1]]), np.array([[2, 0, 2], @@ -153,16 +181,17 @@ def test_check_tuples_valid_t(t, X_prep, X_no_prep): [np.array([2, 0, 2]), np.array([1, 4, 9])], ((2, 0, 2), - (1, 4, 9)) - ]) -def test_check_tuples_valid_with_preprocessor(X): + (1, 4, 9)), + np.array([[[1.2, 2.2], [1.4, 3.3]], + [[2.6, 2.3], [3.4, 5.0]]])]) +def test_check_tuples_valid_with_preprocessor(tuples): """Test that 
valid inputs when using a preprocessor raises no warning""" with pytest.warns(None) as record: - check_tuples(X, preprocessor=True) + check_tuples(tuples, preprocessor=True) assert len(record) == 0 -@pytest.mark.parametrize('X', +@pytest.mark.parametrize('tuples', [np.array([[[2.5], [0.1], [2.6]], [[1.6], [4.8], [9.1]], [[5.6], [2.8], [6.1]]]), @@ -175,40 +204,567 @@ def test_check_tuples_valid_with_preprocessor(X): (((2, 1), (0, 2), (2, 3)), ((1, 2), (4, 4), (9, 3)), ((3, 1), (4, 4), (29, 4)))]) -def test_check_tuples_valid_without_preprocessor(X): +def test_check_tuples_valid_without_preprocessor(tuples): """Test that valid inputs when using no preprocessor raises no warning""" with pytest.warns(None) as record: - check_tuples(X, preprocessor=False) + check_tuples(tuples, preprocessor=False) assert len(record) == 0 -def test_check_tuples_behaviour_auto_dtype(X_no_prep): +def test_check_tuples_behaviour_auto_dtype(tuples_no_prep): """Checks that check_tuples allows by default every type if using a preprocessor, and numeric types if using no preprocessor""" - X_prep = [['img1.png', 'img2.png'], ['img3.png', 'img5.png']] + tuples_prep = [['img1.png', 'img2.png'], ['img3.png', 'img5.png']] with pytest.warns(None) as record: - check_tuples(X_prep, preprocessor=True) + check_tuples(tuples_prep, preprocessor=True) assert len(record) == 0 with pytest.warns(None) as record: - check_tuples(X_no_prep) # numeric type + check_tuples(tuples_no_prep) # numeric type assert len(record) == 0 # not numeric type - X_no_prep = np.array([[['img1.png'], ['img2.png']], - [['img3.png'], ['img5.png']]]) - X_no_prep = X_no_prep.astype(object) + tuples_no_prep = np.array([[['img1.png'], ['img2.png']], + [['img3.png'], ['img5.png']]]) + tuples_no_prep = tuples_no_prep.astype(object) with pytest.raises(ValueError): - check_tuples(X_no_prep) + check_tuples(tuples_no_prep) def test_check_tuples_invalid_complex_data(): """Checks that the right error message is thrown if given complex data ( this 
comes from sklearn's check_array's message)""" - X = np.array([[[1 + 2j, 3 + 4j], [5 + 7j, 5 + 7j]], - [[1 + 3j, 2 + 4j], [5 + 8j, 1 + 7j]]]) + tuples = np.array([[[1 + 2j, 3 + 4j], [5 + 7j, 5 + 7j]], + [[1 + 3j, 2 + 4j], [5 + 8j, 1 + 7j]]]) + msg = ("Complex data not supported\n" + "{}\n".format(tuples)) + with pytest.raises(ValueError) as raised_error: + check_tuples(tuples) + assert str(raised_error.value) == msg + + +# ---------------------------- test check_points ---------------------------- + + +@pytest.fixture +def points_prep(): + """Basic array for testing when using a preprocessor""" + points = np.array([1, 2]) + return points + + +@pytest.fixture +def points_no_prep(): + """Basic array for testing when using no preprocessor""" + points = np.array([[1., 2.3], + [2.3, 4.3]]) + return points + + +@pytest.mark.parametrize('estimator, context', + [(NCA(), " by NCA"), ('NCA', " by NCA"), (None, "")]) +@pytest.mark.parametrize('points, found, expected, preprocessor', + [(5, '0', '1D array of indicators or 2D array of ' + 'formed points', True), + (5, '0', '2D array of formed points', False), + ([1, 2], '1', '2D array of formed points', False), + ([[[5]]], '3', '1D array of indicators or 2D ' + 'array of formed points', True), + ([[[5]]], '3', '2D array of formed points', False)]) +def test_check_points_invalid_shape(estimator, context, points, found, + expected, preprocessor): + """Checks that a value error with the appropriate message is raised if + shape is invalid (valid being 1D or 2D with preprocessor or 2D with no + preprocessor) + """ + points = np.array(points) + msg = ("{} expected{}{}. Found {}D array instead:\ninput={}. 
Reshape your " + "data{}.\n" + .format(expected, context, ' when using a preprocessor' + if preprocessor else '', found, points, + ' and/or use a preprocessor' if not preprocessor else '')) + with pytest.raises(ValueError) as raised_error: + check_points(points, preprocessor=preprocessor, ensure_min_samples=0, + estimator=estimator) + assert str(raised_error.value) == msg + + +@pytest.mark.parametrize('estimator, context', + [(NCA(), " by NCA"), ('NCA', " by NCA"), (None, "")]) +def test_check_points_invalid_n_features(estimator, context, points_no_prep): + """Checks that the right warning is printed if not enough features + Here we only test if no preprocessor (otherwise we don't ensure this) + """ + msg = ("Found array with 2 feature(s) (shape={}) while" + " a minimum of 3 is required{}.".format(points_no_prep.shape, + context)) + with pytest.raises(ValueError) as raised_error: + check_points(points_no_prep, preprocessor=False, ensure_min_features=3, + estimator=estimator) + assert str(raised_error.value) == msg + + +@pytest.mark.parametrize('estimator, context', + [(NCA(), " by NCA"), ('NCA', " by NCA"), (None, "")]) +@pytest.mark.parametrize('load_points, preprocessor', + [(points_prep, True), (points_no_prep, False), + (points_no_prep, True)]) +def test_check_points_invalid_n_samples(estimator, context, load_points, + preprocessor): + """Checks that the right warning is printed if n_samples is too small""" + points = load_points() + msg = ("Found array with 2 sample(s) (shape={}) while a minimum of 3 " + "is required{}.".format(points.shape, context)) + with pytest.raises(ValueError) as raised_error: + check_points(points, preprocessor=preprocessor, ensure_min_samples=3, + estimator=estimator) + assert str(raised_error.value) == msg + + +@pytest.mark.parametrize('estimator, context', + [(NCA(), " by NCA"), ('NCA', " by NCA"), (None, "")]) +@pytest.mark.parametrize('load_points, preprocessor', + [(points_prep, True), (points_no_prep, False), + (points_no_prep, 
True)]) +def test_check_points_invalid_dtype_convertible(estimator, context, + load_points, preprocessor): + """Checks that a warning is raised if a convertible input is converted to + float""" + points = load_points().astype(object) + msg = ("Data with input dtype object was converted to float64{}." + .format(context)) + with pytest.warns(DataConversionWarning) as raised_warning: + check_points(points, preprocessor=preprocessor, dtype=np.float64, + warn_on_dtype=True, estimator=estimator) + assert str(raised_warning[0].message) == msg + + +@pytest.mark.parametrize('preprocessor, points', + [(True, np.array([['a', 'b'], + ['e', 'b']])), + (False, np.array([[['b', 'v'], ['a', 'd']], + [['x', 'u'], ['c', 'a']]]))]) +def test_check_points_invalid_dtype_not_convertible(preprocessor, points): + """Checks that a value error is thrown if attempting to convert an + input not convertible to float + """ + with pytest.raises(ValueError): + check_points(points, preprocessor=preprocessor, dtype=np.float64) + + +@pytest.mark.parametrize('points', + [["img1.png", "img3.png", "img2.png"], + np.array(["img1.png", "img3.png", "img2.png"]), + [2, 0, 2, 1, 4, 9], + range(10), + np.array([2, 0, 2]), + (2, 0, 2), + np.array([[1.2, 2.2], + [2.6, 2.3]])]) +def test_check_points_valid_with_preprocessor(points): + """Test that valid inputs when using a preprocessor raises no warning""" + with pytest.warns(None) as record: + check_points(points, preprocessor=True) + assert len(record) == 0 + + +@pytest.mark.parametrize('points', + [np.array([[2.5, 0.1, 2.6], + [1.6, 4.8, 9.1], + [5.6, 2.8, 6.1]]), + np.array([[2, 0, 2], + [1, 4, 9], + [1, 5, 3]]), + [[2, 0, 2], + [1, 4, 9], + [3, 4, 29]], + ((2, 1, 0, 2, 2, 3), + (1, 2, 4, 4, 9, 3), + (3, 1, 4, 4, 29, 4))]) +def test_check_points_valid_without_preprocessor(points): + """Test that valid inputs when using no preprocessor raises no warning""" + with pytest.warns(None) as record: + check_points(points, preprocessor=False) + assert len(record) == 
0 + + +def test_check_points_behaviour_auto_dtype(points_no_prep): + """Checks that check_points allows by default every type if using a + preprocessor, and numeric types if using no preprocessor""" + points_prep = ['img1.png', 'img2.png', 'img3.png', 'img5.png'] + with pytest.warns(None) as record: + check_points(points_prep, preprocessor=True) + assert len(record) == 0 + + with pytest.warns(None) as record: + check_points(points_no_prep) # numeric type + assert len(record) == 0 + + # not numeric type + points_no_prep = np.array(['img1.png', 'img2.png', 'img3.png', + 'img5.png']) + points_no_prep = points_no_prep.astype(object) + with pytest.raises(ValueError): + check_points(points_no_prep) + + +def test_check_points_invalid_complex_data(): + """Checks that the right error message is thrown if given complex data ( + this comes from sklearn's check_array's message)""" + points = np.array([[[1 + 2j, 3 + 4j], [5 + 7j, 5 + 7j]], + [[1 + 3j, 2 + 4j], [5 + 8j, 1 + 7j]]]) msg = ("Complex data not supported\n" - "{}\n".format(X)) + "{}\n".format(points)) with pytest.raises(ValueError) as raised_error: - check_tuples(X) + check_points(points) assert str(raised_error.value) == msg + + +# ----------------------------- Test preprocessor ----------------------------- + + +X = np.array([[0.89, 0.11, 1.48, 0.12], + [2.63, 1.08, 1.68, 0.46], + [1.00, 0.59, 0.62, 1.15]]) + + +class MockFileLoader: + """Preprocessor that takes a root file path at construction and simulates + fetching the file in the specific root folder when given the name of the + file""" + + def __init__(self, root): + self.root = root + self.folders = {'fake_root': {'img0.png': X[0], + 'img1.png': X[1], + 'img2.png': X[2] + }, + 'other_folder': {} # empty folder + } + + def __call__(self, path_list): + images = list() + for path in path_list: + images.append(self.folders[self.root][path]) + return np.array(images) + + +def mock_id_loader(list_of_indicators): + """A preprocessor as a function that takes 
indicators (strings) and + returns the corresponding samples""" + points = [] + for indicator in list_of_indicators: + points.append(X[int(indicator[2:])]) + return np.array(points) + + +tuples_list = [np.array([[0, 1], + [2, 1]]), + + np.array([['img0.png', 'img1.png'], + ['img2.png', 'img1.png']]), + + np.array([['id0', 'id1'], + ['id2', 'id1']]) + ] + +points_list = [np.array([0, 1, 2, 1]), + + np.array(['img0.png', 'img1.png', 'img2.png', 'img1.png']), + + np.array(['id0', 'id1', 'id2', 'id1']) + ] + +preprocessors = [X, MockFileLoader('fake_root'), mock_id_loader] + + +@pytest.fixture +def y_tuples(): + y = [-1, 1] + return y + + +@pytest.fixture +def y_points(): + y = [0, 1, 0, 0] + return y + + +@pytest.mark.parametrize('preprocessor, tuples', zip(preprocessors, + tuples_list)) +def test_preprocessor_weakly_supervised(preprocessor, tuples, y_tuples): + """Tests different ways to use the preprocessor argument: an array, + a class callable, and a function callable, with a weakly supervised + algorithm + """ + nca = ITML(preprocessor=preprocessor) + nca.fit(tuples, y_tuples) + + +@pytest.mark.parametrize('preprocessor, points', zip(preprocessors, + points_list)) +def test_preprocessor_supervised(preprocessor, points, y_points): + """Tests different ways to use the preprocessor argument: an array, + a class callable, and a function callable, with a supervised algorithm + """ + lfda = LFDA(preprocessor=preprocessor) + lfda.fit(points, y_points) + + +@pytest.mark.parametrize('estimator', ['NCA', NCA(), None]) +def test_preprocess_tuples_invalid_message(estimator): + """Checks that if the preprocessor does some weird stuff, the preprocessed + input is detected as weird. 
Checks this for preprocess_tuples.""" + + if estimator is not None: + estimator_name = make_name(estimator) + (' after the preprocessor ' + 'has been applied') + else: + estimator_name = ('objects that will use preprocessed tuples') + context = make_context(estimator_name) + + def preprocessor(sequence): + return np.ones((len(sequence), 2, 2)) # returns a 3D array instead of 2D + + with pytest.raises(ValueError) as raised_error: + preprocess_tuples(np.ones((3, 2)), + estimator=estimator, preprocessor=preprocessor) + expected_msg = ("3D array of formed tuples expected{}. Found 4D " + "array instead:\ninput={}. Reshape your data{}.\n" + .format(context, np.ones((3, 2, 2, 2)), + ' and/or use a preprocessor' if preprocessor + is not None else '')) + assert str(raised_error.value) == expected_msg + + +@pytest.mark.parametrize('estimator', ['NCA', NCA(), None]) +def test_preprocess_points_invalid_message(estimator): + """Checks that if the preprocessor does some weird stuff, the preprocessed + input is detected as weird. Checks this for preprocess_points.""" + + if estimator is not None: + estimator_name = make_name(estimator) + (' after the preprocessor ' + 'has been applied') + else: + estimator_name = ('objects that will use preprocessed points') + context = make_context(estimator_name) + + def preprocessor(sequence): + return np.ones((len(sequence), 2, 2)) # returns a 3D array instead of 2D + + with pytest.raises(ValueError) as raised_error: + preprocess_points(np.ones((3,)), + estimator=estimator, preprocessor=preprocessor) + expected_msg = ("2D array of formed points expected{}. " + "Found 3D array instead:\ninput={}. 
Reshape your data{}.\n" + .format(context, np.ones((3, 2, 2)), + ' and/or use a preprocessor' if preprocessor + is not None else '')) + assert str(raised_error.value) == expected_msg + + +def test_progress_message_preprocessor_points(capsys): + """Tests that when using a preprocessor on points, a message is printed + """ + points = np.array([1, 2, 4]) + + def fun(row): + return [[1, 1], [3, 3], [4, 4]] + + preprocess_points(points, preprocessor=fun) + out, _ = capsys.readouterr() + assert out == "Preprocessing points...\n" + + +def test_progress_message_preprocessor_tuples(capsys): + """Tests that when using a preprocessor on points, a message is printed + """ + tuples = np.array([[1, 2], + [2, 3], + [4, 5]]) + + def fun(row): + return [[1, 1], [3, 3], [4, 4]] + + preprocess_tuples(tuples, preprocessor=fun) + out, _ = capsys.readouterr() + assert out == "Preprocessing tuples...\n" + + +@pytest.mark.parametrize('estimator', [ITML(), LSML(), MMC(), SDML()], + ids=['ITML', 'LSML', 'MMC', 'SDML']) +def test_error_message_t(estimator): + """Tests that if a tuples learner is not given the good number of points + per tuple, it throws an error message""" + estimator = clone(estimator) + set_random_state(estimator) + invalid_pairs = np.array([[[1.3, 6.3], [3., 6.8], [6.5, 4.4]], + [[1.9, 5.3], [1., 7.8], [3.2, 1.2]]]) + y = [1, 1] + with pytest.raises(ValueError) as raised_err: + estimator.fit(invalid_pairs, y) + expected_msg = ("Tuples of {} element(s) expected{}. 
Got tuples of 3 " + "element(s) instead (shape=(2, 3, 2)):\ninput={}.\n" + .format(estimator._t, make_context(estimator), + invalid_pairs)) + assert str(raised_err.value) == expected_msg + + +@pytest.mark.parametrize('estimator', [ITML(), LSML(), MMC(), RCA(), SDML(), + Covariance(), LFDA(), LMNN(), MLKR(), + NCA(), ITML_Supervised(), + LSML_Supervised(), MMC_Supervised(), + RCA_Supervised(), SDML_Supervised()], + ids=['ITML', 'LSML', 'MMC', 'RCA', 'SDML', + 'Covariance', 'LFDA', 'LMNN', 'MLKR', 'NCA', + 'ITML_Supervised', 'LSML_Supervised', + 'MMC_Supervised', 'RCA_Supervised', + 'SDML_Supervised']) +def test_error_message_t_score_pairs(estimator): + """tests that if you want to score_pairs on triplets for instance, it returns + the right error message + """ + estimator = clone(estimator) + set_random_state(estimator) + estimator.check_preprocessor() + triplets = np.array([[[1.3, 6.3], [3., 6.8], [6.5, 4.4]], + [[1.9, 5.3], [1., 7.8], [3.2, 1.2]]]) + with pytest.raises(ValueError) as raised_err: + estimator.score_pairs(triplets) + expected_msg = ("Tuples of 2 element(s) expected{}. 
Got tuples of 3 " + "element(s) instead (shape=(2, 3, 2)):\ninput={}.\n" + .format(make_context(estimator), triplets)) + assert str(raised_err.value) == expected_msg + + +def test_preprocess_tuples_simple_example(): + """Test the preprocessor on a very simple example of tuples to ensure the + result is as expected""" + array = np.array([[1, 2], + [2, 3], + [4, 5]]) + + def fun(row): + return np.array([[1, 1], [3, 3], [4, 4]]) + + expected_result = np.array([[[1, 1], [1, 1]], + [[3, 3], [3, 3]], + [[4, 4], [4, 4]]]) + + assert (preprocess_tuples(array, fun) == expected_result).all() + + +def test_preprocess_points_simple_example(): + """Test the preprocessor on very simple examples of points to ensure the + result is as expected""" + array = np.array([1, 2, 4]) + + def fun(row): + return [[1, 1], [3, 3], [4, 4]] + + expected_result = np.array([[1, 1], + [3, 3], + [4, 4]]) + + assert (preprocess_points(array, fun) == expected_result).all() + + +# ---------------------------------------------------------------------------- +# test that supervised algorithms using a preprocessor behave consistently +# with their no-preprocessor equivalent + + +Dataset = namedtuple('Dataset', 'formed_points points_indicators labels data') + + +@pytest.fixture +def build_classification(rng): + """Basic array for testing when using a preprocessor""" + X, y = shuffle(*make_blobs(random_state=rng), + random_state=rng) + indices = shuffle(np.arange(X.shape[0]), random_state=rng) + indices = indices.astype(int) + return Dataset(X[indices], indices, y, X) + + +@pytest.fixture +def build_regression(rng): + """Basic array for testing when using a preprocessor""" + X, y = shuffle(*make_regression(n_samples=100, n_features=5, + random_state=rng), + random_state=rng) + indices = shuffle(np.arange(X.shape[0]), random_state=rng) + indices = indices.astype(int) + return Dataset(X[indices], indices, y, X) + + +RNG = check_random_state(0) + +classifiers = [Covariance(), + LFDA(), + LMNN(), + NCA(), + 
RCA(), + ITML_Supervised(max_iter=5), + LSML_Supervised(), + MMC_Supervised(max_iter=5), + RCA_Supervised(num_chunks=10), # less chunks because we only + # have a few data in the test + SDML_Supervised()] + +regressors = [MLKR()] + +estimators = [(classifier, build_classification(RNG)) for classifier in + classifiers] +estimators += [(regressor, build_regression(RNG)) for regressor in + regressors] + +ids_estimators = list(map(lambda x: x.__class__.__name__, classifiers + + regressors)) + + +@pytest.mark.parametrize('estimator, dataset', estimators, + ids=ids_estimators) +def test_same_with_or_without_preprocessor(estimator, dataset): + + (formed_points_train, formed_points_test, + y_train, y_test, points_indicators_train, + points_indicators_test) = train_test_split(dataset.formed_points, + dataset.labels, + dataset.points_indicators, + random_state=RNG) + + estimator_without_prep = clone(estimator) + set_random_state(estimator_without_prep) + estimator_without_prep.set_params(preprocessor=None) + estimator_without_prep.fit(formed_points_train, y_train) + embedding_without_prep = estimator_without_prep.transform(formed_points_test) + + estimator_with_prep = clone(estimator) + set_random_state(estimator_with_prep) + estimator_with_prep.set_params(preprocessor=dataset.data) + estimator_with_prep.fit(points_indicators_train, y_train) + embedding_with_prep = estimator_with_prep.transform(points_indicators_test) + + estimator_with_prep_formed = clone(estimator) + set_random_state(estimator_with_prep_formed) + estimator_with_prep_formed.set_params(preprocessor=dataset.data) + estimator_with_prep_formed.fit(formed_points_train, y_train) + embedding_with_prep_formed = estimator_with_prep_formed.transform( + formed_points_test) + + # test transform + assert (embedding_with_prep == embedding_without_prep).all() + assert (embedding_with_prep == embedding_with_prep_formed).all() + + # test score_pairs + assert (estimator_without_prep.score_pairs( + 
formed_points_test[np.array([[0, 2], [5, 3]])]) == + estimator_with_prep.score_pairs( + points_indicators_test[np.array([[0, 2], [5, 3]])])).all() + + assert ( + estimator_with_prep.score_pairs( + points_indicators_test[np.array([[0, 2], [5, 3]])]) == + estimator_with_prep_formed.score_pairs( + formed_points_test[np.array([[0, 2], [5, 3]])])).all() diff --git a/test/test_weakly_supervised.py b/test/test_weakly_supervised.py index f9e7486a..1c8d93de 100644 --- a/test/test_weakly_supervised.py +++ b/test/test_weakly_supervised.py @@ -1,5 +1,5 @@ import pytest -from sklearn.datasets import load_iris +from sklearn.datasets import load_iris, make_regression, make_blobs from sklearn.pipeline import make_pipeline from sklearn.utils import shuffle, check_random_state from sklearn.utils.estimator_checks import is_public_parameter @@ -7,11 +7,14 @@ set_random_state) from sklearn.utils.fixes import signature -from metric_learn import ITML, MMC, SDML, LSML +from metric_learn import (ITML, LFDA, LMNN, LSML, MLKR, MMC, NCA, RCA, SDML, + ITML_Supervised, LSML_Supervised, MMC_Supervised, + SDML_Supervised) from metric_learn.constraints import wrap_pairs, Constraints from sklearn import clone import numpy as np -from sklearn.model_selection import cross_val_score, train_test_split +from sklearn.model_selection import (cross_val_score, cross_val_predict, + train_test_split) RNG = check_random_state(0) @@ -21,7 +24,7 @@ def build_data(): dataset = load_iris() X, y = shuffle(dataset.data, dataset.target, random_state=RNG) - num_constraints = 20 + num_constraints = 50 constraints = Constraints.random_subset(y, random_state=RNG) pairs = constraints.positive_negative_pairs(num_constraints, same_length=True, @@ -29,17 +32,35 @@ def build_data(): return X, pairs +def build_classification(preprocessor): + # test that you can do cross validation on tuples of points with + # a WeaklySupervisedMetricLearner + X, y = shuffle(*make_blobs(), random_state=RNG) + X_train, X_test, y_train, y_test = 
train_test_split(X, y, random_state=RNG) + return (X, X, y, X_train, X_test, y_train, y_test, preprocessor) + + +def build_regression(preprocessor): + # test that you can do cross validation on tuples of points with + # a WeaklySupervisedMetricLearner + X, y = shuffle(*make_regression(n_samples=100, n_features=10), + random_state=RNG) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RNG) + return (X, X, y, X_train, X_test, y_train, y_test, preprocessor) + + def build_pairs(preprocessor): # test that you can do cross validation on tuples of points with # a WeaklySupervisedMetricLearner - X, pairs = build_data() + X, indices = build_data() if preprocessor is not None: # if preprocessor, we build a 2D array of pairs of indices - _, y = wrap_pairs(X, pairs) - pairs = np.vstack([np.column_stack(pairs[:2]), np.column_stack(pairs[2:])]) + _, y = wrap_pairs(X, indices) + pairs = np.vstack([np.column_stack(indices[:2]), + np.column_stack(indices[2:])]) else: # if not, we build a 3D array of pairs of samples - pairs, y = wrap_pairs(X, pairs) + pairs, y = wrap_pairs(X, indices) pairs, y = shuffle(pairs, y, random_state=RNG) (pairs_train, pairs_test, y_train, y_test) = train_test_split(pairs, y, random_state=RNG) @@ -50,8 +71,8 @@ def build_pairs(preprocessor): def build_quadruplets(preprocessor): # test that you can do cross validation on a tuples of points with # a WeaklySupervisedMetricLearner - X, pairs = build_data() - c = np.column_stack(pairs) + X, indices = build_data() + c = np.column_stack(indices) if preprocessor is not None: # if preprocessor, we build a 2D array of quadruplets of indices quadruplets = c @@ -67,16 +88,34 @@ def build_quadruplets(preprocessor): list_estimators = [(ITML(), build_pairs), + (LFDA(), build_classification), + (LMNN(), build_classification), (LSML(), build_quadruplets), + (MLKR(), build_regression), (MMC(max_iter=2), build_pairs), # max_iter=2 for faster # testing - (SDML(), build_pairs) + (NCA(), 
build_classification), + (RCA(), build_classification), + (SDML(), build_pairs), + (ITML_Supervised(), build_classification), + (LSML_Supervised(), build_classification), + (MMC_Supervised(), build_classification), + (SDML_Supervised(), build_classification) ] ids_estimators = ['itml', + 'lfda', + 'lmnn', 'lsml', + 'mlkr', 'mmc', + 'nca', + 'rca', 'sdml', + 'itml_supervised', + 'lsml_supervised', + 'mmc_supervised', + 'sdml_supervised' ] @@ -84,38 +123,44 @@ def build_quadruplets(preprocessor): @pytest.mark.parametrize('estimator, build_dataset', list_estimators, ids=ids_estimators) def test_cross_validation(estimator, build_dataset, preprocessor): - (X, tuples, y, tuples_train, tuples_test, - y_train, y_test, preprocessor) = build_dataset(preprocessor) - estimator = clone(estimator) - estimator.set_params(preprocessor=preprocessor) - set_random_state(estimator) - - assert np.isfinite(cross_val_score(estimator, tuples, y)).all() + if any(hasattr(estimator, method) for method in ["predict", "score"]): + (X, tuples, y, tuples_train, tuples_test, + y_train, y_test, preprocessor) = build_dataset(preprocessor) + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + set_random_state(estimator) + if hasattr(estimator, "score"): + assert np.isfinite(cross_val_score(estimator, tuples, y)).all() + if hasattr(estimator, "predict"): + assert np.isfinite(cross_val_predict(estimator, tuples, y)).all() def check_score(estimator, tuples, y): - score = estimator.score(tuples, y) - assert np.isfinite(score) + if hasattr(estimator, "score"): + score = estimator.score(tuples, y) + assert np.isfinite(score) def check_predict(estimator, tuples): - y_predicted = estimator.predict(tuples) - assert len(y_predicted), len(tuples) + if hasattr(estimator, "predict"): + y_predicted = estimator.predict(tuples) + assert len(y_predicted), len(tuples) @pytest.mark.parametrize('preprocessor', [None, build_data()[0]]) @pytest.mark.parametrize('estimator, build_dataset', 
list_estimators, ids=ids_estimators) def test_simple_estimator(estimator, build_dataset, preprocessor): - (X, tuples, y, tuples_train, tuples_test, - y_train, y_test, preprocessor) = build_dataset(preprocessor) - estimator = clone(estimator) - estimator.set_params(preprocessor=preprocessor) - set_random_state(estimator) + if any(hasattr(estimator, method) for method in ["predict", "score"]): + (X, tuples, y, tuples_train, tuples_test, + y_train, y_test, preprocessor) = build_dataset(preprocessor) + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + set_random_state(estimator) - estimator.fit(tuples_train, y_train) - check_score(estimator, tuples_test, y_test) - check_predict(estimator, tuples_test) + estimator.fit(tuples_train, y_train) + check_score(estimator, tuples_test, y_test) + check_predict(estimator, tuples_test) @pytest.mark.parametrize('estimator', [est[0] for est in list_estimators], @@ -217,6 +262,83 @@ def test_dict_unchanged(estimator, build_dataset, preprocessor): % method) +@pytest.mark.parametrize('estimator, build_dataset', + [(ITML(), build_pairs), + (LSML(), build_quadruplets), + (MMC(max_iter=2), build_pairs), + (SDML(), build_pairs)], + ids=['itml', 'lsml', 'mmc', 'sdml']) +def test_same_result_with_or_without_preprocessor(estimator, build_dataset): + (X, tuples, y, tuples_train, tuples_test, y_train, + y_test, _) = build_dataset(preprocessor=True) + formed_tuples_train = X[tuples_train] + formed_tuples_test = X[tuples_test] + + estimator_with_preprocessor = clone(estimator) + set_random_state(estimator_with_preprocessor) + estimator_with_preprocessor.set_params(preprocessor=X) + if estimator.__class__.__name__ == 'LSML': + estimator_with_preprocessor.fit(tuples_train) + else: + estimator_with_preprocessor.fit(tuples_train, y_train) + + estimator_without_preprocessor = clone(estimator) + set_random_state(estimator_without_preprocessor) + estimator_without_preprocessor.set_params(preprocessor=None) + if 
estimator.__class__.__name__ == 'LSML': + estimator_without_preprocessor.fit(formed_tuples_train) + else: + estimator_without_preprocessor.fit(formed_tuples_train, y_train) + + estimator_with_prep_formed = clone(estimator) + set_random_state(estimator_with_prep_formed) + estimator_with_prep_formed.set_params(preprocessor=X) + if estimator.__class__.__name__ == 'LSML': + estimator_with_prep_formed.fit(tuples_train) + else: + estimator_with_prep_formed.fit(tuples_train, y_train) + + # test prediction methods + for method in ["predict", "decision_function"]: + if hasattr(estimator, method): + output_with_prep = getattr(estimator_with_preprocessor, + method)(tuples_test) + output_without_prep = getattr(estimator_without_preprocessor, + method)(formed_tuples_test) + assert np.array(output_with_prep == output_without_prep).all() + output_with_prep = getattr(estimator_with_preprocessor, + method)(tuples_test) + output_with_prep_formed = getattr(estimator_with_prep_formed, + method)(formed_tuples_test) + assert np.array(output_with_prep == output_with_prep_formed).all() + + # test score_pairs + output_with_prep = estimator_with_preprocessor.score_pairs( + tuples_test[:, :2]) + output_without_prep = estimator_without_preprocessor.score_pairs( + formed_tuples_test[:, :2]) + assert np.array(output_with_prep == output_without_prep).all() + + output_with_prep = estimator_with_preprocessor.score_pairs( + tuples_test[:, :2]) + output_without_prep = estimator_with_prep_formed.score_pairs( + formed_tuples_test[:, :2]) + assert np.array(output_with_prep == output_without_prep).all() + + # test transform + output_with_prep = estimator_with_preprocessor.transform( + tuples_test[:, 0]) + output_without_prep = estimator_without_preprocessor.transform( + formed_tuples_test[:, 0]) + assert np.array(output_with_prep == output_without_prep).all() + + output_with_prep = estimator_with_preprocessor.transform( + tuples_test[:, 0]) + output_without_prep = estimator_with_prep_formed.transform( + 
formed_tuples_test[:, 0]) + assert np.array(output_with_prep == output_without_prep).all() + + @pytest.mark.parametrize('preprocessor', [None, build_data()[0]]) @pytest.mark.parametrize('estimator, build_dataset', list_estimators, ids=ids_estimators) @@ -263,87 +385,20 @@ def test_dont_overwrite_parameters(estimator, build_dataset, preprocessor): def _get_args(function, varargs=False): - """Helper to get function arguments""" - - try: - params = signature(function).parameters - except ValueError: - # Error on builtin C function - return [] - args = [key for key, param in params.items() - if param.kind not in (param.VAR_POSITIONAL, param.VAR_KEYWORD)] - if varargs: - varargs = [param.name for param in params.values() - if param.kind == param.VAR_POSITIONAL] - if len(varargs) == 0: - varargs = None - return args, varargs - else: - return args - - -# ----------------------------- Test preprocessor ----------------------------- - - -X = np.array([[0.89, 0.11, 1.48, 0.12], - [2.63, 1.08, 1.68, 0.46], - [1.00, 0.59, 0.62, 1.15]]) - - -class MockFileLoader: - """Preprocessor that takes a root file path at construction and simulates - fetching the file in the specific root folder when given the name of the - file""" - - def __init__(self, root): - self.root = root - self.folders = {'fake_root': {'img0.png': X[0], - 'img1.png': X[1], - 'img2.png': X[2] - }, - 'other_folder': {} # empty folder - } - - def __call__(self, path_list): - images = list() - for path in path_list: - images.append(self.folders[self.root][path]) - return np.array(images) - - -def mock_id_loader(list_of_indicators): - """A preprocessor as a function that takes indicators (strings) and - returns the corresponding samples""" - points = [] - for indicator in list_of_indicators: - points.append(X[int(indicator[2:])]) - return np.array(points) - - -tuples_list = [np.array([[0, 1], - [2, 1]]), - - np.array([['img0.png', 'img1.png'], - ['img2.png', 'img1.png']]), - - np.array([['id0', 'id1'], - ['id2', 
'id1']]) - ] - -preprocessors = [X, MockFileLoader('fake_root'), mock_id_loader] - - -@pytest.fixture -def y_tuples(): - y = [-1, 1] - return y - - -@pytest.mark.parametrize('preprocessor, tuples', zip(preprocessors, - tuples_list)) -def test_preprocessor(preprocessor, tuples, y_tuples): - """Tests different ways to use the preprocessor argument: an array, - a class callable, and a function callable - """ - nca = ITML(preprocessor=preprocessor) - nca.fit(tuples, y_tuples) + """Helper to get function arguments""" + + try: + params = signature(function).parameters + except ValueError: + # Error on builtin C function + return [] + args = [key for key, param in params.items() + if param.kind not in (param.VAR_POSITIONAL, param.VAR_KEYWORD)] + if varargs: + varargs = [param.name for param in params.values() + if param.kind == param.VAR_POSITIONAL] + if len(varargs) == 0: + varargs = None + return args, varargs + else: + return args From 6ae7ba5205ec7e8873ce7991f14c5770ba5fefed Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Mon, 24 Sep 2018 10:49:39 +0200 Subject: [PATCH 048/120] TST: add random seed for _Supervised classes --- test/test_utils.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 84c224f5..d52bf266 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -734,22 +734,31 @@ def test_same_with_or_without_preprocessor(estimator, dataset): dataset.points_indicators, random_state=RNG) + def make_random_state(estimator): + rs = {} + if estimator.__class__.__name__[-11:] == '_Supervised': + rs['random_state'] = check_random_state(0) + return rs + estimator_without_prep = clone(estimator) set_random_state(estimator_without_prep) estimator_without_prep.set_params(preprocessor=None) - estimator_without_prep.fit(formed_points_train, y_train) + estimator_without_prep.fit(formed_points_train, y_train, + **make_random_state(estimator)) embedding_without_prep = 
estimator_without_prep.transform(formed_points_test) estimator_with_prep = clone(estimator) set_random_state(estimator_with_prep) estimator_with_prep.set_params(preprocessor=dataset.data) - estimator_with_prep.fit(points_indicators_train, y_train) + estimator_with_prep.fit(points_indicators_train, y_train, + **make_random_state(estimator)) embedding_with_prep = estimator_with_prep.transform(points_indicators_test) estimator_with_prep_formed = clone(estimator) set_random_state(estimator_with_prep_formed) estimator_with_prep_formed.set_params(preprocessor=dataset.data) - estimator_with_prep_formed.fit(formed_points_train, y_train) + estimator_with_prep_formed.fit(formed_points_train, y_train, + **make_random_state(estimator)) embedding_with_prep_formed = estimator_with_prep_formed.transform( formed_points_test) From a1c8a670b6a584077f05b7fa10cb0444c94f43d1 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Mon, 24 Sep 2018 14:39:01 +0200 Subject: [PATCH 049/120] TST: Adapt test pipeline - use random state for _Supervised classes - call transform only for pipeline with a TransformerMixin --- test/test_weakly_supervised.py | 43 ++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/test/test_weakly_supervised.py b/test/test_weakly_supervised.py index 1c8d93de..98007423 100644 --- a/test/test_weakly_supervised.py +++ b/test/test_weakly_supervised.py @@ -1,4 +1,5 @@ import pytest +from sklearn.base import TransformerMixin from sklearn.datasets import load_iris, make_regression, make_blobs from sklearn.pipeline import make_pipeline from sklearn.utils import shuffle, check_random_state @@ -214,22 +215,38 @@ def test_estimators_fit_returns_self(estimator, build_dataset, preprocessor): def test_pipeline_consistency(estimator, build_dataset, preprocessor): # Adapted from scikit learn # check that make_pipeline(est) gives same score as est - (X, tuples, y, tuples_train, tuples_test, - y_train, y_test, preprocessor) = 
build_dataset(preprocessor) + (_, inputs, y, _, _, _, _, preprocessor) = build_dataset(preprocessor) + + def make_random_state(estimator, in_pipeline): + rs = {} + name_estimator = estimator.__class__.__name__ + if name_estimator[-11:] == '_Supervised': + name_param = 'random_state' + if in_pipeline: + name_param = name_estimator.lower() + '__' + name_param + rs[name_param] = check_random_state(0) + return rs + estimator = clone(estimator) estimator.set_params(preprocessor=preprocessor) pipeline = make_pipeline(estimator) - estimator.fit(tuples, y) - pipeline.fit(tuples, y) - - funcs = ["score", "fit_transform"] - - for func_name in funcs: - func = getattr(estimator, func_name, None) - if func is not None: - func_pipeline = getattr(pipeline, func_name) - result = func(tuples, y) - result_pipe = func_pipeline(tuples, y) + estimator.fit(inputs, y, **make_random_state(estimator, False)) + pipeline.fit(inputs, y, **make_random_state(estimator, True)) + + if hasattr(estimator, 'score'): + result = estimator.score(inputs, y) + result_pipe = pipeline.score(inputs, y) + assert_allclose_dense_sparse(result, result_pipe) + + if hasattr(estimator, 'predict'): + result = estimator.predict(inputs) + result_pipe = pipeline.predict(inputs) + assert_allclose_dense_sparse(result, result_pipe) + + if issubclass(estimator.__class__, TransformerMixin): + if hasattr(estimator, 'transform'): + result = estimator.transform(inputs) + result_pipe = pipeline.transform(inputs) assert_allclose_dense_sparse(result, result_pipe) From 735f9757133015943f8c9ca87626f210c849be0d Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Mon, 24 Sep 2018 16:27:14 +0200 Subject: [PATCH 050/120] TST: fix test_progress_message_preprocessor_tuples by making func return an np.array --- test/test_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_utils.py b/test/test_utils.py index d52bf266..66932d22 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -584,7 +584,7 
@@ def test_progress_message_preprocessor_tuples(capsys): [4, 5]]) def fun(row): - return [[1, 1], [3, 3], [4, 4]] + return np.array([[1, 1], [3, 3], [4, 4]]) preprocess_tuples(tuples, preprocessor=fun) out, _ = capsys.readouterr() From 35862089640c8715f41aaf2a38a909d9a46144e3 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Mon, 8 Oct 2018 15:33:28 +0200 Subject: [PATCH 051/120] Remove deprecated cross_validation import and put model_selection instead --- test/test_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_utils.py b/test/test_utils.py index 66932d22..96fdc24a 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1,7 +1,7 @@ import pytest from collections import namedtuple import numpy as np -from sklearn.cross_validation import train_test_split +from sklearn.model_selection import train_test_split from sklearn.exceptions import DataConversionWarning from sklearn.utils import check_random_state, shuffle from sklearn.utils.testing import set_random_state From 51d7e0742b5183a2cbf515fa532ca33be2076177 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Mon, 15 Oct 2018 13:08:50 +0200 Subject: [PATCH 052/120] WIP replace checks by unique check_input function --- metric_learn/_util.py | 573 +++++++-------------------------- metric_learn/base_metric.py | 33 +- metric_learn/covariance.py | 9 +- metric_learn/itml.py | 17 +- metric_learn/lfda.py | 6 +- metric_learn/lmnn.py | 23 +- metric_learn/lsml.py | 18 +- metric_learn/mlkr.py | 9 +- metric_learn/mmc.py | 16 +- metric_learn/nca.py | 8 +- metric_learn/rca.py | 14 +- metric_learn/sdml.py | 16 +- test/test_mahalanobis_mixin.py | 9 +- test/test_utils.py | 210 ++++++------ test/test_weakly_supervised.py | 8 +- 15 files changed, 320 insertions(+), 649 deletions(-) diff --git a/metric_learn/_util.py b/metric_learn/_util.py index f547c9c4..516eb135 100644 --- a/metric_learn/_util.py +++ b/metric_learn/_util.py @@ -1,8 +1,7 @@ import numpy as np import six -from 
sklearn.utils import check_array, column_or_1d -from sklearn.utils.validation import (_assert_all_finite, - check_consistent_length) +from sklearn.utils import check_array +from sklearn.utils.validation import check_X_y # hack around lack of axis kwarg in older numpy versions try: @@ -15,475 +14,141 @@ def vector_norm(X): return np.linalg.norm(X, axis=1) -def check_tuples(tuples, preprocessor=False, t=None, dtype="auto", - order=None, copy=False, force_all_finite=True, - ensure_min_samples=1, ensure_min_features=1, - warn_on_dtype=False, estimator=None): - """Check that `tuples` is a valid array of tuples. - - Depending on whether a preprocessor is available or not, `tuples` should be: - - a 3D array of formed tuples or a 2D array of tuples of indicators if a - preprocessor is available - - a 3D array of formed tuples if no preprocessor is available - - The number of elements in a tuple (e.g. 2 for pairs) should be the right - one, specified by the parameter `t`. - `check_tuples` will then convert the tuples to the right format as - `sklearn.utils.validation.check_array` would do. See - `sklearn.utils.validation.check_array` for more details. +def check_input(input, y=None, preprocessor=None, + type_of_inputs='classic', t=None, accept_sparse=False, + dtype="numeric", order=None, + copy=False, force_all_finite=True, + multi_output=False, ensure_min_samples=1, + ensure_min_features=1, y_numeric=False, + warn_on_dtype=False, estimator=None): + """Checks that the input format is valid and does conversions if specified + (this is the equivalent of scikit-learn's `check_array` or `check_X_y`). + All arguments following t are scikit-learn's `check_array` or `check_X_y` + arguments that will be enforced on the output array Parameters ---------- - tuples : object - The tuples to check. - - t : int or None (default=None) - The number of elements to ensure there is in every tuple (e.g. 2 for - pairs). If None, the number of tuples is not checked. 
- - dtype : string, type, list of types or None (default="auto") - Data type of result. If None, the dtype of the input is preserved. - If "numeric", dtype is preserved unless array.dtype is object. - If dtype is a list of types, conversion on the first type is only - performed if the dtype of the input is not in the list. If - "auto", will we be set to "numeric" if `preprocessor=True`, - else to None. - - order : 'F', 'C' or None (default=None) - Whether an array will be forced to be fortran or c-style. - - copy : boolean (default=False) - Whether a forced copy will be triggered. If copy=False, a copy might - be triggered by a conversion. - - force_all_finite : boolean or 'allow-nan', (default=True) - Whether to raise an error on np.inf and np.nan in `tuples`. - This parameter does not influence whether y can have np.inf or np.nan - values. The possibilities are: - - - True: Force all values of `tuples` to be finite. - - False: accept both np.inf and np.nan in `tuples`. - - 'allow-nan': accept only np.nan values in `tuples`. Values - cannot be infinite. - - ensure_min_samples : int (default=1) - Make sure that `tuples` has a minimum number of samples in its first - axis (rows for a 2D array). - - ensure_min_features : int (default=1) - Only used when using no preprocessor. Make sure that each point in the 3D - array of tuples has some minimum number of features (axis=2). The default - value of 1 rejects empty datasets. This check is only enforced when X has - effectively 3 dimensions. Setting to 0 disables this check. - - warn_on_dtype : boolean (default=False) - Raise DataConversionWarning if the dtype of the input data structure - does not match the requested dtype, causing a memory copy. - - estimator : str or estimator instance (default=None) - If passed, include the name of the estimator in warning messages. 
+ input : object + The input to check + y : object (optional, default=None) + The + preprocessor + type_of_inputs + t + accept_sparse + dtype + order + copy + force_all_finite + multi_output + ensure_min_samples + ensure_min_features + y_numeric + warn_on_dtype + estimator Returns ------- - tuples_valid : object - The validated tuples. - """ - if dtype == "auto": - dtype = 'numeric' if not preprocessor else None - - context = make_context(estimator) - tuples = check_array(tuples, dtype=dtype, accept_sparse=False, copy=copy, - force_all_finite=force_all_finite, - order=order, - ensure_2d=False, # tuples can be 2D or 3D - allow_nd=True, - ensure_min_samples=ensure_min_samples, - # ensure_min_features only works if ndim=2, so we will - # have to check again if input is 3D (see below) - ensure_min_features=0, - # if 2D and preprocessor, no notion of - # "features". If 3D and no preprocessor, min_features - # is checked below - estimator=estimator, - warn_on_dtype=warn_on_dtype) - - if tuples.ndim == 2 and preprocessor: # in this case there is left to check - # if t is OK - check_t(tuples, t, context) - elif tuples.ndim == 3: - # if the dimension is 3 we still have to check that the num_features is OK - if ensure_min_features > 0: - n_features = tuples.shape[2] - if n_features < ensure_min_features: - raise ValueError("Found array with {} feature(s) (shape={}) while" - " a minimum of {} is required{}." - .format(n_features, tuples.shape, ensure_min_features, - context)) - # then we should also check that t is OK - check_t(tuples, t, context) - else: - expected_shape = ('2D array of indicators or 3D array of formed tuples' - if preprocessor else '3D array of formed tuples') - with_prep = ' when using a preprocessor' if preprocessor else '' - should_use_prep = ' and/or use a preprocessor' if not preprocessor else '' - raise ValueError("{} expected{}{}. Found {}D array " - "instead:\ninput={}. 
Reshape your data{}.\n" - .format(expected_shape, context, with_prep, - tuples.ndim, tuples, should_use_prep)) - return tuples - - -def check_tuples_y(tuples, y, preprocessor=False, t=None, dtype="auto", - order=None, copy=False, force_all_finite=True, - multi_output=False, ensure_min_samples=1, - ensure_min_features=1, y_numeric=False, - warn_on_dtype=False, estimator=None): - """Input validation for standard estimators. - - Adapted from `sklearn.utils.validation.check_X_y`. - Checks tuples with `check_tuples`, and checks that the size of `y` and - `tuples` are consistent. In addition, standard input checks are only - applied to y, such as checking that y does not have np.nan or np.inf - targets. For multi-label y, set multi_output=True to allow 2d and sparse y. - - Parameters - ---------- - tuples : 3D array of formed tuples or 2D array of tuples indicators - Input tuples. - - y : nd-array, list or sparse matrix - Labels. - - preprocessor : boolean - Whether a preprocessor is available or not (the input format depends - on that) (See `check_tuples` for more information) - dtype : string, type, list of types or None (default="numeric") - Data type of result. If None, the dtype of the input is preserved. - If "numeric", dtype is preserved unless array.dtype is object. - If dtype is a list of types, conversion on the first type is only - performed if the dtype of the input is not in the list. - - order : 'F', 'C' or None (default=None) - Whether an array will be forced to be fortran or c-style. - - copy : boolean (default=False) - Whether a forced copy will be triggered. If copy=False, a copy might - be triggered by a conversion. - - force_all_finite : boolean (default=True) - Whether to raise an error on np.inf and np.nan in X. This parameter - does not influence whether y can have np.inf or np.nan values. - - multi_output : boolean (default=False) - Whether to allow 2-d y (array or sparse matrix). If false, y will be - validated as a vector. 
y cannot have np.nan or np.inf values if - multi_output=True. - - ensure_min_samples : int (default=1) - Make sure that X has a minimum number of samples in its first - axis (rows for a 2D array). - - ensure_min_features : int (default=1) - Make sure that the tuples has some minimum number of features - The default value of 1 rejects empty datasets. - Setting to 0 disables this check. - - y_numeric : boolean (default=False) - Whether to ensure that y has a numeric type. If dtype of y is object, - it is converted to float64. Should only be used for regression - algorithms. - - warn_on_dtype : boolean (default=False) - Raise DataConversionWarning if the dtype of the input data structure - does not match the requested dtype, causing a memory copy. - - estimator : str or estimator instance (default=None) - If passed, include the name of the estimator in warning messages. - - Returns - ------- - tuples_converted : object - The converted and validated tuples. - - y_converted : object - The converted and validated y. 
""" - tuples = check_tuples(tuples, - preprocessor=preprocessor, - t=t, - dtype=dtype, - order=order, copy=copy, - force_all_finite=force_all_finite, - ensure_min_samples=ensure_min_samples, - ensure_min_features=ensure_min_features, - warn_on_dtype=warn_on_dtype, - estimator=estimator) - if multi_output: - y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False, - dtype=None) - else: - y = column_or_1d(y, warn=True) - _assert_all_finite(y) - if y_numeric and y.dtype.kind == 'O': - y = y.astype(np.float64) - - check_consistent_length(tuples, y) - - return tuples, y - - -def check_points(points, preprocessor=False, accept_sparse=False, - dtype="auto", order=None, copy=False, force_all_finite=True, - ensure_min_samples=1, ensure_min_features=1, - warn_on_dtype=False, estimator=None): - """Checks that `points` is a valid dataset of points - - Depending on whether a preprocessor is available or not, `points` should - be: - - a 2D array of formed points or a 1D array of indicators if a - preprocessor is available - - a 3D array of formed tuples if no preprocessor is available - - `check_points` will then convert the points to the right format as - `sklearn.utils.validation.check_array` would do. See - `sklearn.utils.validation.check_array` for more details. - - Parameters - ---------- - points : object - Input object to check / convert. - - accept_sparse : string, boolean or list/tuple of strings (default=False) - String[s] representing allowed sparse matrix formats, such as 'csc', - 'csr', etc. If the input is sparse but not in the allowed format, - it will be converted to the first listed format. True allows the input - to be any format. False means that a sparse matrix input will - raise an error. - - dtype : string, type, list of types or None (default="numeric") - Data type of result. If None, the dtype of the input is preserved. - If "numeric", dtype is preserved unless points.dtype is object. 
- If dtype is a list of types, conversion on the first type is only - performed if the dtype of the input is not in the list. - - order : 'F', 'C' or None (default=None) - Whether an array will be forced to be fortran or c-style. - When order is None (default), then if copy=False, nothing is ensured - about the memory layout of the output array; otherwise (copy=True) - the memory layout of the returned array is kept as close as possible - to the original array. - - copy : boolean (default=False) - Whether a forced copy will be triggered. If copy=False, a copy might - be triggered by a conversion. - - force_all_finite : boolean (default=True) - Whether to raise an error on np.inf and np.nan in `points`. - - ensure_min_samples : int (default=1) - Make sure that the array has a minimum number of samples in its first - axis (rows for a 2D array). Setting to 0 disables this check. - - ensure_min_features : int (default=1) - Make sure that the 2D array has some minimum number of features - (columns). The default value of 1 rejects empty datasets. - This check is only enforced when the input data has effectively 2 - dimensions or is originally 1D and ``ensure_2d`` is True. Setting to 0 - disables this check. - - warn_on_dtype : boolean (default=False) - Raise DataConversionWarning if the dtype of the input data structure - does not match the requested dtype, causing a memory copy. - - estimator : str or estimator instance (default=None) - If passed, include the name of the estimator in warning messages. - - Returns - ------- - points_converted : object - The converted and validated array of points. 
- - """ - if dtype == "auto": - dtype = 'numeric' if preprocessor is not None else None + # todo: faire attention a la copie + # todo: faire attention aux trucs sparses context = make_context(estimator) - points = check_array(points, dtype=dtype, accept_sparse=accept_sparse, - copy=copy, - force_all_finite=force_all_finite, - order=order, - ensure_2d=False, # input can be 1D - allow_nd=True, # true, to throw custom error message - ensure_min_samples=ensure_min_samples, - ensure_min_features=ensure_min_features, - estimator=estimator, - warn_on_dtype=warn_on_dtype) - if (points.ndim == 1 and preprocessor) or points.ndim == 2: - return points - else: - expected_shape = ('1D array of indicators or 2D array of formed points' - if preprocessor else '2D array of formed points') - with_prep = ' when using a preprocessor' if preprocessor else '' - should_use_prep = ' and/or use a preprocessor' if not preprocessor else '' - raise ValueError("{} expected{}{}. Found {}D array " - "instead:\ninput={}. Reshape your data{}.\n" - .format(expected_shape, context, with_prep, - points.ndim, points, should_use_prep)) - return points - - -def check_points_y(points, y, preprocessor=False, accept_sparse=False, - dtype="auto", - order=None, copy=False, force_all_finite=True, - multi_output=False, ensure_min_samples=1, - ensure_min_features=1, y_numeric=False, - warn_on_dtype=False, estimator=None): - """Input validation for standard estimators. - - Checks `points` and `y` for consistent length, enforces `points` is a 2d - array of formed points or 1d array of indicators of points, and y is 1d. - Standard input checks are only applied to y, such as checking that y - does not have np.nan or np.inf targets. For multi-label y, set - multi_output=True to allow 2d and sparse y. If the dtype of `points` is - object, attempt converting to float, raising on failure. - Adapted from :func:`sklearn.utils.validation.check_X_y`. 
- - Parameters - ---------- - points : nd-array, list or sparse matrix - Input data. - - y : nd-array, list or sparse matrix - Labels. - - accept_sparse : string, boolean or list of string (default=False) - String[s] representing allowed sparse matrix formats, such as 'csc', - 'csr', etc. If the input is sparse but not in the allowed format, - it will be converted to the first listed format. True allows the input - to be any format. False means that a sparse matrix input will - raise an error. - - .. deprecated:: 0.19 - Passing 'None' to parameter ``accept_sparse`` in methods is - deprecated in version 0.19 "and will be removed in 0.21. Use - ``accept_sparse=False`` instead. - - dtype : string, type, list of types or None (default="numeric") - Data type of result. If None, the dtype of the input is preserved. - If "numeric", dtype is preserved unless array.dtype is object. - If dtype is a list of types, conversion on the first type is only - performed if the dtype of the input is not in the list. - - order : 'F', 'C' or None (default=None) - Whether an array will be forced to be fortran or c-style. - copy : boolean (default=False) - Whether a forced copy will be triggered. If copy=False, a copy might - be triggered by a conversion. - - force_all_finite : boolean (default=True) - Whether to raise an error on np.inf and np.nan in `points`. - This parameter does not influence whether y can have np.inf or np.nan - values. - - ensure_2d : boolean (default=True) - Whether to make `points` at least 2d. - - allow_nd : boolean (default=False) - Whether to allow points.ndim > 2. - - multi_output : boolean (default=False) - Whether to allow 2-d y (array or sparse matrix). If false, y will be - validated as a vector. y cannot have np.nan or np.inf values if - multi_output=True. - - ensure_min_samples : int (default=1) - Make sure that `points` has a minimum number of samples in its first - axis (rows for a 2D array). 
- - ensure_min_features : int (default=1) - Make sure that the 2D array has some minimum number of features - (columns). The default value of 1 rejects empty datasets. - This check is only enforced when `points` has effectively 2 dimensions or - is originally 1D and ``ensure_2d`` is True. Setting to 0 disables - this check. - - y_numeric : boolean (default=False) - Whether to ensure that y has a numeric type. If dtype of y is object, - it is converted to float64. Should only be used for regression - algorithms. - - warn_on_dtype : boolean (default=False) - Raise DataConversionWarning if the dtype of the input data structure - does not match the requested dtype, causing a memory copy. - - estimator : str or estimator instance (default=None) - If passed, include the name of the estimator in warning messages. - - Returns - ------- - points_converted : object - The converted and validated points. - - y_converted : object - The converted and validated y. - """ - points = check_points(points, preprocessor=preprocessor, - accept_sparse=accept_sparse, - dtype=dtype, - order=order, copy=copy, - force_all_finite=force_all_finite, - ensure_min_samples=ensure_min_samples, - ensure_min_features=ensure_min_features, - warn_on_dtype=warn_on_dtype, - estimator=estimator) - if multi_output: - y = check_points(y, 'csr', force_all_finite=True, dtype=None, - preprocessor=preprocessor) - else: - y = column_or_1d(y, warn=True) - _assert_all_finite(y) - if y_numeric and y.dtype.kind == 'O': - y = y.astype(np.float64) - - check_consistent_length(points, y) - - return points, y - - -def preprocess_tuples(tuples, preprocessor, estimator=None): - """form tuples if there is a preprocessor else keep them as such (assumes - that check_tuples has already been called)""" - if estimator is not None: - estimator_name = make_name(estimator) + (' after the preprocessor ' - 'has been applied') + args_for_sk_checks = dict(accept_sparse=accept_sparse, + dtype=dtype, order=order, + copy=copy, 
force_all_finite=force_all_finite, + ensure_min_samples=ensure_min_samples, + ensure_min_features=ensure_min_features, + warn_on_dtype=warn_on_dtype, estimator=estimator) + if y is None: + input = np.array(input, copy=False) else: - estimator_name = ('objects that will use preprocessed tuples') + input, y = check_X_y(input, y, ensure_2d=False, allow_nd=True, + copy=False, force_all_finite=False, + accept_sparse=True, dtype=None, + ensure_min_features=0, ensure_min_samples=0, + multi_output=multi_output, + y_numeric=y_numeric) + # we try to allow the more possible stuff here + + if type_of_inputs == 'classic': + if input.ndim == 1: + if preprocessor is not None: + input = preprocess_points(input, preprocessor) + else: + raise ValueError('Using no preprocessor, a 2D array of formed ' + 'points is expected{}. Reshape your data ' + 'and/or use a preprocessor'.format(context)) + elif input.ndim == 2: + pass # OK + else: + with_prep = (('1D array of indicators or 2D array of formed points', + ' when using a preprocessor') + if preprocessor is not None else + ('2D array of formed points', '')) + raise ValueError("{} expected{}{}. Found {}D array " + "instead:\ninput={}. Reshape your data.\n" + .format(with_prep[0], context, with_prep[1], + input.ndim, input)) + + input = check_array(input, allow_nd=True, ensure_2d=True, + # arguments that come from before + **args_for_sk_checks) + + elif type_of_inputs == 'tuples': + + if input.ndim == 2: + if preprocessor is not None: + check_t(input, t, context) + input = preprocess_points(input, preprocessor) + else: + + raise ValueError('Using no preprocessor, a 3D array of points ' + 'indicators is expected{}. 
Reshape your data ' + 'and/or use a preprocessor'.format(context)) + elif input.ndim == 3: # we should check_num_features which is not checked + # after + if ensure_min_features > 0: + n_features = input.shape[2] + if n_features < ensure_min_features: + raise ValueError("Found array with {} feature(s) (shape={}) while" + " a minimum of {} is required{}." + .format(n_features, input.shape, + ensure_min_features, context)) + else: - if preprocessor is not None and tuples.ndim == 2: - print("Preprocessing tuples...") - tuples = np.column_stack([preprocessor(tuples[:, i])[:, np.newaxis] for - i in range(tuples.shape[1])]) - tuples = check_tuples(tuples, preprocessor=False, estimator=estimator_name) - # normally we shouldn't need to enforce the t, since a preprocessor shouldn't - # be able to transform a t tuples array into a t' tuples array + with_prep =(('2D array of indicators or 3D array of formed tuples', + ' when using a preprocessor') + if preprocessor is not None else + ('3D array of formed tuples', '')) + raise ValueError("{} expected{}{}. Found {}D array " + "instead:\ninput={}. 
Reshape your data.\n" + .format(with_prep[0], context, with_prep[1], + input.ndim, input)) + + check_t(input, t, context) + input = check_array(input, allow_nd=True, ensure_2d=False, + # arguments that come from before + **args_for_sk_checks) + if input.ndim != 3: # we have to ensure this because check_array above + # does not + raise ValueError("Invalid data.") # TODO put a better error message + + return input if y is None else (input, y) + + +def preprocess_tuples(tuples, preprocessor): + tuples = np.column_stack([preprocessor(tuples[:, i])[:, np.newaxis] for + i in range(tuples.shape[1])]) return tuples -def preprocess_points(points, preprocessor, estimator=None): +def preprocess_points(points, preprocessor): """form points if there is a preprocessor else keep them as such (assumes that check_points has already been called)""" - if estimator is not None: - estimator_name = make_name(estimator) + (' after the preprocessor ' - 'has been applied') - else: - estimator_name = ('objects that will use preprocessed points') - - if preprocessor is not None and points.ndim == 1: - print("Preprocessing points...") - points = preprocessor(points) - points = check_points(points, preprocessor=False, estimator=estimator_name) + points = preprocessor(points) return points diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index 734fe2b8..0af0dcca 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -5,8 +5,7 @@ import numpy as np from abc import ABCMeta, abstractmethod import six -from ._util import (check_tuples, ArrayIndexer, preprocess_tuples, - check_points, preprocess_points) +from ._util import ArrayIndexer, check_input class BaseMetricLearner(six.with_metaclass(ABCMeta, BaseEstimator)): @@ -106,10 +105,9 @@ def score_pairs(self, pairs): scores: `numpy.ndarray` of shape=(n_pairs,) The learned Mahalanobis distance for every pair. 
""" - pairs = check_tuples(pairs, preprocessor=self.preprocessor is not None, - estimator=self, t=2) - pairs = preprocess_tuples(pairs, preprocessor=self.preprocessor_, - estimator=self) + pairs = check_input(pairs, type_of_inputs='tuples', + preprocessor=self.preprocessor_, + estimator=self, t=2) pairwise_diffs = self.transform(pairs[:, 1, :] - pairs[:, 0, :]) # (for MahalanobisMixin, the embedding is linear so we can just embed the # difference) @@ -132,11 +130,9 @@ def transform(self, X): X_embedded : `numpy.ndarray`, shape=(n_samples, num_dims) The embedded data points. """ - X_checked = check_points(X, estimator=self, - preprocessor=self.preprocessor is not None, + X_checked = check_input(X, type_of_inputs='classic', estimator=self, + preprocessor=self.preprocessor_, accept_sparse=True) - X_checked = preprocess_points(X_checked, preprocessor=self.preprocessor_, - estimator=self) return X_checked.dot(self.transformer_.T) def metric(self): @@ -191,13 +187,13 @@ def predict(self, pairs): y_predicted : `numpy.ndarray` of floats, shape=(n_constraints,) The predicted learned metric value between samples in every pair. """ - pairs = check_tuples(pairs, preprocessor=self.preprocessor is not None, + pairs = check_input(pairs, type_of_inputs='tuples', + preprocessor=self.preprocessor_, estimator=self, t=self._t) - # no need to preprocess_tuples since it is done in score_pairs return self.score_pairs(pairs) def decision_function(self, pairs): - # no need to check_tuples and preprocess_tuples since it is done in + # no need to check_input since it is done in # predict->score_pairs return self.predict(pairs) @@ -226,7 +222,7 @@ def score(self, pairs, y): score : float The ``roc_auc`` score. 
""" - # no need to check_tuples and preprocess_tuples since it is done in + # no need to check_input since it is done in # predict->score_pairs return roc_auc_score(y, self.decision_function(pairs)) @@ -254,17 +250,16 @@ def predict(self, quadruplets): prediction : `numpy.ndarray` of floats, shape=(n_constraints,) Predictions of the ordering of pairs, for each quadruplet. """ - quadruplets = check_tuples(quadruplets, - preprocessor=self.preprocessor is not None, + quadruplets = check_input(quadruplets, type_of_inputs='tuples', + preprocessor=self.preprocessor_, estimator=self, t=self._t) - # no need to preprocess_tuples since it is done in score_pairs # we broadcast with ... because here we allow quadruplets to be # either a 3D array of points or 2D array of indices return (self.score_pairs(quadruplets[:, :2, ...]) - self.score_pairs(quadruplets[:, 2:, ...])) def decision_function(self, quadruplets): - # no need to check_tuples and preprocess_tuples since it is done in + # no need to check_input since it is done in # predict->score_pairs return self.predict(quadruplets) @@ -290,6 +285,6 @@ def score(self, quadruplets, y=None): score : float The quadruplets score. 
""" - # no need to check_tuples and preprocess_tuples since it is done in + # no need to check_input since it is done in # predict->score_pairs return -np.mean(self.predict(quadruplets)) diff --git a/metric_learn/covariance.py b/metric_learn/covariance.py index ac42161e..8237ccc9 100644 --- a/metric_learn/covariance.py +++ b/metric_learn/covariance.py @@ -13,7 +13,7 @@ from sklearn.utils.validation import check_array from sklearn.base import TransformerMixin -from metric_learn._util import preprocess_points, check_points +from metric_learn._util import check_input from .base_metric import MahalanobisMixin @@ -36,10 +36,9 @@ def fit(self, X, y=None): y : unused """ self.check_preprocessor() - self.X_ = check_points(X, ensure_min_samples=2, estimator=self, - preprocessor=self.preprocessor is not None) - self.X_ = preprocess_points(self.X_, preprocessor=self.preprocessor_, - estimator=self) + self.X_ = check_input(X, type_of_inputs='classic', + ensure_min_samples=2, estimator=self, + preprocessor=self.preprocessor_) self.M_ = np.cov(self.X_, rowvar = False) if self.M_.ndim == 0: self.M_ = 1./self.M_ diff --git a/metric_learn/itml.py b/metric_learn/itml.py index 7d6688db..61af88d7 100644 --- a/metric_learn/itml.py +++ b/metric_learn/itml.py @@ -21,8 +21,7 @@ from sklearn.base import TransformerMixin from .base_metric import _PairsClassifierMixin, MahalanobisMixin from .constraints import Constraints, wrap_pairs -from ._util import (vector_norm, check_points_y, check_tuples_y, - preprocess_tuples, preprocess_points) +from ._util import vector_norm, check_input class _BaseITML(MahalanobisMixin): @@ -62,10 +61,9 @@ def __init__(self, gamma=1., max_iter=1000, convergence_threshold=1e-3, def _process_pairs(self, pairs, y, bounds): self.check_preprocessor() - pairs, y = check_tuples_y(pairs, y, estimator=self, t=self._t, - preprocessor=self.preprocessor is not None) - pairs = preprocess_tuples(pairs, preprocessor=self.preprocessor_, - estimator=self) + pairs, y = 
check_input(pairs, y, type_of_inputs='tuples', + estimator=self, t=self._t, + preprocessor=self.preprocessor_) # check to make sure that no two constrained vectors are identical pos_pairs, neg_pairs = pairs[y == 1], pairs[y == -1] @@ -233,10 +231,9 @@ def fit(self, X, y, random_state=np.random): If provided, controls random number generation. """ self.check_preprocessor() - X, y = check_points_y(X, y, preprocessor=self.preprocessor is not None, - estimator=self) - X = preprocess_points(X, preprocessor=self.preprocessor_, - estimator=self) + X, y = check_input(X, y, type_of_inputs='classic', + preprocessor=self.preprocessor_, + estimator=self) num_constraints = self.num_constraints if num_constraints is None: num_classes = len(np.unique(y)) diff --git a/metric_learn/lfda.py b/metric_learn/lfda.py index 18ffa057..497e2ecc 100644 --- a/metric_learn/lfda.py +++ b/metric_learn/lfda.py @@ -18,7 +18,7 @@ from sklearn.metrics import pairwise_distances from sklearn.base import TransformerMixin -from metric_learn._util import check_points_y, preprocess_points +from metric_learn._util import check_input from .base_metric import MahalanobisMixin @@ -63,9 +63,7 @@ def __init__(self, num_dims=None, k=None, embedding_type='weighted', def _process_inputs(self, X, y): unique_classes, y = np.unique(y, return_inverse=True) self.check_preprocessor() - self.X_, y = check_points_y(X, y, estimator=self, - preprocessor=self.preprocessor is not None) - self.X_ = preprocess_points(self.X_, estimator=self, + self.X_, y = check_input(X, y, type_of_inputs='classic', estimator=self, preprocessor=self.preprocessor_) n, d = self.X_.shape num_classes = len(unique_classes) diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py index 470685b5..343a3a22 100644 --- a/metric_learn/lmnn.py +++ b/metric_learn/lmnn.py @@ -16,7 +16,7 @@ from six.moves import xrange from sklearn.metrics import euclidean_distances from sklearn.base import TransformerMixin -from metric_learn._util import preprocess_points, 
check_points_y +from metric_learn._util import check_input from .base_metric import MahalanobisMixin @@ -51,14 +51,11 @@ class python_LMNN(_base_LMNN): def _process_inputs(self, X, labels): self.check_preprocessor() - self.X_, labels = check_points_y(X, labels, - preprocessor=self.preprocessor is not - None, - estimator=self) - self.X_ = preprocess_points(self.X_, estimator=self, - preprocessor=self.preprocessor_) + self.X_, labels = check_input(X, labels, type_of_inputs='classic', + preprocessor=self.preprocessor_, + estimator=self) self.X_ = self.X_.astype(float) # todo: remove the conversion here and - # integrate it into check_points_y + # integrate it into check_input num_pts, num_dims = self.X_.shape unique_labels, self.label_inds_ = np.unique(labels, return_inverse=True) if len(self.label_inds_) != num_pts: @@ -263,12 +260,12 @@ class LMNN(_base_LMNN): def fit(self, X, y): self.check_preprocessor() - self.X_, y = check_points_y(X, y, estimator=self, - preprocessor=self.preprocessor is not None) + self.X_, y = check_input(X, y, type_of_inputs='classic', + estimator=self, + preprocessor=self.preprocessor_) self.X_ = self.X_.astype(float) # todo: remove the conversion here and - # integrate it into check_points_y - self.X_ = preprocess_points(self.X_, estimator=self, - preprocessor=self.preprocessor_) + # integrate it into check_input + self.X_ = self.X_, preprocessor=self.preprocessor_ labels = MulticlassLabels(y) self._lmnn = shogun_LMNN(RealFeatures(self.X_.T), labels, self.k) self._lmnn.set_maxiter(self.max_iter) diff --git a/metric_learn/lsml.py b/metric_learn/lsml.py index 536284d1..352a6229 100644 --- a/metric_learn/lsml.py +++ b/metric_learn/lsml.py @@ -12,8 +12,7 @@ import scipy.linalg from six.moves import xrange from sklearn.base import TransformerMixin -from ._util import (check_tuples, check_points_y, preprocess_points, - preprocess_tuples) +from ._util import check_input from .base_metric import _QuadrupletsClassifierMixin, MahalanobisMixin from 
.constraints import Constraints @@ -46,13 +45,10 @@ def __init__(self, tol=1e-3, max_iter=1000, prior=None, verbose=False, super(_BaseLSML, self).__init__(preprocessor) def _prepare_quadruplets(self, quadruplets, weights): - # for now we check_array and check_tuples but we should only - # check_tuples in the future (with enhanced check_tuples) self.check_preprocessor() - quadruplets = check_tuples(quadruplets, estimator=self, t=self._t, - preprocessor=self.preprocessor is not None) - quadruplets = preprocess_tuples(quadruplets, estimator=self, - preprocessor=self.preprocessor_) + quadruplets = check_input(quadruplets, type_of_inputs='tuples', + estimator=self, t=self._t, + preprocessor=self.preprocessor_) # check to make sure that no two constrained vectors are identical self.vab_ = quadruplets[:, 0, :] - quadruplets[:, 1, :] @@ -223,9 +219,9 @@ def fit(self, X, y, random_state=np.random): If provided, controls random number generation. """ self.check_preprocessor() - X, y = check_points_y(X, y, preprocessor=self.preprocessor is not None, - estimator=self) - X = preprocess_points(X, estimator=self, preprocessor=self.preprocessor_) + X, y = check_input(X, y, type_of_inputs='classic', + preprocessor=self.preprocessor_, + estimator=self) num_constraints = self.num_constraints if num_constraints is None: num_classes = len(np.unique(y)) diff --git a/metric_learn/mlkr.py b/metric_learn/mlkr.py index 68dfc285..83240dce 100644 --- a/metric_learn/mlkr.py +++ b/metric_learn/mlkr.py @@ -14,7 +14,7 @@ from sklearn.decomposition import PCA -from metric_learn._util import check_points_y, preprocess_points +from metric_learn._util import check_input from .base_metric import MahalanobisMixin EPS = np.finfo(float).eps @@ -60,10 +60,9 @@ def __init__(self, num_dims=None, A0=None, epsilon=0.01, alpha=0.0001, def _process_inputs(self, X, y): self.check_preprocessor() - self.X_, y = check_points_y(X, y, y_numeric=True, estimator=self, - preprocessor=self.preprocessor is not None) - 
self.X_ = preprocess_points(self.X_, estimator=self, - preprocessor=self.preprocessor_) + self.X_, y = check_input(X, y, type_of_inputs='classic', + y_numeric=True, estimator=self, + preprocessor=self.preprocessor_) n, d = self.X_.shape if y.shape[0] != n: raise ValueError('Data and label lengths mismatch: %d != %d' diff --git a/metric_learn/mmc.py b/metric_learn/mmc.py index af87d5fe..fcbf8ef5 100644 --- a/metric_learn/mmc.py +++ b/metric_learn/mmc.py @@ -24,8 +24,7 @@ from .base_metric import _PairsClassifierMixin, MahalanobisMixin from .constraints import Constraints, wrap_pairs -from ._util import (vector_norm, check_points_y, preprocess_points, - check_tuples_y, preprocess_tuples) +from ._util import vector_norm, check_input class _BaseMMC(MahalanobisMixin): @@ -75,10 +74,9 @@ def _fit(self, pairs, y): return self._fit_full(pairs, y) def _process_pairs(self, pairs, y): - pairs, y = check_tuples_y(pairs, y, estimator=self, t=self._t, - preprocessor=self.preprocessor is not None) - pairs = preprocess_tuples(pairs, preprocessor=self.preprocessor_, - estimator=self) + pairs, y = check_input(pairs, y, type_of_inputs='tuples', + estimator=self, t=self._t, + preprocessor=self.preprocessor_) # check to make sure that no two constrained vectors are identical pos_pairs, neg_pairs = pairs[y == 1], pairs[y == -1] @@ -455,9 +453,9 @@ def fit(self, X, y, random_state=np.random): If provided, controls random number generation. 
""" self.check_preprocessor() - X, y = check_points_y(X, y, preprocessor=self.preprocessor is not None, - estimator=self) - X = preprocess_points(X, preprocessor=self.preprocessor_, estimator=self) + X, y = check_input(X, y, type_of_inputs='classic', + preprocessor=self.preprocessor_, + estimator=self) num_constraints = self.num_constraints if num_constraints is None: num_classes = len(np.unique(y)) diff --git a/metric_learn/nca.py b/metric_learn/nca.py index 42d6935e..79f51e34 100644 --- a/metric_learn/nca.py +++ b/metric_learn/nca.py @@ -8,7 +8,7 @@ from six.moves import xrange from sklearn.base import TransformerMixin -from metric_learn._util import check_points_y, preprocess_points +from metric_learn._util import check_input from .base_metric import MahalanobisMixin EPS = np.finfo(float).eps @@ -37,10 +37,8 @@ def fit(self, X, y): """ self.check_preprocessor() - X, labels = check_points_y(X, y, estimator=self, - preprocessor=self.preprocessor is not None) - X = preprocess_points(X, estimator=self, - preprocessor=self.preprocessor_) + X, labels = check_input(X, y, type_of_inputs='classic', estimator=self, + preprocessor=self.preprocessor_) n, d = X.shape num_dims = self.num_dims if num_dims is None: diff --git a/metric_learn/rca.py b/metric_learn/rca.py index 2d5e8ea5..90f1f4b4 100644 --- a/metric_learn/rca.py +++ b/metric_learn/rca.py @@ -18,7 +18,7 @@ from sklearn import decomposition from sklearn.base import TransformerMixin -from metric_learn._util import check_points_y, check_points, preprocess_points +from metric_learn._util import check_input from .base_metric import MahalanobisMixin from .constraints import Constraints @@ -66,9 +66,9 @@ def __init__(self, num_dims=None, pca_comps=None, preprocessor=None): def _process_data(self, X): self.check_preprocessor() - X = check_points(X, preprocessor=self.preprocessor is not None, - estimator=self) - X = preprocess_points(X, preprocessor=self.preprocessor_, estimator=self) + X = check_input(X, 
type_of_inputs='classic', + preprocessor=self.preprocessor_, + estimator=self) # PCA projection to remove noise and redundant information. if self.pca_comps is not None: @@ -182,10 +182,8 @@ def fit(self, X, y, random_state=np.random): random_state : a random.seed object to fix the random_state if needed. """ self.check_preprocessor() - X, y = check_points_y(X, y, estimator=self, - preprocessor=self.preprocessor is not None) - X = preprocess_points(X, preprocessor=self.preprocessor_, - estimator=self) + X, y = check_input(X, y, type_of_inputs='classic', estimator=self, + preprocessor=self.preprocessor_) chunks = Constraints(y).chunks(num_chunks=self.num_chunks, chunk_size=self.chunk_size, random_state=random_state) diff --git a/metric_learn/sdml.py b/metric_learn/sdml.py index 7acfa12d..16cca5c4 100644 --- a/metric_learn/sdml.py +++ b/metric_learn/sdml.py @@ -14,11 +14,9 @@ from sklearn.covariance import graph_lasso from sklearn.utils.extmath import pinvh -from metric_learn._util import (preprocess_points, preprocess_tuples, - check_tuples_y) +from metric_learn._util import check_input from .base_metric import MahalanobisMixin, _PairsClassifierMixin from .constraints import Constraints, wrap_pairs -from ._util import check_points_y class _BaseSDML(MahalanobisMixin): @@ -54,10 +52,9 @@ def __init__(self, balance_param=0.5, sparsity_param=0.01, use_cov=True, def _prepare_pairs(self, pairs, y): self.check_preprocessor() - pairs, y = check_tuples_y(pairs, y, estimator=self, t=self._t, - preprocessor=self.preprocessor is not None) - pairs = preprocess_tuples(pairs, preprocessor=self.preprocessor_, - estimator=self) + pairs, y = check_input(pairs, y, type_of_inputs='tuples', + estimator=self, t=self._t, + preprocessor=self.preprocessor_) # set up prior M if self.use_cov: @@ -165,9 +162,8 @@ def fit(self, X, y, random_state=np.random): Returns the instance. 
""" self.check_preprocessor() - X, y = check_points_y(X, y, estimator=self, - preprocessor=self.preprocessor is not None) - X = preprocess_points(X, estimator=self, preprocessor=self.preprocessor_) + X, y = check_input(X, y, type_of_inputs='classic', estimator=self, + preprocessor=self.preprocessor_) num_constraints = self.num_constraints if num_constraints is None: num_classes = len(np.unique(y)) diff --git a/test/test_mahalanobis_mixin.py b/test/test_mahalanobis_mixin.py index 09a98ece..43d6199e 100644 --- a/test/test_mahalanobis_mixin.py +++ b/test/test_mahalanobis_mixin.py @@ -146,8 +146,9 @@ def test_score_pairs_dim(estimator, build_dataset): "your data has a single feature or tuples.reshape(1, {}, -1) " "if it contains a single tuple.".format(tuples, tuples.shape[1], tuples.shape[0])) - with pytest.raises(ValueError, message=msg): + with pytest.raises(ValueError) as raised_error: model.score_pairs(tuples[1]) + assert str(raised_error.value) == msg def check_is_distance_matrix(pairwise): @@ -191,16 +192,18 @@ def test_embed_dim(estimator, build_dataset): "Reshape your data either using array.reshape(-1, 1) if " "your data has a single feature or array.reshape(1, -1) " "if it contains a single sample.".format(X)) - with pytest.raises(ValueError, message=err_msg): + with pytest.raises(ValueError) as raised_error: model.score_pairs(model.transform(X[0, :])) + assert str(raised_error.value) == err_msg # we test that the shape is also OK when doing dimensionality reduction if type(model).__name__ in {'LFDA', 'MLKR', 'NCA', 'RCA'}: model.set_params(num_dims=2) model.fit(inputs, labels) assert model.transform(X).shape == (X.shape[0], 2) # assert that ValueError is thrown if input shape is 1D - with pytest.raises(ValueError, message=err_msg): + with pytest.raises(ValueError) as raised_error: model.transform(model.transform(X[0, :])) + assert str(raised_error.value) == err_msg @pytest.mark.parametrize('estimator, build_dataset', list_estimators, diff --git 
a/test/test_utils.py b/test/test_utils.py index 96fdc24a..4643e302 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -6,16 +6,19 @@ from sklearn.utils import check_random_state, shuffle from sklearn.utils.testing import set_random_state from sklearn.base import clone -from metric_learn._util import (check_tuples, make_context, check_points, - preprocess_tuples, make_name, - preprocess_points) +from metric_learn._util import (check_input, make_context, preprocess_tuples, + make_name, preprocess_points) from metric_learn import (ITML, LSML, MMC, RCA, SDML, Covariance, LFDA, LMNN, MLKR, NCA, ITML_Supervised, LSML_Supervised, MMC_Supervised, RCA_Supervised, SDML_Supervised) from sklearn.datasets import make_regression, make_blobs +def mock_preprocessor(indices): + """A preprocessor for testing purposes that returns an all ones 3D array + """ + return np.ones((indices.shape[0], 3)) -# ---------------------------- test check_tuples ---------------------------- +# ---------------------------- test check_input ---------------------------- @pytest.fixture def tuples_prep(): @@ -50,16 +53,18 @@ def test_make_name(estimator, expected): @pytest.mark.parametrize('estimator, context', [(NCA(), " by NCA"), ('NCA', " by NCA"), (None, "")]) @pytest.mark.parametrize('load_tuples, preprocessor', - [(tuples_prep, True), (tuples_no_prep, False), - (tuples_no_prep, True)]) -def test_check_tuples_invalid_t(estimator, context, load_tuples, preprocessor): + [(tuples_prep, mock_preprocessor), + (tuples_no_prep, None), + (tuples_no_prep, mock_preprocessor)]) +def test_check_input_invalid_t(estimator, context, load_tuples, preprocessor): """Checks that the exception are raised if t is not the one expected""" tuples = load_tuples() expected_msg = ("Tuples of 3 element(s) expected{}. 
Got tuples of 2 " "element(s) instead (shape={}):\ninput={}.\n" .format(context, tuples.shape, tuples)) with pytest.raises(ValueError) as raised_error: - check_tuples(tuples, t=3, preprocessor=preprocessor, estimator=estimator) + check_input(tuples, type_of_inputs='tuples', t=3, + preprocessor=preprocessor, estimator=estimator) assert str(raised_error.value) == expected_msg @@ -67,17 +72,17 @@ def test_check_tuples_invalid_t(estimator, context, load_tuples, preprocessor): [(NCA(), " by NCA"), ('NCA', " by NCA"), (None, "")]) @pytest.mark.parametrize('tuples, found, expected, preprocessor', [(5, '0', '2D array of indicators or 3D array of ' - 'formed tuples', True), - (5, '0', '3D array of formed tuples', False), + 'formed tuples', mock_preprocessor), + (5, '0', '3D array of formed tuples', None), ([1, 2], '1', '2D array of indicators or 3D array ' - 'of formed tuples', True), - ([1, 2], '1', '3D array of formed tuples', False), + 'of formed tuples', mock_preprocessor), + ([1, 2], '1', '3D array of formed tuples', None), ([[[[5]]]], '4', '2D array of indicators or 3D array' - ' of formed tuples', True), - ([[[[5]]]], '4', '3D array of formed tuples', False), + ' of formed tuples', mock_preprocessor), + ([[[[5]]]], '4', '3D array of formed tuples', None), ([[1], [3]], '2', '3D array of formed ' - 'tuples', False)]) -def test_check_tuples_invalid_shape(estimator, context, tuples, found, + 'tuples', None)]) +def test_check_input_invalid_shape(estimator, context, tuples, found, expected, preprocessor): """Checks that a value error with the appropriate message is raised if shape is invalid (not 2D with preprocessor or 3D with no preprocessor) @@ -89,14 +94,15 @@ def test_check_tuples_invalid_shape(estimator, context, tuples, found, if preprocessor else '', found, tuples, ' and/or use a preprocessor' if not preprocessor else '')) with pytest.raises(ValueError) as raised_error: - check_tuples(tuples, preprocessor=preprocessor, ensure_min_samples=0, - estimator=estimator) + 
check_input(tuples, type_of_inputs='tuples', + preprocessor=preprocessor, ensure_min_samples=0, + estimator=estimator) assert str(raised_error.value) == msg @pytest.mark.parametrize('estimator, context', [(NCA(), " by NCA"), ('NCA', " by NCA"), (None, "")]) -def test_check_tuples_invalid_n_features(estimator, context, tuples_no_prep): +def test_check_input_invalid_n_features(estimator, context, tuples_no_prep): """Checks that the right warning is printed if not enough features Here we only test if no preprocessor (otherwise we don't ensure this) """ @@ -104,34 +110,38 @@ def test_check_tuples_invalid_n_features(estimator, context, tuples_no_prep): " a minimum of 3 is required{}.".format(tuples_no_prep.shape, context)) with pytest.raises(ValueError) as raised_error: - check_tuples(tuples_no_prep, preprocessor=False, ensure_min_features=3, - estimator=estimator) + check_input(tuples_no_prep, type_of_inputs='tuples', + preprocessor=None, ensure_min_features=3, + estimator=estimator) assert str(raised_error.value) == msg @pytest.mark.parametrize('estimator, context', [(NCA(), " by NCA"), ('NCA', " by NCA"), (None, "")]) @pytest.mark.parametrize('load_tuples, preprocessor', - [(tuples_prep, True), (tuples_no_prep, False), - (tuples_no_prep, True)]) -def test_check_tuples_invalid_n_samples(estimator, context, load_tuples, + [(tuples_prep, mock_preprocessor), + (tuples_no_prep, None), + (tuples_no_prep, mock_preprocessor)]) +def test_check_input_invalid_n_samples(estimator, context, load_tuples, preprocessor): """Checks that the right warning is printed if n_samples is too small""" tuples = load_tuples() msg = ("Found array with 2 sample(s) (shape={}) while a minimum of 3 " "is required{}.".format(tuples.shape, context)) with pytest.raises(ValueError) as raised_error: - check_tuples(tuples, preprocessor=preprocessor, ensure_min_samples=3, - estimator=estimator) + check_input(tuples, type_of_inputs='tuples', + preprocessor=preprocessor, + ensure_min_samples=3, 
estimator=estimator) assert str(raised_error.value) == msg @pytest.mark.parametrize('estimator, context', [(NCA(), " by NCA"), ('NCA', " by NCA"), (None, "")]) @pytest.mark.parametrize('load_tuples, preprocessor', - [(tuples_prep, True), (tuples_no_prep, False), - (tuples_no_prep, True)]) -def test_check_tuples_invalid_dtype_convertible(estimator, context, + [(tuples_prep, mock_preprocessor), + (tuples_no_prep, None), + (tuples_no_prep, mock_preprocessor)]) +def test_check_input_invalid_dtype_convertible(estimator, context, load_tuples, preprocessor): """Checks that a warning is raised if a convertible input is converted to float""" @@ -139,33 +149,37 @@ def test_check_tuples_invalid_dtype_convertible(estimator, context, msg = ("Data with input dtype object was converted to float64{}." .format(context)) with pytest.warns(DataConversionWarning) as raised_warning: - check_tuples(tuples, preprocessor=preprocessor, dtype=np.float64, - warn_on_dtype=True, estimator=estimator) + check_input(tuples, type_of_inputs='tuples', + preprocessor=preprocessor, dtype=np.float64, + warn_on_dtype=True, estimator=estimator) assert str(raised_warning[0].message) == msg @pytest.mark.parametrize('preprocessor, tuples', - [(True, np.array([['a', 'b'], - ['e', 'b']])), - (False, np.array([[['b', 'v'], ['a', 'd']], + [(mock_preprocessor, np.array([['a', 'b'], + ['e', 'b']])), + (None, np.array([[['b', 'v'], ['a', 'd']], [['x', 'u'], ['c', 'a']]]))]) -def test_check_tuples_invalid_dtype_not_convertible(preprocessor, tuples): +def test_check_input_invalid_dtype_not_convertible(preprocessor, tuples): """Checks that a value error is thrown if attempting to convert an input not convertible to float """ with pytest.raises(ValueError): - check_tuples(tuples, preprocessor=preprocessor, dtype=np.float64) + check_input(tuples, type_of_inputs='tuples', + preprocessor=preprocessor, dtype=np.float64) @pytest.mark.parametrize('t', [2, None]) -def test_check_tuples_valid_t(t, tuples_prep, 
tuples_no_prep): +def test_check_input_valid_t(t, tuples_prep, tuples_no_prep): """For inputs that have the right matrix dimension (2D or 3D for instance), checks that checking the number of tuples (pairs, quadruplets, etc) raises no warning """ with pytest.warns(None) as record: - check_tuples(tuples_prep, preprocessor=True, t=t) - check_tuples(tuples_no_prep, preprocessor=False, t=t) + check_input(tuples_prep, type_of_inputs='tuples', + preprocessor=mock_preprocessor, t=t) + check_input(tuples_no_prep, type_of_inputs='tuples', preprocessor=None, + t=t) assert len(record) == 0 @@ -184,10 +198,11 @@ def test_check_tuples_valid_t(t, tuples_prep, tuples_no_prep): (1, 4, 9)), np.array([[[1.2, 2.2], [1.4, 3.3]], [[2.6, 2.3], [3.4, 5.0]]])]) -def test_check_tuples_valid_with_preprocessor(tuples): +def test_check_input_valid_with_preprocessor(tuples): """Test that valid inputs when using a preprocessor raises no warning""" with pytest.warns(None) as record: - check_tuples(tuples, preprocessor=True) + check_input(tuples, type_of_inputs='tuples', + preprocessor=mock_preprocessor) assert len(record) == 0 @@ -204,23 +219,24 @@ def test_check_tuples_valid_with_preprocessor(tuples): (((2, 1), (0, 2), (2, 3)), ((1, 2), (4, 4), (9, 3)), ((3, 1), (4, 4), (29, 4)))]) -def test_check_tuples_valid_without_preprocessor(tuples): +def test_check_input_valid_without_preprocessor(tuples): """Test that valid inputs when using no preprocessor raises no warning""" with pytest.warns(None) as record: - check_tuples(tuples, preprocessor=False) + check_input(tuples, preprocessor=None) assert len(record) == 0 -def test_check_tuples_behaviour_auto_dtype(tuples_no_prep): +def test_check_input_behaviour_auto_dtype(tuples_no_prep): """Checks that check_tuples allows by default every type if using a preprocessor, and numeric types if using no preprocessor""" tuples_prep = [['img1.png', 'img2.png'], ['img3.png', 'img5.png']] with pytest.warns(None) as record: - check_tuples(tuples_prep, 
preprocessor=True) + check_input(tuples_prep, type_of_inputs='tuples', + preprocessor=mock_preprocessor) assert len(record) == 0 with pytest.warns(None) as record: - check_tuples(tuples_no_prep) # numeric type + check_input(tuples_no_prep, type_of_inputs='tuples') # numeric type assert len(record) == 0 # not numeric type @@ -228,10 +244,10 @@ def test_check_tuples_behaviour_auto_dtype(tuples_no_prep): [['img3.png'], ['img5.png']]]) tuples_no_prep = tuples_no_prep.astype(object) with pytest.raises(ValueError): - check_tuples(tuples_no_prep) + check_input(tuples_no_prep, type_of_inputs='tuples') -def test_check_tuples_invalid_complex_data(): +def test_check_input_invalid_complex_data(): """Checks that the right error message is thrown if given complex data ( this comes from sklearn's check_array's message)""" tuples = np.array([[[1 + 2j, 3 + 4j], [5 + 7j, 5 + 7j]], @@ -239,11 +255,12 @@ def test_check_tuples_invalid_complex_data(): msg = ("Complex data not supported\n" "{}\n".format(tuples)) with pytest.raises(ValueError) as raised_error: - check_tuples(tuples) + check_input(tuples, type_of_inputs='tuples') assert str(raised_error.value) == msg -# ---------------------------- test check_points ---------------------------- +# ---------------------------- test check_input with points type +# ---------------------------- @pytest.fixture @@ -265,13 +282,14 @@ def points_no_prep(): [(NCA(), " by NCA"), ('NCA', " by NCA"), (None, "")]) @pytest.mark.parametrize('points, found, expected, preprocessor', [(5, '0', '1D array of indicators or 2D array of ' - 'formed points', True), - (5, '0', '2D array of formed points', False), - ([1, 2], '1', '2D array of formed points', False), + 'formed points', mock_preprocessor), + (5, '0', '2D array of formed points', None), + ([1, 2], '1', '2D array of formed points', None), ([[[5]]], '3', '1D array of indicators or 2D ' - 'array of formed points', True), - ([[[5]]], '3', '2D array of formed points', False)]) -def 
test_check_points_invalid_shape(estimator, context, points, found, + 'array of formed points', + mock_preprocessor), + ([[[5]]], '3', '2D array of formed points', None)]) +def test_check_input_points_invalid_shape(estimator, context, points, found, expected, preprocessor): """Checks that a value error with the appropriate message is raised if shape is invalid (valid being 1D or 2D with preprocessor or 2D with no @@ -284,14 +302,15 @@ def test_check_points_invalid_shape(estimator, context, points, found, if preprocessor else '', found, points, ' and/or use a preprocessor' if not preprocessor else '')) with pytest.raises(ValueError) as raised_error: - check_points(points, preprocessor=preprocessor, ensure_min_samples=0, - estimator=estimator) + check_input(points, type_of_inputs='classic', preprocessor=preprocessor, + ensure_min_samples=0, + estimator=estimator) assert str(raised_error.value) == msg @pytest.mark.parametrize('estimator, context', [(NCA(), " by NCA"), ('NCA', " by NCA"), (None, "")]) -def test_check_points_invalid_n_features(estimator, context, points_no_prep): +def test_check_input_invalid_n_features(estimator, context, points_no_prep): """Checks that the right warning is printed if not enough features Here we only test if no preprocessor (otherwise we don't ensure this) """ @@ -299,7 +318,8 @@ def test_check_points_invalid_n_features(estimator, context, points_no_prep): " a minimum of 3 is required{}.".format(points_no_prep.shape, context)) with pytest.raises(ValueError) as raised_error: - check_points(points_no_prep, preprocessor=False, ensure_min_features=3, + check_input(points_no_prep, type_of_inputs='classic', preprocessor=None, + ensure_min_features=3, estimator=estimator) assert str(raised_error.value) == msg @@ -307,26 +327,29 @@ def test_check_points_invalid_n_features(estimator, context, points_no_prep): @pytest.mark.parametrize('estimator, context', [(NCA(), " by NCA"), ('NCA', " by NCA"), (None, "")]) 
@pytest.mark.parametrize('load_points, preprocessor', - [(points_prep, True), (points_no_prep, False), - (points_no_prep, True)]) -def test_check_points_invalid_n_samples(estimator, context, load_points, + [(points_prep, mock_preprocessor), + (points_no_prep, None), + (points_no_prep, mock_preprocessor)]) +def test_check_input_point_invalid_n_samples(estimator, context, load_points, preprocessor): """Checks that the right warning is printed if n_samples is too small""" points = load_points() msg = ("Found array with 2 sample(s) (shape={}) while a minimum of 3 " "is required{}.".format(points.shape, context)) with pytest.raises(ValueError) as raised_error: - check_points(points, preprocessor=preprocessor, ensure_min_samples=3, - estimator=estimator) + check_input(points, type_of_inputs='classic',preprocessor=preprocessor, + ensure_min_samples=3, + estimator=estimator) assert str(raised_error.value) == msg @pytest.mark.parametrize('estimator, context', [(NCA(), " by NCA"), ('NCA', " by NCA"), (None, "")]) @pytest.mark.parametrize('load_points, preprocessor', - [(points_prep, True), (points_no_prep, False), - (points_no_prep, True)]) -def test_check_points_invalid_dtype_convertible(estimator, context, + [(points_prep, mock_preprocessor), + (points_no_prep, None), + (points_no_prep, mock_preprocessor)]) +def test_check_input_point_invalid_dtype_convertible(estimator, context, load_points, preprocessor): """Checks that a warning is raised if a convertible input is converted to float""" @@ -334,22 +357,24 @@ def test_check_points_invalid_dtype_convertible(estimator, context, msg = ("Data with input dtype object was converted to float64{}." 
.format(context)) with pytest.warns(DataConversionWarning) as raised_warning: - check_points(points, preprocessor=preprocessor, dtype=np.float64, - warn_on_dtype=True, estimator=estimator) + check_input(points, type_of_inputs='classic', + preprocessor=preprocessor, dtype=np.float64, + warn_on_dtype=True, estimator=estimator) assert str(raised_warning[0].message) == msg @pytest.mark.parametrize('preprocessor, points', - [(True, np.array([['a', 'b'], - ['e', 'b']])), - (False, np.array([[['b', 'v'], ['a', 'd']], - [['x', 'u'], ['c', 'a']]]))]) -def test_check_points_invalid_dtype_not_convertible(preprocessor, points): + [(mock_preprocessor, np.array([['a', 'b'], + ['e', 'b']])), + (None, np.array([[['b', 'v'], ['a', 'd']], + [['x', 'u'], ['c', 'a']]]))]) +def test_check_input_point_invalid_dtype_not_convertible(preprocessor, points): """Checks that a value error is thrown if attempting to convert an input not convertible to float """ with pytest.raises(ValueError): - check_points(points, preprocessor=preprocessor, dtype=np.float64) + check_input(points, type_of_inputs='classic', + preprocessor=preprocessor, dtype=np.float64) @pytest.mark.parametrize('points', @@ -361,10 +386,11 @@ def test_check_points_invalid_dtype_not_convertible(preprocessor, points): (2, 0, 2), np.array([[1.2, 2.2], [2.6, 2.3]])]) -def test_check_points_valid_with_preprocessor(points): +def test_check_input_point_valid_with_preprocessor(points): """Test that valid inputs when using a preprocessor raises no warning""" with pytest.warns(None) as record: - check_points(points, preprocessor=True) + check_input(points, type_of_inputs='classic', + preprocessor=mock_preprocessor) assert len(record) == 0 @@ -381,23 +407,24 @@ def test_check_points_valid_with_preprocessor(points): ((2, 1, 0, 2, 2, 3), (1, 2, 4, 4, 9, 3), (3, 1, 4, 4, 29, 4))]) -def test_check_points_valid_without_preprocessor(points): +def test_check_input_point_valid_without_preprocessor(points): """Test that valid inputs when using no 
preprocessor raises no warning""" with pytest.warns(None) as record: - check_points(points, preprocessor=False) + check_input(points, type_of_inputs='classic', preprocessor=None) assert len(record) == 0 -def test_check_points_behaviour_auto_dtype(points_no_prep): - """Checks that check_points allows by default every type if using a - preprocessor, and numeric types if using no preprocessor""" +def test_check_input_point_behaviour_auto_dtype(points_no_prep): + """Checks that check_input (for points) allows by default every type if + using a preprocessor, and numeric types if using no preprocessor""" points_prep = ['img1.png', 'img2.png', 'img3.png', 'img5.png'] with pytest.warns(None) as record: - check_points(points_prep, preprocessor=True) + check_input(points_prep, type_of_inputs='classic', + preprocessor=mock_preprocessor) assert len(record) == 0 with pytest.warns(None) as record: - check_points(points_no_prep) # numeric type + check_input(points_no_prep, type_of_inputs='input') # numeric type assert len(record) == 0 # not numeric type @@ -405,10 +432,10 @@ def test_check_points_behaviour_auto_dtype(points_no_prep): 'img5.png']) points_no_prep = points_no_prep.astype(object) with pytest.raises(ValueError): - check_points(points_no_prep) + check_input(points_no_prep, type_of_inputs='classic') -def test_check_points_invalid_complex_data(): +def test_check_input_point_invalid_complex_data(): """Checks that the right error message is thrown if given complex data ( this comes from sklearn's check_array's message)""" points = np.array([[[1 + 2j, 3 + 4j], [5 + 7j, 5 + 7j]], @@ -416,7 +443,7 @@ def test_check_points_invalid_complex_data(): msg = ("Complex data not supported\n" "{}\n".format(points)) with pytest.raises(ValueError) as raised_error: - check_points(points) + check_input(points, type_of_inputs='classic') assert str(raised_error.value) == msg @@ -527,8 +554,8 @@ def preprocessor(sequence): return np.ones((len(sequence), 2, 2)) # returns a 3D array instead of 
2D with pytest.raises(ValueError) as raised_error: - preprocess_tuples(np.ones((3, 2)), - estimator=estimator, preprocessor=preprocessor) + check_input(np.ones((3, 2)), type_of_inputs='tuples', + preprocessor=preprocessor) expected_msg = ("3D array of formed tuples expected{}. Found 4D " "array instead:\ninput={}. Reshape your data{}.\n" .format(context, np.ones((3, 2, 2, 2)), @@ -553,8 +580,7 @@ def preprocessor(sequence): return np.ones((len(sequence), 2, 2)) # returns a 3D array instead of 2D with pytest.raises(ValueError) as raised_error: - preprocess_points(np.ones((3,)), - estimator=estimator, preprocessor=preprocessor) + preprocess_points(np.ones((3,)), preprocessor=preprocessor) expected_msg = ("2D array of formed points expected{}. " "Found 3D array instead:\ninput={}. Reshape your data{}.\n" .format(context, np.ones((3, 2, 2)), diff --git a/test/test_weakly_supervised.py b/test/test_weakly_supervised.py index 98007423..116af5db 100644 --- a/test/test_weakly_supervised.py +++ b/test/test_weakly_supervised.py @@ -19,6 +19,12 @@ RNG = check_random_state(0) +def mock_preprocessor(indices): + """A preprocessor for testing purposes that returns an all ones 3D array + """ + return np.ones((indices.shape[0], 3)) + + # ---------------------- Test scikit-learn compatibility ---------------------- @@ -287,7 +293,7 @@ def test_dict_unchanged(estimator, build_dataset, preprocessor): ids=['itml', 'lsml', 'mmc', 'sdml']) def test_same_result_with_or_without_preprocessor(estimator, build_dataset): (X, tuples, y, tuples_train, tuples_test, y_train, - y_test, _) = build_dataset(preprocessor=True) + y_test, _) = build_dataset(preprocessor=mock_preprocessor) formed_tuples_train = X[tuples_train] formed_tuples_test = X[tuples_test] From 27e215c281d1817fbee82caf48fa8b78e1ec05b5 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 16 Oct 2018 13:40:50 +0200 Subject: [PATCH 053/120] Fixes some tests: - fixes dtype checks and conversion by ensuring the checks and 
conversions are made on the preprocessed array and not the input array (which can be an array of indices) - fix tests that were failing due to the wrong error message --- metric_learn/_util.py | 80 ++++++++++++++++++++++++++------------- test/test_utils.py | 87 +++++++++++++++++++++++++++++-------------- 2 files changed, 115 insertions(+), 52 deletions(-) diff --git a/metric_learn/_util.py b/metric_learn/_util.py index 516eb135..56eaf27d 100644 --- a/metric_learn/_util.py +++ b/metric_learn/_util.py @@ -72,15 +72,18 @@ def check_input(input, y=None, preprocessor=None, multi_output=multi_output, y_numeric=y_numeric) # we try to allow the more possible stuff here + preprocessor_has_been_applied = False if type_of_inputs == 'classic': if input.ndim == 1: if preprocessor is not None: input = preprocess_points(input, preprocessor) + preprocessor_has_been_applied = True else: - raise ValueError('Using no preprocessor, a 2D array of formed ' - 'points is expected{}. Reshape your data ' - 'and/or use a preprocessor'.format(context)) + raise ValueError('2D array of formed points expected{}. Found {}D array ' + 'instead:\ninput={}. Reshape ' + 'your data and/or use a preprocessor.\n'.format( + context, input.ndim, input)) elif input.ndim == 2: pass # OK else: @@ -93,30 +96,40 @@ def check_input(input, y=None, preprocessor=None, .format(with_prep[0], context, with_prep[1], input.ndim, input)) - input = check_array(input, allow_nd=True, ensure_2d=True, + input = check_array(input, allow_nd=True, ensure_2d=False, # arguments that come from before **args_for_sk_checks) + if input.ndim != 2: # we have to ensure this because check_array above + # does not + + raise ValueError('2D array of formed points expected{}{}. ' + 'Found {}D array ' + 'instead:\ninput={}. 
Reshape your data ' + 'and/or use a preprocessor.\n'.format(context, + ' after the preprocessor has been applied' if + preprocessor_has_been_applied else '', + input.ndim, input)) elif type_of_inputs == 'tuples': - if input.ndim == 2: - if preprocessor is not None: - check_t(input, t, context) - input = preprocess_points(input, preprocessor) - else: + check_t(input, t, context) - raise ValueError('Using no preprocessor, a 3D array of points ' - 'indicators is expected{}. Reshape your data ' - 'and/or use a preprocessor'.format(context)) + if input.ndim == 2: + if preprocessor is not None: + input = preprocess_tuples(input, preprocessor) + preprocessor_has_been_applied = True + else: + + raise ValueError('3D array of formed tuples expected{}. ' + 'Found {}D array ' + 'instead:\ninput={}. ' + 'Reshape your data ' + 'and/or use a preprocessor.\n'.format(context, + input.ndim, + input)) elif input.ndim == 3: # we should check_num_features which is not checked - # after - if ensure_min_features > 0: - n_features = input.shape[2] - if n_features < ensure_min_features: - raise ValueError("Found array with {} feature(s) (shape={}) while" - " a minimum of {} is required{}." - .format(n_features, input.shape, - ensure_min_features, context)) + # after + pass else: with_prep =(('2D array of indicators or 3D array of formed tuples', @@ -128,26 +141,43 @@ def check_input(input, y=None, preprocessor=None, .format(with_prep[0], context, with_prep[1], input.ndim, input)) - check_t(input, t, context) input = check_array(input, allow_nd=True, ensure_2d=False, - # arguments that come from before - **args_for_sk_checks) + # arguments that come from before + **args_for_sk_checks) + if ensure_min_features > 0: + n_features = input.shape[2] + if n_features < ensure_min_features: + raise ValueError("Found array with {} feature(s) (shape={}) while" + " a minimum of {} is required{}." 
+ .format(n_features, input.shape, + ensure_min_features, context)) + # normally we don't need to check_t too because t should'nt be able to + # be modified by any preprocessor if input.ndim != 3: # we have to ensure this because check_array above # does not - raise ValueError("Invalid data.") # TODO put a better error message + + raise ValueError('3D array of formed tuples expected{}{}. ' + 'Found {}D array ' + 'instead:\ninput={}. Reshape your data ' + 'and/or use a preprocessor.\n'.format(context, + ' after the preprocessor has been applied' if + preprocessor_has_been_applied else '', + input.ndim, input)) return input if y is None else (input, y) def preprocess_tuples(tuples, preprocessor): + print("Preprocessing tuples...") tuples = np.column_stack([preprocessor(tuples[:, i])[:, np.newaxis] for - i in range(tuples.shape[1])]) + i in range(tuples.shape[1])]) return tuples def preprocess_points(points, preprocessor): """form points if there is a preprocessor else keep them as such (assumes that check_points has already been called)""" + print("Preprocessing points...") points = preprocessor(points) return points diff --git a/test/test_utils.py b/test/test_utils.py index 4643e302..19fffd41 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -92,7 +92,8 @@ def test_check_input_invalid_shape(estimator, context, tuples, found, "data{}.\n" .format(expected, context, ' when using a preprocessor' if preprocessor else '', found, tuples, - ' and/or use a preprocessor' if not preprocessor else '')) + ' and/or use a preprocessor' if + (not preprocessor and tuples.ndim == 2) else '')) with pytest.raises(ValueError) as raised_error: check_input(tuples, type_of_inputs='tuples', preprocessor=preprocessor, ensure_min_samples=0, @@ -127,7 +128,10 @@ def test_check_input_invalid_n_samples(estimator, context, load_tuples, """Checks that the right warning is printed if n_samples is too small""" tuples = load_tuples() msg = ("Found array with 2 sample(s) (shape={}) while a 
minimum of 3 " - "is required{}.".format(tuples.shape, context)) + "is required{}.".format((preprocess_tuples(tuples, preprocessor) + if (preprocessor is not None and + tuples.ndim == 2) else tuples).shape, + context)) with pytest.raises(ValueError) as raised_error: check_input(tuples, type_of_inputs='tuples', preprocessor=preprocessor, @@ -145,7 +149,16 @@ def test_check_input_invalid_dtype_convertible(estimator, context, load_tuples, preprocessor): """Checks that a warning is raised if a convertible input is converted to float""" - tuples = load_tuples().astype(object) + tuples = load_tuples().astype(object) # here the object conversion is + # useless for the tuples_prep case, but this allows to test the + # tuples_prep case + + if preprocessor is not None: # if the preprocessor is not None we + # overwrite it to have a preprocessor that returns objects + def preprocessor(indices): # + # preprocessor that returns objects + return np.ones((indices.shape[0], 3)).astype(object) + msg = ("Data with input dtype object was converted to float64{}." 
.format(context)) with pytest.warns(DataConversionWarning) as raised_warning: @@ -155,20 +168,32 @@ def test_check_input_invalid_dtype_convertible(estimator, context, assert str(raised_warning[0].message) == msg -@pytest.mark.parametrize('preprocessor, tuples', - [(mock_preprocessor, np.array([['a', 'b'], - ['e', 'b']])), - (None, np.array([[['b', 'v'], ['a', 'd']], - [['x', 'u'], ['c', 'a']]]))]) -def test_check_input_invalid_dtype_not_convertible(preprocessor, tuples): +def test_check_input_invalid_dtype_not_convertible_with_preprocessor( + tuples_prep): """Checks that a value error is thrown if attempting to convert an - input not convertible to float + input not convertible to float, when using a preprocessor """ + + def preprocessor(indices): + # preprocessor that returns objects + return np.full((indices.shape[0], 3), 'a') + with pytest.raises(ValueError): - check_input(tuples, type_of_inputs='tuples', + check_input(tuples_prep, type_of_inputs='tuples', preprocessor=preprocessor, dtype=np.float64) +def test_check_input_invalid_dtype_not_convertible_without_preprocessor( + tuples_no_prep): + """Checks that a value error is thrown if attempting to convert an + input not convertible to float, when using no preprocessor + """ + tuples = np.full_like(tuples_no_prep, 'a', dtype=object) + with pytest.raises(ValueError): + check_input(tuples, type_of_inputs='tuples', + preprocessor=None, dtype=np.float64) + + @pytest.mark.parametrize('t', [2, None]) def test_check_input_valid_t(t, tuples_prep, tuples_no_prep): """For inputs that have the right matrix dimension (2D or 3D for instance), @@ -222,7 +247,7 @@ def test_check_input_valid_with_preprocessor(tuples): def test_check_input_valid_without_preprocessor(tuples): """Test that valid inputs when using no preprocessor raises no warning""" with pytest.warns(None) as record: - check_input(tuples, preprocessor=None) + check_input(tuples, type_of_inputs='tuples', preprocessor=None) assert len(record) == 0 @@ -300,7 +325,8 
@@ def test_check_input_points_invalid_shape(estimator, context, points, found, "data{}.\n" .format(expected, context, ' when using a preprocessor' if preprocessor else '', found, points, - ' and/or use a preprocessor' if not preprocessor else '')) + ' and/or use a preprocessor' if + (not preprocessor and points.ndim == 1) else '')) with pytest.raises(ValueError) as raised_error: check_input(points, type_of_inputs='classic', preprocessor=preprocessor, ensure_min_samples=0, @@ -335,7 +361,12 @@ def test_check_input_point_invalid_n_samples(estimator, context, load_points, """Checks that the right warning is printed if n_samples is too small""" points = load_points() msg = ("Found array with 2 sample(s) (shape={}) while a minimum of 3 " - "is required{}.".format(points.shape, context)) + "is required{}.".format((preprocess_points(points, + preprocessor) + if preprocessor is not None and + points.ndim == 1 else + points).shape, + context)) with pytest.raises(ValueError) as raised_error: check_input(points, type_of_inputs='classic',preprocessor=preprocessor, ensure_min_samples=3, @@ -353,7 +384,16 @@ def test_check_input_point_invalid_dtype_convertible(estimator, context, load_points, preprocessor): """Checks that a warning is raised if a convertible input is converted to float""" - points = load_points().astype(object) + points = load_points().astype(object) # here the object conversion is + # useless for the points_prep case, but this allows to test the + # points_prep case + + if preprocessor is not None: # if the preprocessor is not None we + # overwrite it to have a preprocessor that returns objects + def preprocessor(indices): # + # preprocessor that returns objects + return np.ones((indices.shape[0], 3)).astype(object) + msg = ("Data with input dtype object was converted to float64{}." 
.format(context)) with pytest.warns(DataConversionWarning) as raised_warning: @@ -543,19 +583,15 @@ def test_preprocess_tuples_invalid_message(estimator): """Checks that if the preprocessor does some weird stuff, the preprocessed input is detected as weird. Checks this for preprocess_tuples.""" - if estimator is not None: - estimator_name = make_name(estimator) + (' after the preprocessor ' + context = make_context(estimator) + (' after the preprocessor ' 'has been applied') - else: - estimator_name = ('objects that will use preprocessed tuples') - context = make_context(estimator_name) def preprocessor(sequence): return np.ones((len(sequence), 2, 2)) # returns a 3D array instead of 2D with pytest.raises(ValueError) as raised_error: check_input(np.ones((3, 2)), type_of_inputs='tuples', - preprocessor=preprocessor) + preprocessor=preprocessor, estimator=estimator) expected_msg = ("3D array of formed tuples expected{}. Found 4D " "array instead:\ninput={}. Reshape your data{}.\n" .format(context, np.ones((3, 2, 2, 2)), @@ -569,18 +605,15 @@ def test_preprocess_points_invalid_message(estimator): """Checks that if the preprocessor does some weird stuff, the preprocessed input is detected as weird. Checks this for preprocess_points.""" - if estimator is not None: - estimator_name = make_name(estimator) + (' after the preprocessor ' + context = make_context(estimator) + (' after the preprocessor ' 'has been applied') - else: - estimator_name = ('objects that will use preprocessed points') - context = make_context(estimator_name) def preprocessor(sequence): return np.ones((len(sequence), 2, 2)) # returns a 3D array instead of 2D with pytest.raises(ValueError) as raised_error: - preprocess_points(np.ones((3,)), preprocessor=preprocessor) + check_input(np.ones((3,)), type_of_inputs='classic', + preprocessor=preprocessor, estimator=estimator) expected_msg = ("2D array of formed points expected{}. " "Found 3D array instead:\ninput={}. 
Reshape your data{}.\n" .format(context, np.ones((3, 2, 2)), From e23554f39f5ba5dcec79f7085dc9d084606cfe63 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 16 Oct 2018 14:05:06 +0200 Subject: [PATCH 054/120] TST: Cherry pick from new sklearn version ac0e230000556b7c413e08b77d893bbaccbdfbcf [MRG] Quick fix of failed tests due to new scikit-learn version (0.20.0) (#130) * TST: Quick fix of failed tests due to new scikit-learn version (0.20.0) * FIX update values to pass test --- metric_learn/itml.py | 2 +- metric_learn/lmnn.py | 2 +- metric_learn/lsml.py | 2 +- metric_learn/mmc.py | 2 +- test/metric_learn_test.py | 14 +++++++------- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/metric_learn/itml.py b/metric_learn/itml.py index 61af88d7..fb8e32c5 100644 --- a/metric_learn/itml.py +++ b/metric_learn/itml.py @@ -233,7 +233,7 @@ def fit(self, X, y, random_state=np.random): self.check_preprocessor() X, y = check_input(X, y, type_of_inputs='classic', preprocessor=self.preprocessor_, - estimator=self) + estimator=self, ensure_min_samples=2) num_constraints = self.num_constraints if num_constraints is None: num_classes = len(np.unique(y)) diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py index 343a3a22..81fbee3e 100644 --- a/metric_learn/lmnn.py +++ b/metric_learn/lmnn.py @@ -53,7 +53,7 @@ def _process_inputs(self, X, labels): self.check_preprocessor() self.X_, labels = check_input(X, labels, type_of_inputs='classic', preprocessor=self.preprocessor_, - estimator=self) + estimator=self, ensure_min_samples=2) self.X_ = self.X_.astype(float) # todo: remove the conversion here and # integrate it into check_input num_pts, num_dims = self.X_.shape diff --git a/metric_learn/lsml.py b/metric_learn/lsml.py index 352a6229..3e003063 100644 --- a/metric_learn/lsml.py +++ b/metric_learn/lsml.py @@ -221,7 +221,7 @@ def fit(self, X, y, random_state=np.random): self.check_preprocessor() X, y = check_input(X, y, type_of_inputs='classic', 
preprocessor=self.preprocessor_, - estimator=self) + estimator=self, ensure_min_samples=2) num_constraints = self.num_constraints if num_constraints is None: num_classes = len(np.unique(y)) diff --git a/metric_learn/mmc.py b/metric_learn/mmc.py index fcbf8ef5..71606e10 100644 --- a/metric_learn/mmc.py +++ b/metric_learn/mmc.py @@ -455,7 +455,7 @@ def fit(self, X, y, random_state=np.random): self.check_preprocessor() X, y = check_input(X, y, type_of_inputs='classic', preprocessor=self.preprocessor_, - estimator=self) + estimator=self, ensure_min_samples=2) num_constraints = self.num_constraints if num_constraints is None: num_classes = len(np.unique(y)) diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py index aad076a0..d752298b 100644 --- a/test/metric_learn_test.py +++ b/test/metric_learn_test.py @@ -40,7 +40,7 @@ def test_iris(self): csep = class_separation(cov.transform(self.iris_points), self.iris_labels) # deterministic result - self.assertAlmostEqual(csep, 0.73068122) + self.assertAlmostEqual(csep, 0.72981476) class TestLSML(MetricTestCase): @@ -104,7 +104,7 @@ def test_iris(self): nca = NCA(max_iter=(100000//n), learning_rate=0.01, num_dims=2) nca.fit(self.iris_points, self.iris_labels) csep = class_separation(nca.transform(self.iris_points), self.iris_labels) - self.assertLess(csep, 0.15) + self.assertLess(csep, 0.20) class TestLFDA(MetricTestCase): @@ -163,16 +163,16 @@ def test_iris(self): # Full metric mmc = MMC(convergence_threshold=0.01) mmc.fit(*wrap_pairs(self.iris_points, [a,b,c,d])) - expected = [[+0.00046504, +0.00083371, -0.00111959, -0.00165265], - [+0.00083371, +0.00149466, -0.00200719, -0.00296284], - [-0.00111959, -0.00200719, +0.00269546, +0.00397881], - [-0.00165265, -0.00296284, +0.00397881, +0.00587320]] + expected = [[ 0.000514, 0.000868, -0.001195, -0.001703], + [ 0.000868, 0.001468, -0.002021, -0.002879], + [-0.001195, -0.002021, 0.002782, 0.003964], + [-0.001703, -0.002879, 0.003964, 0.005648]] 
assert_array_almost_equal(expected, mmc.metric(), decimal=6) # Diagonal metric mmc = MMC(diagonal=True) mmc.fit(*wrap_pairs(self.iris_points, [a,b,c,d])) - expected = [0, 0, 1.21045968, 1.22552608] + expected = [0, 0, 1.210220, 1.228596] assert_array_almost_equal(np.diag(expected), mmc.metric(), decimal=6) # Supervised Full From 9ded84673f3d7719d12e97988c443afd9f32af89 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 16 Oct 2018 14:40:38 +0200 Subject: [PATCH 055/120] FIX: get changes from master to pass test iris for NCA --- test/metric_learn_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py index d752298b..a2f5461c 100644 --- a/test/metric_learn_test.py +++ b/test/metric_learn_test.py @@ -90,7 +90,7 @@ def test_iris(self): n = self.iris_points.shape[0] # Without dimension reduction - nca = NCA(max_iter=(100000//n), learning_rate=0.01) + nca = NCA(max_iter=(100000//n), num_dims=2, tol=1e-9) nca.fit(self.iris_points, self.iris_labels) # Result copied from Iris example at # https://github.com/vomjom/nca/blob/master/README.mkd From 50514bcdb80b6f482b4438a2a13d1febe74d8be7 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 16 Oct 2018 15:15:51 +0200 Subject: [PATCH 056/120] FIX fix tests that were failing due to the error message FIX check_tuples at the end and only at the end, because the number of tuples should not be modified from the beginning, and we want to check them also at the end --- metric_learn/_util.py | 5 ++--- test/test_mahalanobis_mixin.py | 18 +++++++++--------- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/metric_learn/_util.py b/metric_learn/_util.py index 56eaf27d..7578b23c 100644 --- a/metric_learn/_util.py +++ b/metric_learn/_util.py @@ -111,9 +111,6 @@ def check_input(input, y=None, preprocessor=None, input.ndim, input)) elif type_of_inputs == 'tuples': - - check_t(input, t, context) - if input.ndim == 2: if preprocessor is not 
None: input = preprocess_tuples(input, preprocessor) @@ -164,6 +161,8 @@ def check_input(input, y=None, preprocessor=None, preprocessor_has_been_applied else '', input.ndim, input)) + check_t(input, t, context) + return input if y is None else (input, y) diff --git a/test/test_mahalanobis_mixin.py b/test/test_mahalanobis_mixin.py index 43d6199e..889fef85 100644 --- a/test/test_mahalanobis_mixin.py +++ b/test/test_mahalanobis_mixin.py @@ -11,6 +11,7 @@ from metric_learn import (Constraints, ITML, LSML, MMC, SDML, Covariance, LFDA, LMNN, MLKR, NCA, RCA) +from metric_learn._util import make_context from metric_learn.constraints import wrap_pairs from functools import partial @@ -141,11 +142,10 @@ def test_score_pairs_dim(estimator, build_dataset): X, _ = load_iris(return_X_y=True) tuples = np.array(list(product(X, X))) assert model.score_pairs(tuples).shape == (tuples.shape[0],) - msg = ("Expected 3D array, got 2D array instead:\ntuples={}.\n" - "Reshape your data either using tuples.reshape(-1, {}, 1) if " - "your data has a single feature or tuples.reshape(1, {}, -1) " - "if it contains a single tuple.".format(tuples, tuples.shape[1], - tuples.shape[0])) + context = make_context(estimator) + msg = ("3D array of formed tuples expected{}. Found 2D array " + "instead:\ninput={}. Reshape your data and/or use a preprocessor.\n" + .format(context, tuples[1])) with pytest.raises(ValueError) as raised_error: model.score_pairs(tuples[1]) assert str(raised_error.value) == msg @@ -188,10 +188,10 @@ def test_embed_dim(estimator, build_dataset): assert model.transform(X).shape == X.shape # assert that ValueError is thrown if input shape is 1D - err_msg = ("Expected 2D array, got 1D array instead:\narray={}.\n" - "Reshape your data either using array.reshape(-1, 1) if " - "your data has a single feature or array.reshape(1, -1) " - "if it contains a single sample.".format(X)) + context = make_context(estimator) + err_msg = ("2D array of formed points expected{}. 
Found 1D array " + "instead:\ninput={}. Reshape your data and/or use a preprocessor.\n" + .format(context, X[0])) with pytest.raises(ValueError) as raised_error: model.score_pairs(model.transform(X[0, :])) assert str(raised_error.value) == err_msg From 96b58b4df094cd4de88b38d8bb4750fa76399130 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 16 Oct 2018 15:21:51 +0200 Subject: [PATCH 057/120] TST: fix test_check_input_invalid_t that changed since we test t at the end now --- test/test_utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/test/test_utils.py b/test/test_utils.py index 19fffd41..c0f4920c 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -59,9 +59,13 @@ def test_make_name(estimator, expected): def test_check_input_invalid_t(estimator, context, load_tuples, preprocessor): """Checks that the exception are raised if t is not the one expected""" tuples = load_tuples() + preprocessed_tuples = (preprocess_tuples(tuples, preprocessor) + if (preprocessor is not None and + tuples.ndim == 2) else tuples) expected_msg = ("Tuples of 3 element(s) expected{}. 
Got tuples of 2 " "element(s) instead (shape={}):\ninput={}.\n" - .format(context, tuples.shape, tuples)) + .format(context, preprocessed_tuples.shape, + preprocessed_tuples)) with pytest.raises(ValueError) as raised_error: check_input(tuples, type_of_inputs='tuples', t=3, preprocessor=preprocessor, estimator=estimator) From 9cab2eec0fd3bd91b4ae26dc17adf80da3e95405 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 16 Oct 2018 15:33:02 +0200 Subject: [PATCH 058/120] TST fix NCA's iris test taking code from master --- test/metric_learn_test.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py index a2f5461c..53f698f0 100644 --- a/test/metric_learn_test.py +++ b/test/metric_learn_test.py @@ -90,15 +90,10 @@ def test_iris(self): n = self.iris_points.shape[0] # Without dimension reduction - nca = NCA(max_iter=(100000//n), num_dims=2, tol=1e-9) + nca = NCA(max_iter=(100000//n)) nca.fit(self.iris_points, self.iris_labels) - # Result copied from Iris example at - # https://github.com/vomjom/nca/blob/master/README.mkd - expected = [[-0.09935, -0.2215, 0.3383, 0.443], - [+0.2532, 0.5835, -0.8461, -0.8915], - [-0.729, -0.6386, 1.767, 1.832], - [-0.9405, -0.8461, 2.281, 2.794]] - assert_array_almost_equal(expected, nca.transformer_, decimal=3) + csep = class_separation(nca.transform(self.iris_points), self.iris_labels) + self.assertLess(csep, 0.15) # With dimension reduction nca = NCA(max_iter=(100000//n), learning_rate=0.01, num_dims=2) From 069b8e2cc9a574fcba2ae4d395c618643179fb4f Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 16 Oct 2018 16:59:46 +0200 Subject: [PATCH 059/120] FIX fix tests: - use check_array instead of conversion to numpy array to ensure that sparse array are kept as such and not wrapped into a regular numpy array - check_array the metric with the right arguments for covariance in order not to fail if the array is a scalar (for one-feature samples) 
--- metric_learn/_util.py | 5 ++++- metric_learn/covariance.py | 4 +++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/metric_learn/_util.py b/metric_learn/_util.py index 7578b23c..753c922a 100644 --- a/metric_learn/_util.py +++ b/metric_learn/_util.py @@ -63,7 +63,10 @@ def check_input(input, y=None, preprocessor=None, ensure_min_features=ensure_min_features, warn_on_dtype=warn_on_dtype, estimator=estimator) if y is None: - input = np.array(input, copy=False) + input = check_array(input, ensure_2d=False, allow_nd=True, + copy=False, force_all_finite=False, + accept_sparse=True, dtype=None, + ensure_min_features=0, ensure_min_samples=0) else: input, y = check_X_y(input, y, ensure_2d=False, allow_nd=True, copy=False, force_all_finite=False, diff --git a/metric_learn/covariance.py b/metric_learn/covariance.py index 8237ccc9..f1a53541 100644 --- a/metric_learn/covariance.py +++ b/metric_learn/covariance.py @@ -45,5 +45,7 @@ def fit(self, X, y=None): else: self.M_ = np.linalg.inv(self.M_) - self.transformer_ = self.transformer_from_metric(check_array(self.M_)) + self.transformer_ = self.transformer_from_metric( + check_array(self.M_, ensure_min_samples=0,ensure_min_features=0, + ensure_2d=False)) return self From e8d8795bfd8ba4629fdbd4b8e059c3bd7b7bcf42 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 16 Oct 2018 17:31:07 +0200 Subject: [PATCH 060/120] FIX fix previous modification that removed self.X_ but was modifying the input at fit time --- metric_learn/rca.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/metric_learn/rca.py b/metric_learn/rca.py index 90f1f4b4..4a592a66 100644 --- a/metric_learn/rca.py +++ b/metric_learn/rca.py @@ -73,13 +73,13 @@ def _process_data(self, X): # PCA projection to remove noise and redundant information. 
if self.pca_comps is not None: pca = decomposition.PCA(n_components=self.pca_comps) - X = pca.fit_transform(X) + X_transformed = pca.fit_transform(X) M_pca = pca.components_ else: - X -= X.mean(axis=0) + X_transformed = X - X.mean(axis=0) M_pca = None - return X, M_pca + return X_transformed, M_pca def _check_dimension(self, rank, X): d = X.shape[1] From 7c539c71f04b09af0f2d95086cdbef8d98d5e52b Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 16 Oct 2018 17:46:35 +0200 Subject: [PATCH 061/120] FIX ensure at least 2d only for checking the metric because after check_array of inputs the metric should be a scalar or an array so we only need to ensure it is an array (2D array) --- metric_learn/covariance.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/metric_learn/covariance.py b/metric_learn/covariance.py index f1a53541..786d72ac 100644 --- a/metric_learn/covariance.py +++ b/metric_learn/covariance.py @@ -45,7 +45,5 @@ def fit(self, X, y=None): else: self.M_ = np.linalg.inv(self.M_) - self.transformer_ = self.transformer_from_metric( - check_array(self.M_, ensure_min_samples=0,ensure_min_features=0, - ensure_2d=False)) + self.transformer_ = self.transformer_from_metric(np.atleast_2d(self.M_)) return self From f801fae73139a13d9df1e639aefb635694d402b7 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Fri, 9 Nov 2018 15:22:33 +0100 Subject: [PATCH 062/120] STY: Fix PEP8 violations --- metric_learn/_util.py | 72 +++++++++++++++------------------- metric_learn/base_metric.py | 6 +-- metric_learn/covariance.py | 1 - test/test_mahalanobis_mixin.py | 4 +- test/test_utils.py | 41 ++++++++++--------- test/test_weakly_supervised.py | 1 + 6 files changed, 60 insertions(+), 65 deletions(-) diff --git a/metric_learn/_util.py b/metric_learn/_util.py index 753c922a..104bd027 100644 --- a/metric_learn/_util.py +++ b/metric_learn/_util.py @@ -83,12 +83,12 @@ def check_input(input, y=None, preprocessor=None, input = preprocess_points(input, 
preprocessor) preprocessor_has_been_applied = True else: - raise ValueError('2D array of formed points expected{}. Found {}D array ' - 'instead:\ninput={}. Reshape ' - 'your data and/or use a preprocessor.\n'.format( - context, input.ndim, input)) + raise ValueError('2D array of formed points expected{}. ' + 'Found {}D array instead:\ninput={}. Reshape ' + 'your data and/or use a preprocessor.\n' + .format(context, input.ndim, input)) elif input.ndim == 2: - pass # OK + pass # OK else: with_prep = (('1D array of indicators or 2D array of formed points', ' when using a preprocessor') @@ -100,18 +100,15 @@ def check_input(input, y=None, preprocessor=None, input.ndim, input)) input = check_array(input, allow_nd=True, ensure_2d=False, - # arguments that come from before **args_for_sk_checks) if input.ndim != 2: # we have to ensure this because check_array above - # does not - + # does not raise ValueError('2D array of formed points expected{}{}. ' - 'Found {}D array ' - 'instead:\ninput={}. Reshape your data ' - 'and/or use a preprocessor.\n'.format(context, - ' after the preprocessor has been applied' if - preprocessor_has_been_applied else '', - input.ndim, input)) + 'Found {}D array instead:\ninput={}. Reshape your data ' + 'and/or use a preprocessor.\n' + .format(context, ' after the preprocessor has been ' + 'applied' if preprocessor_has_been_applied + else '', input.ndim, input)) elif type_of_inputs == 'tuples': if input.ndim == 2: @@ -121,49 +118,42 @@ def check_input(input, y=None, preprocessor=None, else: raise ValueError('3D array of formed tuples expected{}. ' - 'Found {}D array ' - 'instead:\ninput={}. ' - 'Reshape your data ' - 'and/or use a preprocessor.\n'.format(context, - input.ndim, - input)) + 'Found {}D array instead:\ninput={}. 
' + 'Reshape your data and/or use a preprocessor.\n' + .format(context, input.ndim, input)) elif input.ndim == 3: # we should check_num_features which is not checked - # after + # after pass else: - with_prep =(('2D array of indicators or 3D array of formed tuples', - ' when using a preprocessor') - if preprocessor is not None else - ('3D array of formed tuples', '')) + with_prep = (('2D array of indicators or 3D array of formed tuples', + ' when using a preprocessor') + if preprocessor is not None else + ('3D array of formed tuples', '')) raise ValueError("{} expected{}{}. Found {}D array " "instead:\ninput={}. Reshape your data.\n" .format(with_prep[0], context, with_prep[1], input.ndim, input)) input = check_array(input, allow_nd=True, ensure_2d=False, - # arguments that come from before - **args_for_sk_checks) + **args_for_sk_checks) if ensure_min_features > 0: n_features = input.shape[2] if n_features < ensure_min_features: raise ValueError("Found array with {} feature(s) (shape={}) while" - " a minimum of {} is required{}." - .format(n_features, input.shape, - ensure_min_features, context)) - # normally we don't need to check_t too because t should'nt be able to + " a minimum of {} is required{}." + .format(n_features, input.shape, + ensure_min_features, context)) + # normally we don't need to check_t too because t should'nt be able to # be modified by any preprocessor if input.ndim != 3: # we have to ensure this because check_array above # does not - raise ValueError('3D array of formed tuples expected{}{}. ' - 'Found {}D array ' - 'instead:\ninput={}. Reshape your data ' - 'and/or use a preprocessor.\n'.format(context, - ' after the preprocessor has been applied' if - preprocessor_has_been_applied else '', - input.ndim, input)) - + 'Found {}D array instead:\ninput={}. 
Reshape your data ' + 'and/or use a preprocessor.\n' + .format(context, ' after the preprocessor has been ' + 'applied' if preprocessor_has_been_applied + else '', input.ndim, input)) check_t(input, t, context) return input if y is None else (input, y) @@ -172,7 +162,7 @@ def check_input(input, y=None, preprocessor=None, def preprocess_tuples(tuples, preprocessor): print("Preprocessing tuples...") tuples = np.column_stack([preprocessor(tuples[:, i])[:, np.newaxis] for - i in range(tuples.shape[1])]) + i in range(tuples.shape[1])]) return tuples @@ -221,7 +211,7 @@ def check_t(tuples, t, context): raise ValueError(msg_t) -class ArrayIndexer(): +class ArrayIndexer: def __init__(self, X): self.X = X diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index 0af0dcca..b8730085 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -189,7 +189,7 @@ def predict(self, pairs): """ pairs = check_input(pairs, type_of_inputs='tuples', preprocessor=self.preprocessor_, - estimator=self, t=self._t) + estimator=self, t=self._t) return self.score_pairs(pairs) def decision_function(self, pairs): @@ -251,8 +251,8 @@ def predict(self, quadruplets): Predictions of the ordering of pairs, for each quadruplet. """ quadruplets = check_input(quadruplets, type_of_inputs='tuples', - preprocessor=self.preprocessor_, - estimator=self, t=self._t) + preprocessor=self.preprocessor_, + estimator=self, t=self._t) # we broadcast with ... 
because here we allow quadruplets to be # either a 3D array of points or 2D array of indices return (self.score_pairs(quadruplets[:, :2, ...]) - diff --git a/metric_learn/covariance.py b/metric_learn/covariance.py index 786d72ac..81b6065f 100644 --- a/metric_learn/covariance.py +++ b/metric_learn/covariance.py @@ -10,7 +10,6 @@ from __future__ import absolute_import import numpy as np -from sklearn.utils.validation import check_array from sklearn.base import TransformerMixin from metric_learn._util import check_input diff --git a/test/test_mahalanobis_mixin.py b/test/test_mahalanobis_mixin.py index 889fef85..ee0c29e6 100644 --- a/test/test_mahalanobis_mixin.py +++ b/test/test_mahalanobis_mixin.py @@ -190,8 +190,8 @@ def test_embed_dim(estimator, build_dataset): # assert that ValueError is thrown if input shape is 1D context = make_context(estimator) err_msg = ("2D array of formed points expected{}. Found 1D array " - "instead:\ninput={}. Reshape your data and/or use a preprocessor.\n" - .format(context, X[0])) + "instead:\ninput={}. 
Reshape your data and/or use a " + "preprocessor.\n".format(context, X[0])) with pytest.raises(ValueError) as raised_error: model.score_pairs(model.transform(X[0, :])) assert str(raised_error.value) == err_msg diff --git a/test/test_utils.py b/test/test_utils.py index c0f4920c..d3c5ea67 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -13,6 +13,7 @@ MMC_Supervised, RCA_Supervised, SDML_Supervised) from sklearn.datasets import make_regression, make_blobs + def mock_preprocessor(indices): """A preprocessor for testing purposes that returns an all ones 3D array """ @@ -20,6 +21,7 @@ def mock_preprocessor(indices): # ---------------------------- test check_input ---------------------------- + @pytest.fixture def tuples_prep(): """Basic array for testing when using a preprocessor""" @@ -82,12 +84,13 @@ def test_check_input_invalid_t(estimator, context, load_tuples, preprocessor): 'of formed tuples', mock_preprocessor), ([1, 2], '1', '3D array of formed tuples', None), ([[[[5]]]], '4', '2D array of indicators or 3D array' - ' of formed tuples', mock_preprocessor), + ' of formed tuples', + mock_preprocessor), ([[[[5]]]], '4', '3D array of formed tuples', None), ([[1], [3]], '2', '3D array of formed ' 'tuples', None)]) def test_check_input_invalid_shape(estimator, context, tuples, found, - expected, preprocessor): + expected, preprocessor): """Checks that a value error with the appropriate message is raised if shape is invalid (not 2D with preprocessor or 3D with no preprocessor) """ @@ -128,13 +131,13 @@ def test_check_input_invalid_n_features(estimator, context, tuples_no_prep): (tuples_no_prep, None), (tuples_no_prep, mock_preprocessor)]) def test_check_input_invalid_n_samples(estimator, context, load_tuples, - preprocessor): + preprocessor): """Checks that the right warning is printed if n_samples is too small""" tuples = load_tuples() msg = ("Found array with 2 sample(s) (shape={}) while a minimum of 3 " "is required{}.".format((preprocess_tuples(tuples, 
preprocessor) if (preprocessor is not None and - tuples.ndim == 2) else tuples).shape, + tuples.ndim == 2) else tuples).shape, context)) with pytest.raises(ValueError) as raised_error: check_input(tuples, type_of_inputs='tuples', @@ -150,7 +153,7 @@ def test_check_input_invalid_n_samples(estimator, context, load_tuples, (tuples_no_prep, None), (tuples_no_prep, mock_preprocessor)]) def test_check_input_invalid_dtype_convertible(estimator, context, - load_tuples, preprocessor): + load_tuples, preprocessor): """Checks that a warning is raised if a convertible input is converted to float""" tuples = load_tuples().astype(object) # here the object conversion is @@ -158,7 +161,7 @@ def test_check_input_invalid_dtype_convertible(estimator, context, # tuples_prep case if preprocessor is not None: # if the preprocessor is not None we - # overwrite it to have a preprocessor that returns objects + # overwrite it to have a preprocessor that returns objects def preprocessor(indices): # # preprocessor that returns objects return np.ones((indices.shape[0], 3)).astype(object) @@ -319,7 +322,7 @@ def points_no_prep(): mock_preprocessor), ([[[5]]], '3', '2D array of formed points', None)]) def test_check_input_points_invalid_shape(estimator, context, points, found, - expected, preprocessor): + expected, preprocessor): """Checks that a value error with the appropriate message is raised if shape is invalid (valid being 1D or 2D with preprocessor or 2D with no preprocessor) @@ -340,7 +343,8 @@ def test_check_input_points_invalid_shape(estimator, context, points, found, @pytest.mark.parametrize('estimator, context', [(NCA(), " by NCA"), ('NCA', " by NCA"), (None, "")]) -def test_check_input_invalid_n_features(estimator, context, points_no_prep): +def test_check_input_point_invalid_n_features(estimator, context, + points_no_prep): """Checks that the right warning is printed if not enough features Here we only test if no preprocessor (otherwise we don't ensure this) """ @@ -350,7 +354,7 @@ 
def test_check_input_invalid_n_features(estimator, context, points_no_prep): with pytest.raises(ValueError) as raised_error: check_input(points_no_prep, type_of_inputs='classic', preprocessor=None, ensure_min_features=3, - estimator=estimator) + estimator=estimator) assert str(raised_error.value) == msg @@ -361,18 +365,18 @@ def test_check_input_invalid_n_features(estimator, context, points_no_prep): (points_no_prep, None), (points_no_prep, mock_preprocessor)]) def test_check_input_point_invalid_n_samples(estimator, context, load_points, - preprocessor): + preprocessor): """Checks that the right warning is printed if n_samples is too small""" points = load_points() msg = ("Found array with 2 sample(s) (shape={}) while a minimum of 3 " "is required{}.".format((preprocess_points(points, - preprocessor) + preprocessor) if preprocessor is not None and - points.ndim == 1 else + points.ndim == 1 else points).shape, context)) with pytest.raises(ValueError) as raised_error: - check_input(points, type_of_inputs='classic',preprocessor=preprocessor, + check_input(points, type_of_inputs='classic', preprocessor=preprocessor, ensure_min_samples=3, estimator=estimator) assert str(raised_error.value) == msg @@ -385,7 +389,8 @@ def test_check_input_point_invalid_n_samples(estimator, context, load_points, (points_no_prep, None), (points_no_prep, mock_preprocessor)]) def test_check_input_point_invalid_dtype_convertible(estimator, context, - load_points, preprocessor): + load_points, + preprocessor): """Checks that a warning is raised if a convertible input is converted to float""" points = load_points().astype(object) # here the object conversion is @@ -393,8 +398,8 @@ def test_check_input_point_invalid_dtype_convertible(estimator, context, # points_prep case if preprocessor is not None: # if the preprocessor is not None we - # overwrite it to have a preprocessor that returns objects - def preprocessor(indices): # + # overwrite it to have a preprocessor that returns objects + def 
preprocessor(indices): # preprocessor that returns objects return np.ones((indices.shape[0], 3)).astype(object) @@ -588,7 +593,7 @@ def test_preprocess_tuples_invalid_message(estimator): input is detected as weird. Checks this for preprocess_tuples.""" context = make_context(estimator) + (' after the preprocessor ' - 'has been applied') + 'has been applied') def preprocessor(sequence): return np.ones((len(sequence), 2, 2)) # returns a 3D array instead of 2D @@ -610,7 +615,7 @@ def test_preprocess_points_invalid_message(estimator): input is detected as weird. Checks this for preprocess_points.""" context = make_context(estimator) + (' after the preprocessor ' - 'has been applied') + 'has been applied') def preprocessor(sequence): return np.ones((len(sequence), 2, 2)) # returns a 3D array instead of 2D diff --git a/test/test_weakly_supervised.py b/test/test_weakly_supervised.py index 116af5db..41b31807 100644 --- a/test/test_weakly_supervised.py +++ b/test/test_weakly_supervised.py @@ -19,6 +19,7 @@ RNG = check_random_state(0) + def mock_preprocessor(indices): """A preprocessor for testing purposes that returns an all ones 3D array """ From b38b223e469642c3a8873bde53904183bd11ba6b Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Mon, 12 Nov 2018 14:00:56 +0100 Subject: [PATCH 063/120] MAINT: Refactor error messages with the help of numerical codes --- metric_learn/_util.py | 86 ++++++++++++++++++++++++------------------- 1 file changed, 48 insertions(+), 38 deletions(-) diff --git a/metric_learn/_util.py b/metric_learn/_util.py index 104bd027..8de05d7c 100644 --- a/metric_learn/_util.py +++ b/metric_learn/_util.py @@ -83,32 +83,23 @@ def check_input(input, y=None, preprocessor=None, input = preprocess_points(input, preprocessor) preprocessor_has_been_applied = True else: - raise ValueError('2D array of formed points expected{}. ' - 'Found {}D array instead:\ninput={}. 
Reshape ' - 'your data and/or use a preprocessor.\n' - .format(context, input.ndim, input)) + make_error_input(101, input, context) elif input.ndim == 2: pass # OK else: - with_prep = (('1D array of indicators or 2D array of formed points', - ' when using a preprocessor') - if preprocessor is not None else - ('2D array of formed points', '')) - raise ValueError("{} expected{}{}. Found {}D array " - "instead:\ninput={}. Reshape your data.\n" - .format(with_prep[0], context, with_prep[1], - input.ndim, input)) + if preprocessor is not None: + make_error_input(320, input, context) + else: + make_error_input(100, input, context) input = check_array(input, allow_nd=True, ensure_2d=False, **args_for_sk_checks) if input.ndim != 2: # we have to ensure this because check_array above # does not - raise ValueError('2D array of formed points expected{}{}. ' - 'Found {}D array instead:\ninput={}. Reshape your data ' - 'and/or use a preprocessor.\n' - .format(context, ' after the preprocessor has been ' - 'applied' if preprocessor_has_been_applied - else '', input.ndim, input)) + if preprocessor_has_been_applied: + make_error_input(111, input, context) + else: + make_error_input(101, input, context) elif type_of_inputs == 'tuples': if input.ndim == 2: @@ -116,24 +107,15 @@ def check_input(input, y=None, preprocessor=None, input = preprocess_tuples(input, preprocessor) preprocessor_has_been_applied = True else: - - raise ValueError('3D array of formed tuples expected{}. ' - 'Found {}D array instead:\ninput={}. ' - 'Reshape your data and/or use a preprocessor.\n' - .format(context, input.ndim, input)) + make_error_input(201, input, context) elif input.ndim == 3: # we should check_num_features which is not checked # after pass else: - - with_prep = (('2D array of indicators or 3D array of formed tuples', - ' when using a preprocessor') - if preprocessor is not None else - ('3D array of formed tuples', '')) - raise ValueError("{} expected{}{}. 
Found {}D array " - "instead:\ninput={}. Reshape your data.\n" - .format(with_prep[0], context, with_prep[1], - input.ndim, input)) + if preprocessor is not None: + make_error_input(420, input, context) + else: + make_error_input(200, input, context) input = check_array(input, allow_nd=True, ensure_2d=False, **args_for_sk_checks) @@ -148,17 +130,45 @@ def check_input(input, y=None, preprocessor=None, # be modified by any preprocessor if input.ndim != 3: # we have to ensure this because check_array above # does not - raise ValueError('3D array of formed tuples expected{}{}. ' - 'Found {}D array instead:\ninput={}. Reshape your data ' - 'and/or use a preprocessor.\n' - .format(context, ' after the preprocessor has been ' - 'applied' if preprocessor_has_been_applied - else '', input.ndim, input)) + if preprocessor_has_been_applied: + make_error_input(211, input, context) + else: + make_error_input(201, input, context) check_t(input, t, context) return input if y is None else (input, y) +def make_error_input(code, input, context): + + code_str = {'expected_input': {'1': '2D array of formed points', + '2': '3D array of formed tuples', + '3': ('1D array of indicators or 2D array of ' + 'formed points'), + '4': ('2D array of indicators or 3D array ' + 'of formed tuples')}, + 'additional_context': {'0': '', + '2': ' when using a preprocessor', + '1': (' after the preprocessor has been ' + 'applied')}, + 'possible_preprocessor': {'0': '', + '1': ' and/or use a preprocessor' + }} + code_list = list(str(code)) + err_args = dict(expected_input=code_str['expected_input'][code_list[0]], + additional_context=code_str['additional_context'] + [code_list[1]], + possible_preprocessor=code_str['possible_preprocessor'] + [code_list[2]], + input=input, context=context, found_size=input.ndim) + err_msg = ('{expected_input} expected' + '{context}{additional_context}. Found {found_size}D array ' + 'instead:\ninput={input}. 
Reshape your data' + '{possible_preprocessor}.\n') + raise ValueError(err_msg.format_map(err_args)) + # raise ValueError(code, err_msg.format_map(err_args)) + + def preprocess_tuples(tuples, preprocessor): print("Preprocessing tuples...") tuples = np.column_stack([preprocessor(tuples[:, i])[:, np.newaxis] for From cc6d6616000aa4b072d17c34c9a4b1daba4606c8 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Mon, 12 Nov 2018 15:35:28 +0100 Subject: [PATCH 064/120] MAINT: mutualize check_preprocessor and check_input for every estimator --- metric_learn/_util.py | 2 +- metric_learn/base_metric.py | 10 +++++++++- metric_learn/covariance.py | 5 +---- metric_learn/itml.py | 11 +++-------- metric_learn/lfda.py | 4 +--- metric_learn/lmnn.py | 6 ++---- metric_learn/lsml.py | 11 +++-------- metric_learn/mlkr.py | 5 +---- metric_learn/mmc.py | 11 +++-------- metric_learn/nca.py | 5 +---- metric_learn/rca.py | 9 ++------- metric_learn/sdml.py | 10 +++------- 12 files changed, 30 insertions(+), 59 deletions(-) diff --git a/metric_learn/_util.py b/metric_learn/_util.py index 8de05d7c..4b038396 100644 --- a/metric_learn/_util.py +++ b/metric_learn/_util.py @@ -154,7 +154,7 @@ def make_error_input(code, input, context): 'possible_preprocessor': {'0': '', '1': ' and/or use a preprocessor' }} - code_list = list(str(code)) + code_list = str(code) err_args = dict(expected_input=code_str['expected_input'][code_list[0]], additional_context=code_str['additional_context'] [code_list[1]], diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index b8730085..c28a3c8c 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -37,11 +37,19 @@ def score_pairs(self, pairs): The score of every pair. 
""" - def check_preprocessor(self): + def initialize_and_check_inputs(self, X, y=None, + type_of_inputs='classic', + **kwargs): if _is_arraylike(self.preprocessor): self.preprocessor_ = ArrayIndexer(self.preprocessor) else: self.preprocessor_ = self.preprocessor + return check_input(X, y, + type_of_inputs=type_of_inputs, + preprocessor=self.preprocessor_, + estimator=self, + t=self._t if hasattr(self, '_t') else None, + **kwargs) class MetricTransformer(six.with_metaclass(ABCMeta)): diff --git a/metric_learn/covariance.py b/metric_learn/covariance.py index 81b6065f..d827235a 100644 --- a/metric_learn/covariance.py +++ b/metric_learn/covariance.py @@ -34,10 +34,7 @@ def fit(self, X, y=None): X : data matrix, (n x d) y : unused """ - self.check_preprocessor() - self.X_ = check_input(X, type_of_inputs='classic', - ensure_min_samples=2, estimator=self, - preprocessor=self.preprocessor_) + self.X_ = self.initialize_and_check_inputs(X, ensure_min_samples=2) self.M_ = np.cov(self.X_, rowvar = False) if self.M_.ndim == 0: self.M_ = 1./self.M_ diff --git a/metric_learn/itml.py b/metric_learn/itml.py index fb8e32c5..032665eb 100644 --- a/metric_learn/itml.py +++ b/metric_learn/itml.py @@ -60,10 +60,8 @@ def __init__(self, gamma=1., max_iter=1000, convergence_threshold=1e-3, super(_BaseITML, self).__init__(preprocessor) def _process_pairs(self, pairs, y, bounds): - self.check_preprocessor() - pairs, y = check_input(pairs, y, type_of_inputs='tuples', - estimator=self, t=self._t, - preprocessor=self.preprocessor_) + pairs, y = self.initialize_and_check_inputs(pairs, y, + type_of_inputs='tuples') # check to make sure that no two constrained vectors are identical pos_pairs, neg_pairs = pairs[y == 1], pairs[y == -1] @@ -230,10 +228,7 @@ def fit(self, X, y, random_state=np.random): random_state : numpy.random.RandomState, optional If provided, controls random number generation. 
""" - self.check_preprocessor() - X, y = check_input(X, y, type_of_inputs='classic', - preprocessor=self.preprocessor_, - estimator=self, ensure_min_samples=2) + X, y = self.initialize_and_check_inputs(X, y, ensure_min_samples=2) num_constraints = self.num_constraints if num_constraints is None: num_classes = len(np.unique(y)) diff --git a/metric_learn/lfda.py b/metric_learn/lfda.py index 497e2ecc..b8a02cdf 100644 --- a/metric_learn/lfda.py +++ b/metric_learn/lfda.py @@ -62,9 +62,7 @@ def __init__(self, num_dims=None, k=None, embedding_type='weighted', def _process_inputs(self, X, y): unique_classes, y = np.unique(y, return_inverse=True) - self.check_preprocessor() - self.X_, y = check_input(X, y, type_of_inputs='classic', estimator=self, - preprocessor=self.preprocessor_) + self.X_, y = self.initialize_and_check_inputs(X, y) n, d = self.X_.shape num_classes = len(unique_classes) diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py index 81fbee3e..e3090d4e 100644 --- a/metric_learn/lmnn.py +++ b/metric_learn/lmnn.py @@ -50,10 +50,8 @@ def __init__(self, k=3, min_iter=50, max_iter=1000, learn_rate=1e-7, class python_LMNN(_base_LMNN): def _process_inputs(self, X, labels): - self.check_preprocessor() - self.X_, labels = check_input(X, labels, type_of_inputs='classic', - preprocessor=self.preprocessor_, - estimator=self, ensure_min_samples=2) + self.X_, labels = self.initialize_and_check_inputs(X, labels, + ensure_min_samples=2) self.X_ = self.X_.astype(float) # todo: remove the conversion here and # integrate it into check_input num_pts, num_dims = self.X_.shape diff --git a/metric_learn/lsml.py b/metric_learn/lsml.py index 3e003063..fdad616c 100644 --- a/metric_learn/lsml.py +++ b/metric_learn/lsml.py @@ -45,10 +45,8 @@ def __init__(self, tol=1e-3, max_iter=1000, prior=None, verbose=False, super(_BaseLSML, self).__init__(preprocessor) def _prepare_quadruplets(self, quadruplets, weights): - self.check_preprocessor() - quadruplets = check_input(quadruplets, 
type_of_inputs='tuples', - estimator=self, t=self._t, - preprocessor=self.preprocessor_) + quadruplets = self.initialize_and_check_inputs(quadruplets, + type_of_inputs='tuples') # check to make sure that no two constrained vectors are identical self.vab_ = quadruplets[:, 0, :] - quadruplets[:, 1, :] @@ -218,10 +216,7 @@ def fit(self, X, y, random_state=np.random): random_state : numpy.random.RandomState, optional If provided, controls random number generation. """ - self.check_preprocessor() - X, y = check_input(X, y, type_of_inputs='classic', - preprocessor=self.preprocessor_, - estimator=self, ensure_min_samples=2) + X, y = self.initialize_and_check_inputs(X, y, ensure_min_samples=2) num_constraints = self.num_constraints if num_constraints is None: num_classes = len(np.unique(y)) diff --git a/metric_learn/mlkr.py b/metric_learn/mlkr.py index 83240dce..c6cee563 100644 --- a/metric_learn/mlkr.py +++ b/metric_learn/mlkr.py @@ -59,10 +59,7 @@ def __init__(self, num_dims=None, A0=None, epsilon=0.01, alpha=0.0001, super(MLKR, self).__init__(preprocessor) def _process_inputs(self, X, y): - self.check_preprocessor() - self.X_, y = check_input(X, y, type_of_inputs='classic', - y_numeric=True, estimator=self, - preprocessor=self.preprocessor_) + self.X_, y = self.initialize_and_check_inputs(X, y, y_numeric=True) n, d = self.X_.shape if y.shape[0] != n: raise ValueError('Data and label lengths mismatch: %d != %d' diff --git a/metric_learn/mmc.py b/metric_learn/mmc.py index 71606e10..b923335e 100644 --- a/metric_learn/mmc.py +++ b/metric_learn/mmc.py @@ -66,7 +66,6 @@ def __init__(self, max_iter=100, max_proj=10000, convergence_threshold=1e-3, super(_BaseMMC, self).__init__(preprocessor) def _fit(self, pairs, y): - self.check_preprocessor() pairs, y = self._process_pairs(pairs, y) if self.diagonal: return self._fit_diag(pairs, y) @@ -74,9 +73,8 @@ def _fit(self, pairs, y): return self._fit_full(pairs, y) def _process_pairs(self, pairs, y): - pairs, y = check_input(pairs, y, 
type_of_inputs='tuples', - estimator=self, t=self._t, - preprocessor=self.preprocessor_) + pairs, y = self.initialize_and_check_inputs(pairs, y, + type_of_inputs='tuples') # check to make sure that no two constrained vectors are identical pos_pairs, neg_pairs = pairs[y == 1], pairs[y == -1] @@ -452,10 +450,7 @@ def fit(self, X, y, random_state=np.random): random_state : numpy.random.RandomState, optional If provided, controls random number generation. """ - self.check_preprocessor() - X, y = check_input(X, y, type_of_inputs='classic', - preprocessor=self.preprocessor_, - estimator=self, ensure_min_samples=2) + X, y = self.initialize_and_check_inputs(X, y, ensure_min_samples=2) num_constraints = self.num_constraints if num_constraints is None: num_classes = len(np.unique(y)) diff --git a/metric_learn/nca.py b/metric_learn/nca.py index 79f51e34..ae7876ea 100644 --- a/metric_learn/nca.py +++ b/metric_learn/nca.py @@ -35,10 +35,7 @@ def fit(self, X, y): X: data matrix, (n x d) y: scalar labels, (n) """ - self.check_preprocessor() - - X, labels = check_input(X, y, type_of_inputs='classic', estimator=self, - preprocessor=self.preprocessor_) + X, labels = self.initialize_and_check_inputs(X, y) n, d = X.shape num_dims = self.num_dims if num_dims is None: diff --git a/metric_learn/rca.py b/metric_learn/rca.py index 4a592a66..42d38555 100644 --- a/metric_learn/rca.py +++ b/metric_learn/rca.py @@ -65,10 +65,7 @@ def __init__(self, num_dims=None, pca_comps=None, preprocessor=None): super(RCA, self).__init__(preprocessor) def _process_data(self, X): - self.check_preprocessor() - X = check_input(X, type_of_inputs='classic', - preprocessor=self.preprocessor_, - estimator=self) + X = self.initialize_and_check_inputs(X) # PCA projection to remove noise and redundant information. if self.pca_comps is not None: @@ -181,9 +178,7 @@ def fit(self, X, y, random_state=np.random): y : (n) data labels random_state : a random.seed object to fix the random_state if needed. 
""" - self.check_preprocessor() - X, y = check_input(X, y, type_of_inputs='classic', estimator=self, - preprocessor=self.preprocessor_) + X, y = self.initialize_and_check_inputs(X, y) chunks = Constraints(y).chunks(num_chunks=self.num_chunks, chunk_size=self.chunk_size, random_state=random_state) diff --git a/metric_learn/sdml.py b/metric_learn/sdml.py index 16cca5c4..83d51acc 100644 --- a/metric_learn/sdml.py +++ b/metric_learn/sdml.py @@ -51,10 +51,8 @@ def __init__(self, balance_param=0.5, sparsity_param=0.01, use_cov=True, super(_BaseSDML, self).__init__(preprocessor) def _prepare_pairs(self, pairs, y): - self.check_preprocessor() - pairs, y = check_input(pairs, y, type_of_inputs='tuples', - estimator=self, t=self._t, - preprocessor=self.preprocessor_) + pairs, y = self.initialize_and_check_inputs(pairs, y, + type_of_inputs='tuples') # set up prior M if self.use_cov: @@ -161,9 +159,7 @@ def fit(self, X, y, random_state=np.random): self : object Returns the instance. """ - self.check_preprocessor() - X, y = check_input(X, y, type_of_inputs='classic', estimator=self, - preprocessor=self.preprocessor_) + X, y = self.initialize_and_check_inputs(X, y) num_constraints = self.num_constraints if num_constraints is None: num_classes = len(np.unique(y)) From 192a0426a3e570d9017fbc05185d1bd561f7be75 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Mon, 12 Nov 2018 15:44:53 +0100 Subject: [PATCH 065/120] FIX: remove format_map for python2.7 compatibility --- metric_learn/_util.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/metric_learn/_util.py b/metric_learn/_util.py index 4b038396..ba6bdff9 100644 --- a/metric_learn/_util.py +++ b/metric_learn/_util.py @@ -165,8 +165,7 @@ def make_error_input(code, input, context): '{context}{additional_context}. Found {found_size}D array ' 'instead:\ninput={input}. 
Reshape your data' '{possible_preprocessor}.\n') - raise ValueError(err_msg.format_map(err_args)) - # raise ValueError(code, err_msg.format_map(err_args)) + raise ValueError(err_msg.format(**err_args)) def preprocess_tuples(tuples, preprocessor): From 0328941e8fae6b4128eebe137492cd9612ce8a26 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Mon, 12 Nov 2018 17:23:54 +0100 Subject: [PATCH 066/120] DOC: Add docstring for check_input and fix some bugs --- metric_learn/_util.py | 94 +++++++++++++++++++++++++++---------- metric_learn/base_metric.py | 9 ++-- metric_learn/lmnn.py | 7 +-- 3 files changed, 76 insertions(+), 34 deletions(-) diff --git a/metric_learn/_util.py b/metric_learn/_util.py index ba6bdff9..cb9662e4 100644 --- a/metric_learn/_util.py +++ b/metric_learn/_util.py @@ -21,38 +21,83 @@ def check_input(input, y=None, preprocessor=None, multi_output=False, ensure_min_samples=1, ensure_min_features=1, y_numeric=False, warn_on_dtype=False, estimator=None): - """Checks that the input format is valid and does conversions if specified + """Checks that the input format is valid, and converts it if specified (this is the equivalent of scikit-learn's `check_array` or `check_X_y`). - All arguments following t are scikit-learn's `check_array` or `check_X_y` - arguments that will be enforced on the output array + All arguments following t are scikit-learn's `check_X_y` + arguments that will be enforced on the data and labels array. If + indicators are given as an input data array, the returned data array + will be the formed points/tuples, using the given preprocessor. Parameters ---------- - input : object - The input to check - y : object (optional, default=None) - The - preprocessor - type_of_inputs - t - accept_sparse - dtype - order - copy - force_all_finite - multi_output - ensure_min_samples - ensure_min_features - y_numeric - warn_on_dtype - estimator + input: array-like + The input to check. 
+ + preprocessor: callable (default=None) + The preprocessor to use. If None, no preprocessor is used. + + type_of_inputs: `str` {'classic', 'tuples'} + The type of inputs to check. If 'classic', the input should be + a 2D array-like of points or a 1D array like of indicators of points. If + 'tuples', the input should be a 3D array-like of tuples or a 2D + array-like of indicators of tuples. + + accept_sparse: `bool` + Set to true to allow sparse inputs (only works for sparse inputs with + dim < 3). + + t : int + The number of elements in a tuple (e.g. 2 for pairs). + + dtype : string, type, list of types or None (default="auto") + Data type of result. If None, the dtype of the input is preserved. + If "numeric", dtype is preserved unless array.dtype is object. + If dtype is a list of types, conversion on the first type is only + performed if the dtype of the input is not in the list. If + "auto", will we be set to "numeric" if `preprocessor=True`, + else to None. + + order : 'F', 'C' or None (default=None) + Whether an array will be forced to be fortran or c-style. + + copy : boolean (default=False) + Whether a forced copy will be triggered. If copy=False, a copy might + be triggered by a conversion. + + force_all_finite : boolean or 'allow-nan', (default=True) + Whether to raise an error on np.inf and np.nan in X. This parameter + does not influence whether y can have np.inf or np.nan values. + The possibilities are: + - True: Force all values of X to be finite. + - False: accept both np.inf and np.nan in X. + - 'allow-nan': accept only np.nan values in X. Values cannot be + infinite. + + ensure_min_samples : int (default=1) + Make sure that X has a minimum number of samples in its first + axis (rows for a 2D array). + + ensure_min_features : int (default=1) + Make sure that the 2D array has some minimum number of features + (columns). The default value of 1 rejects empty datasets. 
+ This check is only enforced when X has effectively 2 dimensions or + is originally 1D and ``ensure_2d`` is True. Setting to 0 disables + this check. + + warn_on_dtype : boolean (default=False) + Raise DataConversionWarning if the dtype of the input data structure + does not match the requested dtype, causing a memory copy. + + estimator : str or estimator instance (default=None) + If passed, include the name of the estimator in warning messages. Returns ------- - + X : `numpy.ndarray` + The checked input data array. + y: `numpy.ndarray` (optional) + The checked input labels array. """ - # todo: faire attention a la copie - # todo: faire attention aux trucs sparses context = make_context(estimator) @@ -140,7 +185,6 @@ def check_input(input, y=None, preprocessor=None, def make_error_input(code, input, context): - code_str = {'expected_input': {'1': '2D array of formed points', '2': '3D array of formed tuples', '3': ('1D array of indicators or 2D array of ' diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index c28a3c8c..a6da46eb 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -37,13 +37,16 @@ def score_pairs(self, pairs): The score of every pair. 
""" - def initialize_and_check_inputs(self, X, y=None, - type_of_inputs='classic', - **kwargs): + def check_preprocessor(self): if _is_arraylike(self.preprocessor): self.preprocessor_ = ArrayIndexer(self.preprocessor) else: self.preprocessor_ = self.preprocessor + + def initialize_and_check_inputs(self, X, y=None, + type_of_inputs='classic', + **kwargs): + self.check_preprocessor() return check_input(X, y, type_of_inputs=type_of_inputs, preprocessor=self.preprocessor_, diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py index e3090d4e..a9f3c309 100644 --- a/metric_learn/lmnn.py +++ b/metric_learn/lmnn.py @@ -257,12 +257,7 @@ class LMNN(_base_LMNN): """ def fit(self, X, y): - self.check_preprocessor() - self.X_, y = check_input(X, y, type_of_inputs='classic', - estimator=self, - preprocessor=self.preprocessor_) - self.X_ = self.X_.astype(float) # todo: remove the conversion here and - # integrate it into check_input + self.X_, y = self.initialize_and_check_inputs(X, y, dtype=float) self.X_ = self.X_, preprocessor=self.preprocessor_ labels = MulticlassLabels(y) self._lmnn = shogun_LMNN(RealFeatures(self.X_.T), labels, self.k) From 00078c20bb70ffef58ae9e52f82a4ab48d26479b Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Mon, 12 Nov 2018 17:49:28 +0100 Subject: [PATCH 067/120] DOC: add docstrings --- metric_learn/_util.py | 6 +++++- metric_learn/base_metric.py | 32 ++++++++++++++++++++++++++++++-- 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/metric_learn/_util.py b/metric_learn/_util.py index cb9662e4..151f4226 100644 --- a/metric_learn/_util.py +++ b/metric_learn/_util.py @@ -31,7 +31,10 @@ def check_input(input, y=None, preprocessor=None, Parameters ---------- input: array-like - The input to check. + The input data array to check. + + y : array-like + The input labels array to check. preprocessor: callable (default=None) The preprocessor to use. If None, no preprocessor is used. 
@@ -95,6 +98,7 @@ def check_input(input, y=None, preprocessor=None, ------- X : `numpy.ndarray` The checked input data array. + y: `numpy.ndarray` (optional) The checked input labels array. """ diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index a6da46eb..9536c960 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -38,14 +38,42 @@ def score_pairs(self, pairs): """ def check_preprocessor(self): + """Initializes the preprocessor""" if _is_arraylike(self.preprocessor): self.preprocessor_ = ArrayIndexer(self.preprocessor) else: self.preprocessor_ = self.preprocessor - def initialize_and_check_inputs(self, X, y=None, - type_of_inputs='classic', + def initialize_and_check_inputs(self, X, y=None, type_of_inputs='classic', **kwargs): + """Initializes the preprocessor and processes inputs. See `check_input` + for more details. + + Parameters + ---------- + input: array-like + The input data array to check. + + y : array-like + The input labels array to check. + + type_of_inputs: `str` {'classic', 'tuples'} + The type of inputs to check. If 'classic', the input should be + a 2D array-like of points or a 1D array like of indicators of points. If + 'tuples', the input should be a 3D array-like of tuples or a 2D + array-like of indicators of tuples. + + **kwargs: dict + Arguments to pass to check_input. + + Returns + ------- + X : `numpy.ndarray` + The checked input data array. + + y: `numpy.ndarray` (optional) + The checked input labels array. 
+ """ self.check_preprocessor() return check_input(X, y, type_of_inputs=type_of_inputs, From 1ded46af3178c3f7f006d28548fcf5e138025cfa Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 13 Nov 2018 09:58:44 +0100 Subject: [PATCH 068/120] MAINT: Removing changes not related to this PR, and fixing previous probable unsuccessfull merge --- metric_learn/base_metric.py | 8 ++++---- metric_learn/covariance.py | 2 +- metric_learn/lfda.py | 4 +--- metric_learn/lmnn.py | 3 +-- metric_learn/lsml.py | 9 ++++----- metric_learn/mlkr.py | 1 - metric_learn/nca.py | 2 +- metric_learn/rca.py | 3 +-- metric_learn/sdml.py | 2 +- test/metric_learn_test.py | 6 +++--- test/test_base_metric.py | 1 - test/test_fit_transform.py | 6 +++--- test/test_sklearn_compat.py | 6 +++--- test/test_weakly_supervised.py | 34 +++++++++++++++++----------------- 14 files changed, 40 insertions(+), 47 deletions(-) diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index 9536c960..2f6954b4 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -118,7 +118,7 @@ class MahalanobisMixin(six.with_metaclass(ABCMeta, BaseMetricLearner, Attributes ---------- transformer_ : `numpy.ndarray`, shape=(num_dims, n_features) - The learned linear transformation ``L``. + The learned linear transformation ``L``. """ def score_pairs(self, pairs): @@ -294,13 +294,13 @@ def predict(self, quadruplets): estimator=self, t=self._t) # we broadcast with ... 
because here we allow quadruplets to be # either a 3D array of points or 2D array of indices - return (self.score_pairs(quadruplets[:, :2, ...]) - - self.score_pairs(quadruplets[:, 2:, ...])) + return np.sign(self.decision_function(quadruplets)) def decision_function(self, quadruplets): # no need to check_input since it is done in # predict->score_pairs - return self.predict(quadruplets) + return (self.score_pairs(quadruplets[:, :2, :]) - + self.score_pairs(quadruplets[:, 2:, :])) def score(self, quadruplets, y=None): """Computes score on input quadruplets diff --git a/metric_learn/covariance.py b/metric_learn/covariance.py index d827235a..ce40dcaa 100644 --- a/metric_learn/covariance.py +++ b/metric_learn/covariance.py @@ -10,9 +10,9 @@ from __future__ import absolute_import import numpy as np +from metric_learn._util import check_input from sklearn.base import TransformerMixin -from metric_learn._util import check_input from .base_metric import MahalanobisMixin diff --git a/metric_learn/lfda.py b/metric_learn/lfda.py index b8a02cdf..cea18882 100644 --- a/metric_learn/lfda.py +++ b/metric_learn/lfda.py @@ -16,9 +16,8 @@ import warnings from six.moves import xrange from sklearn.metrics import pairwise_distances -from sklearn.base import TransformerMixin - from metric_learn._util import check_input +from sklearn.base import TransformerMixin from .base_metric import MahalanobisMixin @@ -94,7 +93,6 @@ def fit(self, X, y): y : (n,) array-like Class labels, one per point of data. 
''' - X, y, num_classes, n, d, dim, k_ = self._process_inputs(X, y) tSb = np.zeros((d,d)) tSw = np.zeros((d,d)) diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py index a9f3c309..8135f2be 100644 --- a/metric_learn/lmnn.py +++ b/metric_learn/lmnn.py @@ -14,9 +14,9 @@ import warnings from collections import Counter from six.moves import xrange +from metric_learn._util import check_input from sklearn.metrics import euclidean_distances from sklearn.base import TransformerMixin -from metric_learn._util import check_input from .base_metric import MahalanobisMixin @@ -68,7 +68,6 @@ def _process_inputs(self, X, labels): ' (smallest class has %d)' % required_k) def fit(self, X, y): - k = self.k reg = self.regularization learn_rate = self.learn_rate diff --git a/metric_learn/lsml.py b/metric_learn/lsml.py index fdad616c..483316b9 100644 --- a/metric_learn/lsml.py +++ b/metric_learn/lsml.py @@ -149,12 +149,11 @@ def fit(self, quadruplets, weights=None): ---------- quadruplets : array-like, shape=(n_constraints, 4, n_features) or (n_constraints, 4) - 3D array with each row (element took from the first dimension) - corresponding to 4 points. In order to supervise the algorithm in the - right way, we should have the four samples ordered in a way such that: + 3D array-like of quadruplets of points or 2D array of quadruplets of + indicators. In order to supervise the algorithm in the right way, we + should have the four samples ordered in a way such that: d(pairs[i, 0],X[i, 1]) < d(X[i, 2], X[i, 3]) for all 0 <= i < - n_constraints. If the instance was created with a preprocessor, it can - also be fitted on 2D arrays of indices of quadruplets. + n_constraints. 
weights : (n_constraints,) array of floats, optional scale factor for each constraint diff --git a/metric_learn/mlkr.py b/metric_learn/mlkr.py index c6cee563..68095ce3 100644 --- a/metric_learn/mlkr.py +++ b/metric_learn/mlkr.py @@ -87,7 +87,6 @@ def fit(self, X, y): X : (n x d) array of samples y : (n) data labels """ - X, y, A = self._process_inputs(X, y) # note: this line takes (n*n*d) memory! diff --git a/metric_learn/nca.py b/metric_learn/nca.py index ae7876ea..635a5986 100644 --- a/metric_learn/nca.py +++ b/metric_learn/nca.py @@ -7,8 +7,8 @@ import numpy as np from six.moves import xrange from sklearn.base import TransformerMixin - from metric_learn._util import check_input + from .base_metric import MahalanobisMixin EPS = np.finfo(float).eps diff --git a/metric_learn/rca.py b/metric_learn/rca.py index 42d38555..38d1b408 100644 --- a/metric_learn/rca.py +++ b/metric_learn/rca.py @@ -17,8 +17,8 @@ from six.moves import xrange from sklearn import decomposition from sklearn.base import TransformerMixin - from metric_learn._util import check_input + from .base_metric import MahalanobisMixin from .constraints import Constraints @@ -109,7 +109,6 @@ def fit(self, data, chunks): When ``chunks[i] == -1``, point i doesn't belong to any chunklet. When ``chunks[i] == j``, point i belongs to chunklet j. 
""" - data, M_pca = self._process_data(data) chunks = np.asanyarray(chunks, dtype=int) diff --git a/metric_learn/sdml.py b/metric_learn/sdml.py index 83d51acc..a7011404 100644 --- a/metric_learn/sdml.py +++ b/metric_learn/sdml.py @@ -13,8 +13,8 @@ from sklearn.base import TransformerMixin from sklearn.covariance import graph_lasso from sklearn.utils.extmath import pinvh - from metric_learn._util import check_input + from .base_metric import MahalanobisMixin, _PairsClassifierMixin from .constraints import Constraints, wrap_pairs diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py index 53f698f0..7b3a83a8 100644 --- a/test/metric_learn_test.py +++ b/test/metric_learn_test.py @@ -5,9 +5,9 @@ from sklearn.datasets import load_iris from numpy.testing import assert_array_almost_equal -from metric_learn import (LMNN, NCA, LFDA, Covariance, MLKR, MMC, - LSML_Supervised, ITML_Supervised, SDML_Supervised, - RCA_Supervised, MMC_Supervised) +from metric_learn import ( + LMNN, NCA, LFDA, Covariance, MLKR, MMC, + LSML_Supervised, ITML_Supervised, SDML_Supervised, RCA_Supervised, MMC_Supervised) # Import this specially for testing. 
from metric_learn.constraints import wrap_pairs from metric_learn.lmnn import python_LMNN diff --git a/test/test_base_metric.py b/test/test_base_metric.py index 8964a2c3..d71bf760 100644 --- a/test/test_base_metric.py +++ b/test/test_base_metric.py @@ -11,7 +11,6 @@ def test_covariance(self): def test_lmnn(self): self.assertRegexpMatches( str(metric_learn.LMNN()), - r"(python_)?LMNN\(convergence_tol=0.001, k=3, learn_rate=1e-07, " r"max_iter=1000,\n min_iter=50, preprocessor=None, " r"regularization=0.5, use_pca=True,\n verbose=False\)") diff --git a/test/test_fit_transform.py b/test/test_fit_transform.py index 41a7dfd2..f898a0fe 100644 --- a/test/test_fit_transform.py +++ b/test/test_fit_transform.py @@ -3,9 +3,9 @@ from sklearn.datasets import load_iris from numpy.testing import assert_array_almost_equal -from metric_learn import (LMNN, NCA, LFDA, Covariance, MLKR, - LSML_Supervised, ITML_Supervised, SDML_Supervised, - RCA_Supervised, MMC_Supervised) +from metric_learn import ( + LMNN, NCA, LFDA, Covariance, MLKR, + LSML_Supervised, ITML_Supervised, SDML_Supervised, RCA_Supervised, MMC_Supervised) class TestFitTransform(unittest.TestCase): diff --git a/test/test_sklearn_compat.py b/test/test_sklearn_compat.py index cf857b25..f1e1a09d 100644 --- a/test/test_sklearn_compat.py +++ b/test/test_sklearn_compat.py @@ -2,9 +2,9 @@ import unittest from sklearn.utils.estimator_checks import check_estimator -from metric_learn import (LMNN, NCA, LFDA, Covariance, MLKR, - LSML_Supervised, ITML_Supervised, SDML_Supervised, - RCA_Supervised, MMC_Supervised) +from metric_learn import ( + LMNN, NCA, LFDA, Covariance, MLKR, + LSML_Supervised, ITML_Supervised, SDML_Supervised, RCA_Supervised, MMC_Supervised) # Wrap the _Supervised methods with a deterministic wrapper for testing. 
diff --git a/test/test_weakly_supervised.py b/test/test_weakly_supervised.py index 41b31807..fa0fab22 100644 --- a/test/test_weakly_supervised.py +++ b/test/test_weakly_supervised.py @@ -409,20 +409,20 @@ def test_dont_overwrite_parameters(estimator, build_dataset, preprocessor): def _get_args(function, varargs=False): - """Helper to get function arguments""" - - try: - params = signature(function).parameters - except ValueError: - # Error on builtin C function - return [] - args = [key for key, param in params.items() - if param.kind not in (param.VAR_POSITIONAL, param.VAR_KEYWORD)] - if varargs: - varargs = [param.name for param in params.values() - if param.kind == param.VAR_POSITIONAL] - if len(varargs) == 0: - varargs = None - return args, varargs - else: - return args + """Helper to get function arguments""" + + try: + params = signature(function).parameters + except ValueError: + # Error on builtin C function + return [] + args = [key for key, param in params.items() + if param.kind not in (param.VAR_POSITIONAL, param.VAR_KEYWORD)] + if varargs: + varargs = [param.name for param in params.values() + if param.kind == param.VAR_POSITIONAL] + if len(varargs) == 0: + varargs = None + return args, varargs + else: + return args From 52a1aeca72bf9d610b68447129b36c4f7455b97b Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 13 Nov 2018 10:11:45 +0100 Subject: [PATCH 069/120] STY: Fix PEP8 errors --- metric_learn/covariance.py | 1 - metric_learn/itml.py | 2 +- metric_learn/lfda.py | 1 - metric_learn/lmnn.py | 1 - metric_learn/lsml.py | 7 +++---- metric_learn/mlkr.py | 1 - metric_learn/mmc.py | 2 +- test/metric_learn_test.py | 8 ++++---- 8 files changed, 9 insertions(+), 14 deletions(-) diff --git a/metric_learn/covariance.py b/metric_learn/covariance.py index ce40dcaa..1292ec18 100644 --- a/metric_learn/covariance.py +++ b/metric_learn/covariance.py @@ -10,7 +10,6 @@ from __future__ import absolute_import import numpy as np -from metric_learn._util import 
check_input from sklearn.base import TransformerMixin from .base_metric import MahalanobisMixin diff --git a/metric_learn/itml.py b/metric_learn/itml.py index 032665eb..d61f40e5 100644 --- a/metric_learn/itml.py +++ b/metric_learn/itml.py @@ -21,7 +21,7 @@ from sklearn.base import TransformerMixin from .base_metric import _PairsClassifierMixin, MahalanobisMixin from .constraints import Constraints, wrap_pairs -from ._util import vector_norm, check_input +from ._util import vector_norm class _BaseITML(MahalanobisMixin): diff --git a/metric_learn/lfda.py b/metric_learn/lfda.py index cea18882..6696f6aa 100644 --- a/metric_learn/lfda.py +++ b/metric_learn/lfda.py @@ -16,7 +16,6 @@ import warnings from six.moves import xrange from sklearn.metrics import pairwise_distances -from metric_learn._util import check_input from sklearn.base import TransformerMixin from .base_metric import MahalanobisMixin diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py index 8135f2be..1d9c1ebd 100644 --- a/metric_learn/lmnn.py +++ b/metric_learn/lmnn.py @@ -14,7 +14,6 @@ import warnings from collections import Counter from six.moves import xrange -from metric_learn._util import check_input from sklearn.metrics import euclidean_distances from sklearn.base import TransformerMixin from .base_metric import MahalanobisMixin diff --git a/metric_learn/lsml.py b/metric_learn/lsml.py index 483316b9..8c351b71 100644 --- a/metric_learn/lsml.py +++ b/metric_learn/lsml.py @@ -12,7 +12,6 @@ import scipy.linalg from six.moves import xrange from sklearn.base import TransformerMixin -from ._util import check_input from .base_metric import _QuadrupletsClassifierMixin, MahalanobisMixin from .constraints import Constraints @@ -149,9 +148,9 @@ def fit(self, quadruplets, weights=None): ---------- quadruplets : array-like, shape=(n_constraints, 4, n_features) or (n_constraints, 4) - 3D array-like of quadruplets of points or 2D array of quadruplets of - indicators. 
In order to supervise the algorithm in the right way, we - should have the four samples ordered in a way such that: + 3D array-like of quadruplets of points or 2D array of quadruplets of + indicators. In order to supervise the algorithm in the right way, we + should have the four samples ordered in a way such that: d(pairs[i, 0],X[i, 1]) < d(X[i, 2], X[i, 3]) for all 0 <= i < n_constraints. weights : (n_constraints,) array of floats, optional diff --git a/metric_learn/mlkr.py b/metric_learn/mlkr.py index 68095ce3..06d30135 100644 --- a/metric_learn/mlkr.py +++ b/metric_learn/mlkr.py @@ -14,7 +14,6 @@ from sklearn.decomposition import PCA -from metric_learn._util import check_input from .base_metric import MahalanobisMixin EPS = np.finfo(float).eps diff --git a/metric_learn/mmc.py b/metric_learn/mmc.py index b923335e..dea6d9e1 100644 --- a/metric_learn/mmc.py +++ b/metric_learn/mmc.py @@ -24,7 +24,7 @@ from .base_metric import _PairsClassifierMixin, MahalanobisMixin from .constraints import Constraints, wrap_pairs -from ._util import vector_norm, check_input +from ._util import vector_norm class _BaseMMC(MahalanobisMixin): diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py index 7b3a83a8..d7a1d935 100644 --- a/test/metric_learn_test.py +++ b/test/metric_learn_test.py @@ -158,10 +158,10 @@ def test_iris(self): # Full metric mmc = MMC(convergence_threshold=0.01) mmc.fit(*wrap_pairs(self.iris_points, [a,b,c,d])) - expected = [[ 0.000514, 0.000868, -0.001195, -0.001703], - [ 0.000868, 0.001468, -0.002021, -0.002879], - [-0.001195, -0.002021, 0.002782, 0.003964], - [-0.001703, -0.002879, 0.003964, 0.005648]] + expected = [[+0.000514, +0.000868, -0.001195, -0.001703], + [+0.000868, +0.001468, -0.002021, -0.002879], + [-0.001195, -0.002021, +0.002782, +0.003964], + [-0.001703, -0.002879, +0.003964, +0.005648]] assert_array_almost_equal(expected, mmc.metric(), decimal=6) # Diagonal metric From 072e834a7d985b6bf7706fc9db1db57d834aaad0 Mon Sep 17 00:00:00 2001 
From: William de Vazelhes Date: Tue, 13 Nov 2018 10:38:01 +0100 Subject: [PATCH 070/120] STY: fix indent problems --- metric_learn/_util.py | 82 +++++++++++++++++----------------- metric_learn/base_metric.py | 56 +++++++++++------------ test/test_utils.py | 26 +++++------ test/test_weakly_supervised.py | 74 +++++++++++++++--------------- 4 files changed, 119 insertions(+), 119 deletions(-) diff --git a/metric_learn/_util.py b/metric_learn/_util.py index 151f4226..28cc402d 100644 --- a/metric_learn/_util.py +++ b/metric_learn/_util.py @@ -53,46 +53,46 @@ def check_input(input, y=None, preprocessor=None, The number of elements in a tuple (e.g. 2 for pairs). dtype : string, type, list of types or None (default="auto") - Data type of result. If None, the dtype of the input is preserved. - If "numeric", dtype is preserved unless array.dtype is object. - If dtype is a list of types, conversion on the first type is only - performed if the dtype of the input is not in the list. If - "auto", will we be set to "numeric" if `preprocessor=True`, - else to None. + Data type of result. If None, the dtype of the input is preserved. + If "numeric", dtype is preserved unless array.dtype is object. + If dtype is a list of types, conversion on the first type is only + performed if the dtype of the input is not in the list. If + "auto", will we be set to "numeric" if `preprocessor=True`, + else to None. order : 'F', 'C' or None (default=None) - Whether an array will be forced to be fortran or c-style. + Whether an array will be forced to be fortran or c-style. copy : boolean (default=False) - Whether a forced copy will be triggered. If copy=False, a copy might - be triggered by a conversion. + Whether a forced copy will be triggered. If copy=False, a copy might + be triggered by a conversion. force_all_finite : boolean or 'allow-nan', (default=True) - Whether to raise an error on np.inf and np.nan in X. This parameter - does not influence whether y can have np.inf or np.nan values. 
- The possibilities are: - - True: Force all values of X to be finite. + Whether to raise an error on np.inf and np.nan in X. This parameter + does not influence whether y can have np.inf or np.nan values. + The possibilities are: + - True: Force all values of X to be finite. - False: accept both np.inf and np.nan in X. - 'allow-nan': accept only np.nan values in X. Values cannot be infinite. ensure_min_samples : int (default=1) - Make sure that X has a minimum number of samples in its first - axis (rows for a 2D array). + Make sure that X has a minimum number of samples in its first + axis (rows for a 2D array). ensure_min_features : int (default=1) - Make sure that the 2D array has some minimum number of features - (columns). The default value of 1 rejects empty datasets. - This check is only enforced when X has effectively 2 dimensions or - is originally 1D and ``ensure_2d`` is True. Setting to 0 disables - this check. + Make sure that the 2D array has some minimum number of features + (columns). The default value of 1 rejects empty datasets. + This check is only enforced when X has effectively 2 dimensions or + is originally 1D and ``ensure_2d`` is True. Setting to 0 disables + this check. warn_on_dtype : boolean (default=False) - Raise DataConversionWarning if the dtype of the input data structure - does not match the requested dtype, causing a memory copy. + Raise DataConversionWarning if the dtype of the input data structure + does not match the requested dtype, causing a memory copy. estimator : str or estimator instance (default=None) - If passed, include the name of the estimator in warning messages. + If passed, include the name of the estimator in warning messages. 
Returns ------- @@ -128,13 +128,13 @@ def check_input(input, y=None, preprocessor=None, if type_of_inputs == 'classic': if input.ndim == 1: - if preprocessor is not None: - input = preprocess_points(input, preprocessor) - preprocessor_has_been_applied = True - else: - make_error_input(101, input, context) + if preprocessor is not None: + input = preprocess_points(input, preprocessor) + preprocessor_has_been_applied = True + else: + make_error_input(101, input, context) elif input.ndim == 2: - pass # OK + pass # OK else: if preprocessor is not None: make_error_input(320, input, context) @@ -153,13 +153,13 @@ def check_input(input, y=None, preprocessor=None, elif type_of_inputs == 'tuples': if input.ndim == 2: if preprocessor is not None: - input = preprocess_tuples(input, preprocessor) - preprocessor_has_been_applied = True + input = preprocess_tuples(input, preprocessor) + preprocessor_has_been_applied = True else: - make_error_input(201, input, context) + make_error_input(201, input, context) elif input.ndim == 3: # we should check_num_features which is not checked # after - pass + pass else: if preprocessor is not None: make_error_input(420, input, context) @@ -171,18 +171,18 @@ def check_input(input, y=None, preprocessor=None, if ensure_min_features > 0: n_features = input.shape[2] if n_features < ensure_min_features: - raise ValueError("Found array with {} feature(s) (shape={}) while" - " a minimum of {} is required{}." - .format(n_features, input.shape, - ensure_min_features, context)) + raise ValueError("Found array with {} feature(s) (shape={}) while" + " a minimum of {} is required{}." 
+ .format(n_features, input.shape, + ensure_min_features, context)) # normally we don't need to check_t too because t should'nt be able to # be modified by any preprocessor if input.ndim != 3: # we have to ensure this because check_array above # does not - if preprocessor_has_been_applied: - make_error_input(211, input, context) - else: - make_error_input(201, input, context) + if preprocessor_has_been_applied: + make_error_input(211, input, context) + else: + make_error_input(201, input, context) check_t(input, t, context) return input if y is None else (input, y) diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index 2f6954b4..2120704f 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -16,26 +16,26 @@ def __init__(self, preprocessor=None): Parameters ---------- preprocessor : array-like, shape=(n_samples, n_features) or callable - The preprocessor to call to get tuples from indices. If array-like, - tuples will be gotten like this: X[indices]. + The preprocessor to call to get tuples from indices. If array-like, + tuples will be gotten like this: X[indices]. """ self.preprocessor = preprocessor @abstractmethod def score_pairs(self, pairs): - """Returns the score between pairs - (can be a similarity, or a distance/metric depending on the algorithm) + """Returns the score between pairs + (can be a similarity, or a distance/metric depending on the algorithm) - Parameters - ---------- - pairs : `numpy.ndarray`, shape=(n_samples, 2, n_features) - 3D array of pairs. + Parameters + ---------- + pairs : `numpy.ndarray`, shape=(n_samples, 2, n_features) + 3D array of pairs. - Returns - ------- - scores: `numpy.ndarray` of shape=(n_pairs,) - The score of every pair. - """ + Returns + ------- + scores: `numpy.ndarray` of shape=(n_pairs,) + The score of every pair. 
+ """ def check_preprocessor(self): """Initializes the preprocessor""" @@ -135,9 +135,9 @@ def score_pairs(self, pairs): Parameters ---------- pairs : array-like, shape=(n_pairs, 2, n_features) or (n_pairs, 2) - 3D Array of pairs to score, with each row corresponding to two points, - or 2D array of indices of pairs if the metric learner uses a - preprocessor. + 3D Array of pairs to score, with each row corresponding to two points, + or 2D array of indices of pairs if the metric learner uses a + preprocessor. Returns ------- @@ -217,9 +217,9 @@ def predict(self, pairs): Parameters ---------- pairs: array-like, shape=(n_pairs, 2, n_features) or (n_pairs, 2) - 3D Array of pairs to predict, with each row corresponding to two - points, or 2D array of indices of pairs if the metric learner uses a - preprocessor. + 3D Array of pairs to predict, with each row corresponding to two + points, or 2D array of indices of pairs if the metric learner uses a + preprocessor. Returns ------- @@ -249,9 +249,9 @@ def score(self, pairs, y): Parameters ---------- pairs: array-like, shape=(n_pairs, 2, n_features) or (n_pairs, 2) - 3D Array of pairs, with each row corresponding to two points, - or 2D array of indices of pairs if the metric learner uses a - preprocessor. + 3D Array of pairs, with each row corresponding to two points, + or 2D array of indices of pairs if the metric learner uses a + preprocessor. y : array-like, shape=(n_constraints,) The corresponding labels. @@ -280,9 +280,9 @@ def predict(self, quadruplets): ---------- quadruplets : array-like, shape=(n_quadruplets, 4, n_features) or (n_quadruplets, 4) - 3D Array of quadruplets to predict, with each row corresponding to four - points, or 2D array of indices of quadruplets if the metric learner - uses a preprocessor. + 3D Array of quadruplets to predict, with each row corresponding to four + points, or 2D array of indices of quadruplets if the metric learner + uses a preprocessor.
Returns ------- @@ -313,9 +313,9 @@ def score(self, quadruplets, y=None): ---------- quadruplets : array-like, shape=(n_quadruplets, 4, n_features) or (n_quadruplets, 4) - 3D Array of quadruplets to score, with each row corresponding to four - points, or 2D array of indices of quadruplets if the metric learner - uses a preprocessor. + 3D Array of quadruplets to score, with each row corresponding to four + points, or 2D array of indices of quadruplets if the metric learner + uses a preprocessor. y : Ignored, for scikit-learn compatibility. diff --git a/test/test_utils.py b/test/test_utils.py index d3c5ea67..431c92be 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -297,17 +297,17 @@ def test_check_input_invalid_complex_data(): @pytest.fixture def points_prep(): - """Basic array for testing when using a preprocessor""" - points = np.array([1, 2]) - return points + """Basic array for testing when using a preprocessor""" + points = np.array([1, 2]) + return points @pytest.fixture def points_no_prep(): - """Basic array for testing when using no preprocessor""" - points = np.array([[1., 2.3], - [2.3, 4.3]]) - return points + """Basic array for testing when using no preprocessor""" + points = np.array([[1., 2.3], + [2.3, 4.3]]) + return points @pytest.mark.parametrize('estimator, context', @@ -556,14 +556,14 @@ def mock_id_loader(list_of_indicators): @pytest.fixture def y_tuples(): - y = [-1, 1] - return y + y = [-1, 1] + return y @pytest.fixture def y_points(): - y = [0, 1, 0, 0] - return y + y = [0, 1, 0, 0] + return y @pytest.mark.parametrize('preprocessor, tuples', zip(preprocessors, @@ -621,8 +621,8 @@ def preprocessor(sequence): return np.ones((len(sequence), 2, 2)) # returns a 3D array instead of 2D with pytest.raises(ValueError) as raised_error: - check_input(np.ones((3,)), type_of_inputs='classic', - preprocessor=preprocessor, estimator=estimator) + check_input(np.ones((3,)), type_of_inputs='classic', + preprocessor=preprocessor, estimator=estimator) 
expected_msg = ("2D array of formed points expected{}. " "Found 3D array instead:\ninput={}. Reshape your data{}.\n" .format(context, np.ones((3, 2, 2)), diff --git a/test/test_weakly_supervised.py b/test/test_weakly_supervised.py index fa0fab22..850e6bcc 100644 --- a/test/test_weakly_supervised.py +++ b/test/test_weakly_supervised.py @@ -41,20 +41,20 @@ def build_data(): def build_classification(preprocessor): - # test that you can do cross validation on tuples of points with - # a WeaklySupervisedMetricLearner - X, y = shuffle(*make_blobs(), random_state=RNG) - X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RNG) - return (X, X, y, X_train, X_test, y_train, y_test, preprocessor) + # test that you can do cross validation on tuples of points with + # a WeaklySupervisedMetricLearner + X, y = shuffle(*make_blobs(), random_state=RNG) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RNG) + return (X, X, y, X_train, X_test, y_train, y_test, preprocessor) def build_regression(preprocessor): - # test that you can do cross validation on tuples of points with - # a WeaklySupervisedMetricLearner - X, y = shuffle(*make_regression(n_samples=100, n_features=10), - random_state=RNG) - X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RNG) - return (X, X, y, X_train, X_test, y_train, y_test, preprocessor) + # test that you can do cross validation on tuples of points with + # a WeaklySupervisedMetricLearner + X, y = shuffle(*make_regression(n_samples=100, n_features=10), + random_state=RNG) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RNG) + return (X, X, y, X_train, X_test, y_train, y_test, preprocessor) def build_pairs(preprocessor): @@ -175,32 +175,32 @@ def test_simple_estimator(estimator, build_dataset, preprocessor): ids=ids_estimators) @pytest.mark.parametrize('preprocessor', [None, build_data()[0]]) def test_no_attributes_set_in_init(estimator, preprocessor): - """Check setting 
during init. Adapted from scikit-learn.""" - estimator = clone(estimator) - estimator.set_params(preprocessor=preprocessor) - if hasattr(type(estimator).__init__, "deprecated_original"): - return - - init_params = _get_args(type(estimator).__init__) - parents_init_params = [param for params_parent in - (_get_args(parent) for parent in - type(estimator).__mro__) - for param in params_parent] - - # Test for no setting apart from parameters during init - invalid_attr = (set(vars(estimator)) - set(init_params) - - set(parents_init_params)) - assert not invalid_attr, \ - ("Estimator %s should not set any attribute apart" - " from parameters during init. Found attributes %s." - % (type(estimator).__name__, sorted(invalid_attr))) - # Ensure that each parameter is set in init - invalid_attr = (set(init_params) - set(vars(estimator)) - - set(["self"])) - assert not invalid_attr, \ - ("Estimator %s should store all parameters" - " as an attribute during init. Did not find " - "attributes %s." % (type(estimator).__name__, sorted(invalid_attr))) + """Check setting during init. Adapted from scikit-learn.""" + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + if hasattr(type(estimator).__init__, "deprecated_original"): + return + + init_params = _get_args(type(estimator).__init__) + parents_init_params = [param for params_parent in + (_get_args(parent) for parent in + type(estimator).__mro__) + for param in params_parent] + + # Test for no setting apart from parameters during init + invalid_attr = (set(vars(estimator)) - set(init_params) - + set(parents_init_params)) + assert not invalid_attr, \ + ("Estimator %s should not set any attribute apart" + " from parameters during init. Found attributes %s." 
+ % (type(estimator).__name__, sorted(invalid_attr))) + # Ensure that each parameter is set in init + invalid_attr = (set(init_params) - set(vars(estimator)) - + set(["self"])) + assert not invalid_attr, \ + ("Estimator %s should store all parameters" + " as an attribute during init. Did not find " + "attributes %s." % (type(estimator).__name__, sorted(invalid_attr))) @pytest.mark.parametrize('preprocessor', [None, build_data()[0]]) From 80929e215ed58f46f7ac633e17ea256c86b9de09 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 13 Nov 2018 10:49:50 +0100 Subject: [PATCH 071/120] Fixing docstring spaces --- metric_learn/_util.py | 8 ++++---- metric_learn/lmnn.py | 2 +- metric_learn/mlkr.py | 8 ++++---- metric_learn/rca.py | 4 ++-- metric_learn/sdml.py | 4 ++-- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/metric_learn/_util.py b/metric_learn/_util.py index 28cc402d..326719a8 100644 --- a/metric_learn/_util.py +++ b/metric_learn/_util.py @@ -30,22 +30,22 @@ def check_input(input, y=None, preprocessor=None, Parameters ---------- - input: array-like + input : array-like The input data array to check. y : array-like The input labels array to check. - preprocessor: callable (default=None) + preprocessor : callable (default=None) The preprocessor to use. If None, no preprocessor is used. - type_of_inputs: `str` {'classic', 'tuples'} + type_of_inputs : `str` {'classic', 'tuples'} The type of inputs to check. If 'classic', the input should be a 2D array-like of points or a 1D array like of indicators of points. If 'tuples', the input should be a 3D array-like of tuples or a 2D array-like of indicators of tuples. - accept_sparse: `bool` + accept_sparse : `bool` Set to true to allow sparse inputs (only works for sparse inputs with dim < 3). 
diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py index 1d9c1ebd..d4ef8cdf 100644 --- a/metric_learn/lmnn.py +++ b/metric_learn/lmnn.py @@ -31,7 +31,7 @@ def __init__(self, k=3, min_iter=50, max_iter=1000, learn_rate=1e-7, k : int, optional Number of neighbors to consider, not including self-edges. - regularization: float, optional + regularization : float, optional Weighting of pull and push terms, with 0.5 meaning equal weight. """ self.k = k diff --git a/metric_learn/mlkr.py b/metric_learn/mlkr.py index 06d30135..a2c9ece5 100644 --- a/metric_learn/mlkr.py +++ b/metric_learn/mlkr.py @@ -38,16 +38,16 @@ def __init__(self, num_dims=None, A0=None, epsilon=0.01, alpha=0.0001, num_dims : int, optional Dimensionality of reduced space (defaults to dimension of X) - A0: array-like, optional + A0 : array-like, optional Initialization of transformation matrix. Defaults to PCA loadings. - epsilon: float, optional + epsilon : float, optional Step size for congujate gradient descent. - alpha: float, optional + alpha : float, optional Stopping criterion for congujate gradient descent. - max_iter: int, optional + max_iter : int, optional Cap on number of congugate gradient iterations. 
""" self.num_dims = num_dims diff --git a/metric_learn/rca.py b/metric_learn/rca.py index 38d1b408..27d70004 100644 --- a/metric_learn/rca.py +++ b/metric_learn/rca.py @@ -158,8 +158,8 @@ def __init__(self, num_dims=None, pca_comps=None, num_chunks=100, ---------- num_dims : int, optional embedding dimension (default: original dimension of data) - num_chunks: int, optional - chunk_size: int, optional + num_chunks : int, optional + chunk_size : int, optional """ RCA.__init__(self, num_dims=num_dims, pca_comps=pca_comps, preprocessor=preprocessor) diff --git a/metric_learn/sdml.py b/metric_learn/sdml.py index a7011404..054f4bff 100644 --- a/metric_learn/sdml.py +++ b/metric_learn/sdml.py @@ -90,12 +90,12 @@ def fit(self, pairs, y): Parameters ---------- - pairs: array-like, shape=(n_constraints, 2, n_features) or + pairs : array-like, shape=(n_constraints, 2, n_features) or (n_constraints, 2) 3D Array of pairs with each row corresponding to two points, or 2D array of indices of pairs if the metric learner uses a preprocessor. - y: array-like, of shape (n_constraints,) + y : array-like, of shape (n_constraints,) Labels of constraints. Should be -1 for dissimilar pair, 1 for similar. 
Returns From 40a017243769be719f7d1aec894c4c6a967ca4cf Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 13 Nov 2018 10:54:42 +0100 Subject: [PATCH 072/120] DOC: add preprocessor docstring when missing --- metric_learn/itml.py | 3 +++ metric_learn/lfda.py | 4 ++++ metric_learn/lmnn.py | 4 ++++ metric_learn/lsml.py | 3 +++ metric_learn/mlkr.py | 4 ++++ metric_learn/mmc.py | 3 +++ metric_learn/rca.py | 7 +++++++ metric_learn/sdml.py | 3 +++ 8 files changed, 31 insertions(+) diff --git a/metric_learn/itml.py b/metric_learn/itml.py index d61f40e5..7322c6ab 100644 --- a/metric_learn/itml.py +++ b/metric_learn/itml.py @@ -205,6 +205,9 @@ def __init__(self, gamma=1., max_iter=1000, convergence_threshold=1e-3, initial regularization matrix, defaults to identity verbose : bool, optional if True, prints information while learning + preprocessor : array-like, shape=(n_samples, n_features) or callable + The preprocessor to call to get tuples from indices. If array-like, + tuples will be formed like this: X[indices]. """ _BaseITML.__init__(self, gamma=gamma, max_iter=max_iter, convergence_threshold=convergence_threshold, diff --git a/metric_learn/lfda.py b/metric_learn/lfda.py index 6696f6aa..fdafcccc 100644 --- a/metric_learn/lfda.py +++ b/metric_learn/lfda.py @@ -50,6 +50,10 @@ def __init__(self, num_dims=None, k=None, embedding_type='weighted', 'weighted' - weighted eigenvectors 'orthonormalized' - orthonormalized 'plain' - raw eigenvectors + + preprocessor : array-like, shape=(n_samples, n_features) or callable + The preprocessor to call to get tuples from indices. If array-like, + tuples will be formed like this: X[indices]. 
''' if embedding_type not in ('weighted', 'orthonormalized', 'plain'): raise ValueError('Invalid embedding_type: %r' % embedding_type) diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py index d4ef8cdf..e13f402f 100644 --- a/metric_learn/lmnn.py +++ b/metric_learn/lmnn.py @@ -33,6 +33,10 @@ def __init__(self, k=3, min_iter=50, max_iter=1000, learn_rate=1e-7, regularization : float, optional Weighting of pull and push terms, with 0.5 meaning equal weight. + + preprocessor : array-like, shape=(n_samples, n_features) or callable + The preprocessor to call to get tuples from indices. If array-like, + tuples will be formed like this: X[indices]. """ self.k = k self.min_iter = min_iter diff --git a/metric_learn/lsml.py b/metric_learn/lsml.py index 8c351b71..a46bf262 100644 --- a/metric_learn/lsml.py +++ b/metric_learn/lsml.py @@ -193,6 +193,9 @@ def __init__(self, tol=1e-3, max_iter=1000, prior=None, num_labeled=np.inf, scale factor for each constraint verbose : bool, optional if True, prints information while learning + preprocessor : array-like, shape=(n_samples, n_features) or callable + The preprocessor to call to get tuples from indices. If array-like, + tuples will be formed like this: X[indices]. """ _BaseLSML.__init__(self, tol=tol, max_iter=max_iter, prior=prior, verbose=verbose, preprocessor=preprocessor) diff --git a/metric_learn/mlkr.py b/metric_learn/mlkr.py index a2c9ece5..adc26281 100644 --- a/metric_learn/mlkr.py +++ b/metric_learn/mlkr.py @@ -49,6 +49,10 @@ def __init__(self, num_dims=None, A0=None, epsilon=0.01, alpha=0.0001, max_iter : int, optional Cap on number of congugate gradient iterations. + + preprocessor : array-like, shape=(n_samples, n_features) or callable + The preprocessor to call to get tuples from indices. If array-like, + tuples will be formed like this: X[indices]. 
""" self.num_dims = num_dims self.A0 = A0 diff --git a/metric_learn/mmc.py b/metric_learn/mmc.py index dea6d9e1..ea958306 100644 --- a/metric_learn/mmc.py +++ b/metric_learn/mmc.py @@ -430,6 +430,9 @@ def __init__(self, max_iter=100, max_proj=10000, convergence_threshold=1e-6, metric learning verbose : bool, optional if True, prints information while learning + preprocessor : array-like, shape=(n_samples, n_features) or callable + The preprocessor to call to get tuples from indices. If array-like, + tuples will be formed like this: X[indices]. """ _BaseMMC.__init__(self, max_iter=max_iter, max_proj=max_proj, convergence_threshold=convergence_threshold, diff --git a/metric_learn/rca.py b/metric_learn/rca.py index 27d70004..400009b1 100644 --- a/metric_learn/rca.py +++ b/metric_learn/rca.py @@ -59,6 +59,10 @@ def __init__(self, num_dims=None, pca_comps=None, preprocessor=None): If ``0 < pca_comps < 1``, it is used as the minimum explained variance ratio. See sklearn.decomposition.PCA for more details. + + preprocessor : array-like, shape=(n_samples, n_features) or callable + The preprocessor to call to get tuples from indices. If array-like, + tuples will be formed like this: X[indices]. """ self.num_dims = num_dims self.pca_comps = pca_comps @@ -160,6 +164,9 @@ def __init__(self, num_dims=None, pca_comps=None, num_chunks=100, embedding dimension (default: original dimension of data) num_chunks : int, optional chunk_size : int, optional + preprocessor : array-like, shape=(n_samples, n_features) or callable + The preprocessor to call to get tuples from indices. If array-like, + tuples will be formed like this: X[indices]. 
""" RCA.__init__(self, num_dims=num_dims, pca_comps=pca_comps, preprocessor=preprocessor) diff --git a/metric_learn/sdml.py b/metric_learn/sdml.py index 054f4bff..ba8b0d5d 100644 --- a/metric_learn/sdml.py +++ b/metric_learn/sdml.py @@ -134,6 +134,9 @@ def __init__(self, balance_param=0.5, sparsity_param=0.01, use_cov=True, number of constraints to generate verbose : bool, optional if True, prints information while learning + preprocessor : array-like, shape=(n_samples, n_features) or callable + The preprocessor to call to get tuples from indices. If array-like, + tuples will be formed like this: X[indices]. """ _BaseSDML.__init__(self, balance_param=balance_param, sparsity_param=sparsity_param, use_cov=use_cov, From 5a3af89ea9d8d2dfa28b6e0c7d5987105c428373 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 13 Nov 2018 11:03:08 +0100 Subject: [PATCH 073/120] STY: PEP8 fixes --- metric_learn/nca.py | 1 - metric_learn/rca.py | 1 - metric_learn/sdml.py | 1 - test/test_weakly_supervised.py | 2 +- 4 files changed, 1 insertion(+), 4 deletions(-) diff --git a/metric_learn/nca.py b/metric_learn/nca.py index 635a5986..2627f778 100644 --- a/metric_learn/nca.py +++ b/metric_learn/nca.py @@ -7,7 +7,6 @@ import numpy as np from six.moves import xrange from sklearn.base import TransformerMixin -from metric_learn._util import check_input from .base_metric import MahalanobisMixin diff --git a/metric_learn/rca.py b/metric_learn/rca.py index 400009b1..05aabde1 100644 --- a/metric_learn/rca.py +++ b/metric_learn/rca.py @@ -17,7 +17,6 @@ from six.moves import xrange from sklearn import decomposition from sklearn.base import TransformerMixin -from metric_learn._util import check_input from .base_metric import MahalanobisMixin from .constraints import Constraints diff --git a/metric_learn/sdml.py b/metric_learn/sdml.py index ba8b0d5d..7c267823 100644 --- a/metric_learn/sdml.py +++ b/metric_learn/sdml.py @@ -13,7 +13,6 @@ from sklearn.base import TransformerMixin from 
sklearn.covariance import graph_lasso from sklearn.utils.extmath import pinvh -from metric_learn._util import check_input from .base_metric import MahalanobisMixin, _PairsClassifierMixin from .constraints import Constraints, wrap_pairs diff --git a/test/test_weakly_supervised.py b/test/test_weakly_supervised.py index 850e6bcc..3e6d9adf 100644 --- a/test/test_weakly_supervised.py +++ b/test/test_weakly_supervised.py @@ -52,7 +52,7 @@ def build_regression(preprocessor): # test that you can do cross validation on tuples of points with # a WeaklySupervisedMetricLearner X, y = shuffle(*make_regression(n_samples=100, n_features=10), - random_state=RNG) + random_state=RNG) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RNG) return (X, X, y, X_train, X_test, y_train, y_test, preprocessor) From 968e36ed9c5eda7e7dd243680728bb3e50aef890 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 13 Nov 2018 11:06:32 +0100 Subject: [PATCH 074/120] MAINT: refactor the global check function into _prepare_input --- metric_learn/base_metric.py | 4 ++-- metric_learn/covariance.py | 2 +- metric_learn/itml.py | 6 +++--- metric_learn/lfda.py | 2 +- metric_learn/lmnn.py | 6 +++--- metric_learn/lsml.py | 6 +++--- metric_learn/mlkr.py | 2 +- metric_learn/mmc.py | 6 +++--- metric_learn/nca.py | 2 +- metric_learn/rca.py | 4 ++-- metric_learn/sdml.py | 6 +++--- 11 files changed, 23 insertions(+), 23 deletions(-) diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index 2120704f..82b0f2af 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -44,8 +44,8 @@ def check_preprocessor(self): else: self.preprocessor_ = self.preprocessor - def initialize_and_check_inputs(self, X, y=None, type_of_inputs='classic', - **kwargs): + def _prepare_inputs(self, X, y=None, type_of_inputs='classic', + **kwargs): """Initializes the preprocessor and processes inputs. See `check_input` for more details. 
diff --git a/metric_learn/covariance.py b/metric_learn/covariance.py index 1292ec18..a828feb6 100644 --- a/metric_learn/covariance.py +++ b/metric_learn/covariance.py @@ -33,7 +33,7 @@ def fit(self, X, y=None): X : data matrix, (n x d) y : unused """ - self.X_ = self.initialize_and_check_inputs(X, ensure_min_samples=2) + self.X_ = self._prepare_inputs(X, ensure_min_samples=2) self.M_ = np.cov(self.X_, rowvar = False) if self.M_.ndim == 0: self.M_ = 1./self.M_ diff --git a/metric_learn/itml.py b/metric_learn/itml.py index 7322c6ab..990b3c78 100644 --- a/metric_learn/itml.py +++ b/metric_learn/itml.py @@ -60,8 +60,8 @@ def __init__(self, gamma=1., max_iter=1000, convergence_threshold=1e-3, super(_BaseITML, self).__init__(preprocessor) def _process_pairs(self, pairs, y, bounds): - pairs, y = self.initialize_and_check_inputs(pairs, y, - type_of_inputs='tuples') + pairs, y = self._prepare_inputs(pairs, y, + type_of_inputs='tuples') # check to make sure that no two constrained vectors are identical pos_pairs, neg_pairs = pairs[y == 1], pairs[y == -1] @@ -231,7 +231,7 @@ def fit(self, X, y, random_state=np.random): random_state : numpy.random.RandomState, optional If provided, controls random number generation. 
""" - X, y = self.initialize_and_check_inputs(X, y, ensure_min_samples=2) + X, y = self._prepare_inputs(X, y, ensure_min_samples=2) num_constraints = self.num_constraints if num_constraints is None: num_classes = len(np.unique(y)) diff --git a/metric_learn/lfda.py b/metric_learn/lfda.py index fdafcccc..4e485f39 100644 --- a/metric_learn/lfda.py +++ b/metric_learn/lfda.py @@ -64,7 +64,7 @@ def __init__(self, num_dims=None, k=None, embedding_type='weighted', def _process_inputs(self, X, y): unique_classes, y = np.unique(y, return_inverse=True) - self.X_, y = self.initialize_and_check_inputs(X, y) + self.X_, y = self._prepare_inputs(X, y) n, d = self.X_.shape num_classes = len(unique_classes) diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py index e13f402f..26920206 100644 --- a/metric_learn/lmnn.py +++ b/metric_learn/lmnn.py @@ -53,8 +53,8 @@ def __init__(self, k=3, min_iter=50, max_iter=1000, learn_rate=1e-7, class python_LMNN(_base_LMNN): def _process_inputs(self, X, labels): - self.X_, labels = self.initialize_and_check_inputs(X, labels, - ensure_min_samples=2) + self.X_, labels = self._prepare_inputs(X, labels, + ensure_min_samples=2) self.X_ = self.X_.astype(float) # todo: remove the conversion here and # integrate it into check_input num_pts, num_dims = self.X_.shape @@ -259,7 +259,7 @@ class LMNN(_base_LMNN): """ def fit(self, X, y): - self.X_, y = self.initialize_and_check_inputs(X, y, dtype=float) + self.X_, y = self._prepare_inputs(X, y, dtype=float) self.X_ = self.X_, preprocessor=self.preprocessor_ labels = MulticlassLabels(y) self._lmnn = shogun_LMNN(RealFeatures(self.X_.T), labels, self.k) diff --git a/metric_learn/lsml.py b/metric_learn/lsml.py index a46bf262..65b7b2a8 100644 --- a/metric_learn/lsml.py +++ b/metric_learn/lsml.py @@ -44,8 +44,8 @@ def __init__(self, tol=1e-3, max_iter=1000, prior=None, verbose=False, super(_BaseLSML, self).__init__(preprocessor) def _prepare_quadruplets(self, quadruplets, weights): - quadruplets = 
self.initialize_and_check_inputs(quadruplets, - type_of_inputs='tuples') + quadruplets = self._prepare_inputs(quadruplets, + type_of_inputs='tuples') # check to make sure that no two constrained vectors are identical self.vab_ = quadruplets[:, 0, :] - quadruplets[:, 1, :] @@ -217,7 +217,7 @@ def fit(self, X, y, random_state=np.random): random_state : numpy.random.RandomState, optional If provided, controls random number generation. """ - X, y = self.initialize_and_check_inputs(X, y, ensure_min_samples=2) + X, y = self._prepare_inputs(X, y, ensure_min_samples=2) num_constraints = self.num_constraints if num_constraints is None: num_classes = len(np.unique(y)) diff --git a/metric_learn/mlkr.py b/metric_learn/mlkr.py index adc26281..5f22bcf8 100644 --- a/metric_learn/mlkr.py +++ b/metric_learn/mlkr.py @@ -62,7 +62,7 @@ def __init__(self, num_dims=None, A0=None, epsilon=0.01, alpha=0.0001, super(MLKR, self).__init__(preprocessor) def _process_inputs(self, X, y): - self.X_, y = self.initialize_and_check_inputs(X, y, y_numeric=True) + self.X_, y = self._prepare_inputs(X, y, y_numeric=True) n, d = self.X_.shape if y.shape[0] != n: raise ValueError('Data and label lengths mismatch: %d != %d' diff --git a/metric_learn/mmc.py b/metric_learn/mmc.py index ea958306..5ee11fdb 100644 --- a/metric_learn/mmc.py +++ b/metric_learn/mmc.py @@ -73,8 +73,8 @@ def _fit(self, pairs, y): return self._fit_full(pairs, y) def _process_pairs(self, pairs, y): - pairs, y = self.initialize_and_check_inputs(pairs, y, - type_of_inputs='tuples') + pairs, y = self._prepare_inputs(pairs, y, + type_of_inputs='tuples') # check to make sure that no two constrained vectors are identical pos_pairs, neg_pairs = pairs[y == 1], pairs[y == -1] @@ -453,7 +453,7 @@ def fit(self, X, y, random_state=np.random): random_state : numpy.random.RandomState, optional If provided, controls random number generation. 
""" - X, y = self.initialize_and_check_inputs(X, y, ensure_min_samples=2) + X, y = self._prepare_inputs(X, y, ensure_min_samples=2) num_constraints = self.num_constraints if num_constraints is None: num_classes = len(np.unique(y)) diff --git a/metric_learn/nca.py b/metric_learn/nca.py index 2627f778..94bb9bfe 100644 --- a/metric_learn/nca.py +++ b/metric_learn/nca.py @@ -34,7 +34,7 @@ def fit(self, X, y): X: data matrix, (n x d) y: scalar labels, (n) """ - X, labels = self.initialize_and_check_inputs(X, y) + X, labels = self._prepare_inputs(X, y) n, d = X.shape num_dims = self.num_dims if num_dims is None: diff --git a/metric_learn/rca.py b/metric_learn/rca.py index 05aabde1..e7fac6fe 100644 --- a/metric_learn/rca.py +++ b/metric_learn/rca.py @@ -68,7 +68,7 @@ def __init__(self, num_dims=None, pca_comps=None, preprocessor=None): super(RCA, self).__init__(preprocessor) def _process_data(self, X): - X = self.initialize_and_check_inputs(X) + X = self._prepare_inputs(X) # PCA projection to remove noise and redundant information. if self.pca_comps is not None: @@ -183,7 +183,7 @@ def fit(self, X, y, random_state=np.random): y : (n) data labels random_state : a random.seed object to fix the random_state if needed. 
""" - X, y = self.initialize_and_check_inputs(X, y) + X, y = self._prepare_inputs(X, y) chunks = Constraints(y).chunks(num_chunks=self.num_chunks, chunk_size=self.chunk_size, random_state=random_state) diff --git a/metric_learn/sdml.py b/metric_learn/sdml.py index 7c267823..2d67d1a4 100644 --- a/metric_learn/sdml.py +++ b/metric_learn/sdml.py @@ -50,8 +50,8 @@ def __init__(self, balance_param=0.5, sparsity_param=0.01, use_cov=True, super(_BaseSDML, self).__init__(preprocessor) def _prepare_pairs(self, pairs, y): - pairs, y = self.initialize_and_check_inputs(pairs, y, - type_of_inputs='tuples') + pairs, y = self._prepare_inputs(pairs, y, + type_of_inputs='tuples') # set up prior M if self.use_cov: @@ -161,7 +161,7 @@ def fit(self, X, y, random_state=np.random): self : object Returns the instance. """ - X, y = self.initialize_and_check_inputs(X, y) + X, y = self._prepare_inputs(X, y) num_constraints = self.num_constraints if num_constraints is None: num_classes = len(np.unique(y)) From e5b5f5728d9b836165ccad785b1a89b8b772a6dd Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 13 Nov 2018 11:11:21 +0100 Subject: [PATCH 075/120] FIX: fix quadruplets scoring and delete useless comments --- metric_learn/base_metric.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index 82b0f2af..c964eace 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -232,8 +232,6 @@ def predict(self, pairs): return self.score_pairs(pairs) def decision_function(self, pairs): - # no need to check_input since it is done in - # predict->score_pairs return self.predict(pairs) def score(self, pairs, y): @@ -261,8 +259,6 @@ def score(self, pairs, y): score : float The ``roc_auc`` score. 
""" - # no need to check_input since it is done in - # predict->score_pairs return roc_auc_score(y, self.decision_function(pairs)) @@ -297,10 +293,10 @@ def predict(self, quadruplets): return np.sign(self.decision_function(quadruplets)) def decision_function(self, quadruplets): - # no need to check_input since it is done in - # predict->score_pairs - return (self.score_pairs(quadruplets[:, :2, :]) - - self.score_pairs(quadruplets[:, 2:, :])) + # we broadcast with ... because here we allow quadruplets to be + # either a 3D array of points or 2D array of indices + return (self.score_pairs(quadruplets[:, :2, ...]) - + self.score_pairs(quadruplets[:, 2:, ...])) def score(self, quadruplets, y=None): """Computes score on input quadruplets @@ -324,6 +320,4 @@ def score(self, quadruplets, y=None): score : float The quadruplets score. """ - # no need to check_input since it is done in - # predict->score_pairs return -np.mean(self.predict(quadruplets)) From 84c9d56dfd30d64bc61347956051cb8cd135d8d5 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 13 Nov 2018 11:26:07 +0100 Subject: [PATCH 076/120] MAINT: remove some enhancements to be coherent with previous code and simplify review --- metric_learn/base_metric.py | 4 ++-- metric_learn/lmnn.py | 2 +- metric_learn/mlkr.py | 8 ++++---- metric_learn/rca.py | 4 ++-- metric_learn/sdml.py | 4 ++-- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index c964eace..227cf6e2 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -216,7 +216,7 @@ def predict(self, pairs): Parameters ---------- - pairs: array-like, shape=(n_pairs, 2, n_features) or (n_pairs, 2) + pairs : array-like, shape=(n_pairs, 2, n_features) or (n_pairs, 2) 3D Array of pairs to predict, with each row corresponding to two points, or 2D array of indices of pairs if the metric learner uses a preprocessor. 
@@ -246,7 +246,7 @@ def score(self, pairs, y): Parameters ---------- - pairs: array-like, shape=(n_pairs, 2, n_features) or (n_pairs, 2) + pairs : array-like, shape=(n_pairs, 2, n_features) or (n_pairs, 2) 3D Array of pairs, with each row corresponding to two points, or 2D array of indices of pairs if the metric learner uses a preprocessor. diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py index 26920206..55f8fadf 100644 --- a/metric_learn/lmnn.py +++ b/metric_learn/lmnn.py @@ -31,7 +31,7 @@ def __init__(self, k=3, min_iter=50, max_iter=1000, learn_rate=1e-7, k : int, optional Number of neighbors to consider, not including self-edges. - regularization : float, optional + regularization: float, optional Weighting of pull and push terms, with 0.5 meaning equal weight. preprocessor : array-like, shape=(n_samples, n_features) or callable diff --git a/metric_learn/mlkr.py b/metric_learn/mlkr.py index 5f22bcf8..0a92c49a 100644 --- a/metric_learn/mlkr.py +++ b/metric_learn/mlkr.py @@ -38,16 +38,16 @@ def __init__(self, num_dims=None, A0=None, epsilon=0.01, alpha=0.0001, num_dims : int, optional Dimensionality of reduced space (defaults to dimension of X) - A0 : array-like, optional + A0: array-like, optional Initialization of transformation matrix. Defaults to PCA loadings. - epsilon : float, optional + epsilon: float, optional Step size for congujate gradient descent. - alpha : float, optional + alpha: float, optional Stopping criterion for congujate gradient descent. - max_iter : int, optional + max_iter: int, optional Cap on number of congugate gradient iterations. 
preprocessor : array-like, shape=(n_samples, n_features) or callable diff --git a/metric_learn/rca.py b/metric_learn/rca.py index e7fac6fe..f2e3937c 100644 --- a/metric_learn/rca.py +++ b/metric_learn/rca.py @@ -161,8 +161,8 @@ def __init__(self, num_dims=None, pca_comps=None, num_chunks=100, ---------- num_dims : int, optional embedding dimension (default: original dimension of data) - num_chunks : int, optional - chunk_size : int, optional + num_chunks: int, optional + chunk_size: int, optional preprocessor : array-like, shape=(n_samples, n_features) or callable The preprocessor to call to get tuples from indices. If array-like, tuples will be formed like this: X[indices]. diff --git a/metric_learn/sdml.py b/metric_learn/sdml.py index 2d67d1a4..651b3aa4 100644 --- a/metric_learn/sdml.py +++ b/metric_learn/sdml.py @@ -89,12 +89,12 @@ def fit(self, pairs, y): Parameters ---------- - pairs : array-like, shape=(n_constraints, 2, n_features) or + pairs: array-like, shape=(n_constraints, 2, n_features) or (n_constraints, 2) 3D Array of pairs with each row corresponding to two points, or 2D array of indices of pairs if the metric learner uses a preprocessor. - y : array-like, of shape (n_constraints,) + y: array-like, of shape (n_constraints,) Labels of constraints. Should be -1 for dissimilar pair, 1 for similar. 
Returns From 7605fa4012e0f3573ac5c239c7f2ae8acfc1facb Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 13 Nov 2018 11:52:05 +0100 Subject: [PATCH 077/120] MAINT: Improve test messages --- test/test_utils.py | 73 +++++++++++++++++----------------- test/test_weakly_supervised.py | 19 +++++---- 2 files changed, 48 insertions(+), 44 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 431c92be..24b34693 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -19,7 +19,7 @@ def mock_preprocessor(indices): """ return np.ones((indices.shape[0], 3)) -# ---------------------------- test check_input ---------------------------- +# ---------------- test check_input with 'tuples' type_of_input' ------------ @pytest.fixture @@ -58,7 +58,7 @@ def test_make_name(estimator, expected): [(tuples_prep, mock_preprocessor), (tuples_no_prep, None), (tuples_no_prep, mock_preprocessor)]) -def test_check_input_invalid_t(estimator, context, load_tuples, preprocessor): +def test_check_tuples_invalid_t(estimator, context, load_tuples, preprocessor): """Checks that the exception are raised if t is not the one expected""" tuples = load_tuples() preprocessed_tuples = (preprocess_tuples(tuples, preprocessor) @@ -89,8 +89,8 @@ def test_check_input_invalid_t(estimator, context, load_tuples, preprocessor): ([[[[5]]]], '4', '3D array of formed tuples', None), ([[1], [3]], '2', '3D array of formed ' 'tuples', None)]) -def test_check_input_invalid_shape(estimator, context, tuples, found, - expected, preprocessor): +def test_check_tuples_invalid_shape(estimator, context, tuples, found, + expected, preprocessor): """Checks that a value error with the appropriate message is raised if shape is invalid (not 2D with preprocessor or 3D with no preprocessor) """ @@ -110,7 +110,7 @@ def test_check_input_invalid_shape(estimator, context, tuples, found, @pytest.mark.parametrize('estimator, context', [(NCA(), " by NCA"), ('NCA', " by NCA"), (None, "")]) -def 
test_check_input_invalid_n_features(estimator, context, tuples_no_prep): +def test_check_tuples_invalid_n_features(estimator, context, tuples_no_prep): """Checks that the right warning is printed if not enough features Here we only test if no preprocessor (otherwise we don't ensure this) """ @@ -130,8 +130,8 @@ def test_check_input_invalid_n_features(estimator, context, tuples_no_prep): [(tuples_prep, mock_preprocessor), (tuples_no_prep, None), (tuples_no_prep, mock_preprocessor)]) -def test_check_input_invalid_n_samples(estimator, context, load_tuples, - preprocessor): +def test_check_tuples_invalid_n_samples(estimator, context, load_tuples, + preprocessor): """Checks that the right warning is printed if n_samples is too small""" tuples = load_tuples() msg = ("Found array with 2 sample(s) (shape={}) while a minimum of 3 " @@ -152,8 +152,8 @@ def test_check_input_invalid_n_samples(estimator, context, load_tuples, [(tuples_prep, mock_preprocessor), (tuples_no_prep, None), (tuples_no_prep, mock_preprocessor)]) -def test_check_input_invalid_dtype_convertible(estimator, context, - load_tuples, preprocessor): +def test_check_tuples_invalid_dtype_convertible(estimator, context, + load_tuples, preprocessor): """Checks that a warning is raised if a convertible input is converted to float""" tuples = load_tuples().astype(object) # here the object conversion is @@ -175,7 +175,7 @@ def preprocessor(indices): # assert str(raised_warning[0].message) == msg -def test_check_input_invalid_dtype_not_convertible_with_preprocessor( +def test_check_tuples_invalid_dtype_not_convertible_with_preprocessor( tuples_prep): """Checks that a value error is thrown if attempting to convert an input not convertible to float, when using a preprocessor @@ -190,7 +190,7 @@ def preprocessor(indices): preprocessor=preprocessor, dtype=np.float64) -def test_check_input_invalid_dtype_not_convertible_without_preprocessor( +def test_check_tuples_invalid_dtype_not_convertible_without_preprocessor( 
tuples_no_prep): """Checks that a value error is thrown if attempting to convert an input not convertible to float, when using no preprocessor @@ -202,10 +202,10 @@ def test_check_input_invalid_dtype_not_convertible_without_preprocessor( @pytest.mark.parametrize('t', [2, None]) -def test_check_input_valid_t(t, tuples_prep, tuples_no_prep): +def test_check_tuples_valid_t(t, tuples_prep, tuples_no_prep): """For inputs that have the right matrix dimension (2D or 3D for instance), checks that checking the number of tuples (pairs, quadruplets, etc) raises - no warning + no warning if there is the right number of points in a tuple. """ with pytest.warns(None) as record: check_input(tuples_prep, type_of_inputs='tuples', @@ -230,7 +230,7 @@ def test_check_input_valid_t(t, tuples_prep, tuples_no_prep): (1, 4, 9)), np.array([[[1.2, 2.2], [1.4, 3.3]], [[2.6, 2.3], [3.4, 5.0]]])]) -def test_check_input_valid_with_preprocessor(tuples): +def test_check_tuples_valid_with_preprocessor(tuples): """Test that valid inputs when using a preprocessor raises no warning""" with pytest.warns(None) as record: check_input(tuples, type_of_inputs='tuples', @@ -251,14 +251,14 @@ def test_check_input_valid_with_preprocessor(tuples): (((2, 1), (0, 2), (2, 3)), ((1, 2), (4, 4), (9, 3)), ((3, 1), (4, 4), (29, 4)))]) -def test_check_input_valid_without_preprocessor(tuples): +def test_check_tuples_valid_without_preprocessor(tuples): """Test that valid inputs when using no preprocessor raises no warning""" with pytest.warns(None) as record: check_input(tuples, type_of_inputs='tuples', preprocessor=None) assert len(record) == 0 -def test_check_input_behaviour_auto_dtype(tuples_no_prep): +def test_check_tuples_behaviour_auto_dtype(tuples_no_prep): """Checks that check_tuples allows by default every type if using a preprocessor, and numeric types if using no preprocessor""" tuples_prep = [['img1.png', 'img2.png'], ['img3.png', 'img5.png']] @@ -279,7 +279,7 @@ def 
test_check_input_behaviour_auto_dtype(tuples_no_prep): check_input(tuples_no_prep, type_of_inputs='tuples') -def test_check_input_invalid_complex_data(): +def test_check_tuples_invalid_complex_data(): """Checks that the right error message is thrown if given complex data ( this comes from sklearn's check_array's message)""" tuples = np.array([[[1 + 2j, 3 + 4j], [5 + 7j, 5 + 7j]], @@ -291,8 +291,7 @@ def test_check_input_invalid_complex_data(): assert str(raised_error.value) == msg -# ---------------------------- test check_input with points type -# ---------------------------- +# ------------- test check_input with 'classic' type_of_inputs ---------------- @pytest.fixture @@ -321,8 +320,8 @@ def points_no_prep(): 'array of formed points', mock_preprocessor), ([[[5]]], '3', '2D array of formed points', None)]) -def test_check_input_points_invalid_shape(estimator, context, points, found, - expected, preprocessor): +def test_check_classic_invalid_shape(estimator, context, points, found, + expected, preprocessor): """Checks that a value error with the appropriate message is raised if shape is invalid (valid being 1D or 2D with preprocessor or 2D with no preprocessor) @@ -343,8 +342,8 @@ def test_check_input_points_invalid_shape(estimator, context, points, found, @pytest.mark.parametrize('estimator, context', [(NCA(), " by NCA"), ('NCA', " by NCA"), (None, "")]) -def test_check_input_point_invalid_n_features(estimator, context, - points_no_prep): +def test_check_classic_invalid_n_features(estimator, context, + points_no_prep): """Checks that the right warning is printed if not enough features Here we only test if no preprocessor (otherwise we don't ensure this) """ @@ -364,8 +363,8 @@ def test_check_input_point_invalid_n_features(estimator, context, [(points_prep, mock_preprocessor), (points_no_prep, None), (points_no_prep, mock_preprocessor)]) -def test_check_input_point_invalid_n_samples(estimator, context, load_points, - preprocessor): +def 
test_check_classic_invalid_n_samples(estimator, context, load_points, + preprocessor): """Checks that the right warning is printed if n_samples is too small""" points = load_points() msg = ("Found array with 2 sample(s) (shape={}) while a minimum of 3 " @@ -388,9 +387,9 @@ def test_check_input_point_invalid_n_samples(estimator, context, load_points, [(points_prep, mock_preprocessor), (points_no_prep, None), (points_no_prep, mock_preprocessor)]) -def test_check_input_point_invalid_dtype_convertible(estimator, context, - load_points, - preprocessor): +def test_check_classic_invalid_dtype_convertible(estimator, context, + load_points, + preprocessor): """Checks that a warning is raised if a convertible input is converted to float""" points = load_points().astype(object) # here the object conversion is @@ -417,7 +416,7 @@ def preprocessor(indices): ['e', 'b']])), (None, np.array([[['b', 'v'], ['a', 'd']], [['x', 'u'], ['c', 'a']]]))]) -def test_check_input_point_invalid_dtype_not_convertible(preprocessor, points): +def test_check_classic_invalid_dtype_not_convertible(preprocessor, points): """Checks that a value error is thrown if attempting to convert an input not convertible to float """ @@ -435,7 +434,7 @@ def test_check_input_point_invalid_dtype_not_convertible(preprocessor, points): (2, 0, 2), np.array([[1.2, 2.2], [2.6, 2.3]])]) -def test_check_input_point_valid_with_preprocessor(points): +def test_check_classic_valid_with_preprocessor(points): """Test that valid inputs when using a preprocessor raises no warning""" with pytest.warns(None) as record: check_input(points, type_of_inputs='classic', @@ -456,14 +455,14 @@ def test_check_input_point_valid_with_preprocessor(points): ((2, 1, 0, 2, 2, 3), (1, 2, 4, 4, 9, 3), (3, 1, 4, 4, 29, 4))]) -def test_check_input_point_valid_without_preprocessor(points): +def test_check_classic_valid_without_preprocessor(points): """Test that valid inputs when using no preprocessor raises no warning""" with pytest.warns(None) as 
record: check_input(points, type_of_inputs='classic', preprocessor=None) assert len(record) == 0 -def test_check_input_point_behaviour_auto_dtype(points_no_prep): +def test_check_classic_behaviour_auto_dtype(points_no_prep): """Checks that check_input (for points) allows by default every type if using a preprocessor, and numeric types if using no preprocessor""" points_prep = ['img1.png', 'img2.png', 'img3.png', 'img5.png'] @@ -484,7 +483,7 @@ def test_check_input_point_behaviour_auto_dtype(points_no_prep): check_input(points_no_prep, type_of_inputs='classic') -def test_check_input_point_invalid_complex_data(): +def test_check_classic_invalid_complex_data(): """Checks that the right error message is thrown if given complex data ( this comes from sklearn's check_array's message)""" points = np.array([[[1 + 2j, 3 + 4j], [5 + 7j, 5 + 7j]], @@ -612,7 +611,7 @@ def preprocessor(sequence): @pytest.mark.parametrize('estimator', ['NCA', NCA(), None]) def test_preprocess_points_invalid_message(estimator): """Checks that if the preprocessor does some weird stuff, the preprocessed - input is detected as weird. Checks this for preprocess_points.""" + input is detected as weird.""" context = make_context(estimator) + (' after the preprocessor ' 'has been applied') @@ -645,7 +644,7 @@ def fun(row): def test_progress_message_preprocessor_tuples(capsys): - """Tests that when using a preprocessor on points, a message is printed + """Tests that when using a preprocessor on tuples, a message is printed """ tuples = np.array([[1, 2], [2, 3], @@ -794,7 +793,9 @@ def build_regression(rng): @pytest.mark.parametrize('estimator, dataset', estimators, ids=ids_estimators) def test_same_with_or_without_preprocessor(estimator, dataset): - + """test that supervised algorithms using a preprocessor behave consistently + with their no-preprocessor equivalent. 
+ """ (formed_points_train, formed_points_test, y_train, y_test, points_indicators_train, points_indicators_test) = train_test_split(dataset.formed_points, diff --git a/test/test_weakly_supervised.py b/test/test_weakly_supervised.py index 3e6d9adf..e4600233 100644 --- a/test/test_weakly_supervised.py +++ b/test/test_weakly_supervised.py @@ -41,16 +41,14 @@ def build_data(): def build_classification(preprocessor): - # test that you can do cross validation on tuples of points with - # a WeaklySupervisedMetricLearner + # builds a toy classification problem X, y = shuffle(*make_blobs(), random_state=RNG) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RNG) return (X, X, y, X_train, X_test, y_train, y_test, preprocessor) def build_regression(preprocessor): - # test that you can do cross validation on tuples of points with - # a WeaklySupervisedMetricLearner + # builds a toy regression problem X, y = shuffle(*make_regression(n_samples=100, n_features=10), random_state=RNG) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RNG) @@ -58,8 +56,7 @@ def build_regression(preprocessor): def build_pairs(preprocessor): - # test that you can do cross validation on tuples of points with - # a WeaklySupervisedMetricLearner + # builds a toy pairs problem X, indices = build_data() if preprocessor is not None: # if preprocessor, we build a 2D array of pairs of indices @@ -77,8 +74,7 @@ def build_pairs(preprocessor): def build_quadruplets(preprocessor): - # test that you can do cross validation on a tuples of points with - # a WeaklySupervisedMetricLearner + # builds a toy quadruplets problem X, indices = build_data() c = np.column_stack(indices) if preprocessor is not None: @@ -131,6 +127,8 @@ def build_quadruplets(preprocessor): @pytest.mark.parametrize('estimator, build_dataset', list_estimators, ids=ids_estimators) def test_cross_validation(estimator, build_dataset, preprocessor): + """Tests that you can do cross validation on metric-learn 
estimators + """ if any(hasattr(estimator, method) for method in ["predict", "score"]): (X, tuples, y, tuples_train, tuples_test, y_train, y_test, preprocessor) = build_dataset(preprocessor) @@ -159,6 +157,8 @@ def check_predict(estimator, tuples): @pytest.mark.parametrize('estimator, build_dataset', list_estimators, ids=ids_estimators) def test_simple_estimator(estimator, build_dataset, preprocessor): + """Tests that fit, predict and scoring works. + """ if any(hasattr(estimator, method) for method in ["predict", "score"]): (X, tuples, y, tuples_train, tuples_test, y_train, y_test, preprocessor) = build_dataset(preprocessor) @@ -293,6 +293,9 @@ def test_dict_unchanged(estimator, build_dataset, preprocessor): (SDML(), build_pairs)], ids=['itml', 'lsml', 'mmc', 'sdml']) def test_same_result_with_or_without_preprocessor(estimator, build_dataset): + """For weakly supervised algorithms, test that using a preprocessor or not + (with the appropriate corresponding inputs) give the same result. 
+ """ (X, tuples, y, tuples_train, tuples_test, y_train, y_test, _) = build_dataset(preprocessor=mock_preprocessor) formed_tuples_train = X[tuples_train] From 69de333078c26fe4ac2f7768cc41c40112f762f9 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 13 Nov 2018 16:39:43 +0100 Subject: [PATCH 078/120] MAINT: reorganize tests --- test/test_sklearn_compat.py | 359 ++++++++++++++++++++++++++- test/test_utils.py | 85 ++++++- test/test_weakly_supervised.py | 431 --------------------------------- 3 files changed, 438 insertions(+), 437 deletions(-) delete mode 100644 test/test_weakly_supervised.py diff --git a/test/test_sklearn_compat.py b/test/test_sklearn_compat.py index f1e1a09d..946db78c 100644 --- a/test/test_sklearn_compat.py +++ b/test/test_sklearn_compat.py @@ -1,10 +1,23 @@ -import numpy as np +import pytest import unittest from sklearn.utils.estimator_checks import check_estimator +from sklearn.base import TransformerMixin +from sklearn.datasets import load_iris, make_regression, make_blobs +from sklearn.pipeline import make_pipeline +from sklearn.utils import shuffle, check_random_state +from sklearn.utils.estimator_checks import is_public_parameter +from sklearn.utils.testing import (assert_allclose_dense_sparse, + set_random_state) +from sklearn.utils.fixes import signature -from metric_learn import ( - LMNN, NCA, LFDA, Covariance, MLKR, - LSML_Supervised, ITML_Supervised, SDML_Supervised, RCA_Supervised, MMC_Supervised) +from metric_learn import (Covariance, ITML, LFDA, LMNN, LSML, MLKR, MMC, NCA, + RCA, SDML, ITML_Supervised, LSML_Supervised, + MMC_Supervised, RCA_Supervised, SDML_Supervised) +from metric_learn.constraints import wrap_pairs, Constraints +from sklearn import clone +import numpy as np +from sklearn.model_selection import (cross_val_score, cross_val_predict, + train_test_split) # Wrap the _Supervised methods with a deterministic wrapper for testing. 
@@ -68,5 +81,343 @@ def test_mmc(self): # check_estimator(RCA_Supervised) +RNG = check_random_state(0) + + +def mock_preprocessor(indices): + """A preprocessor for testing purposes that returns an all ones 3D array + """ + return np.ones((indices.shape[0], 3)) + + +# ---------------------- Test scikit-learn compatibility ---------------------- + + +def build_data(): + dataset = load_iris() + X, y = shuffle(dataset.data, dataset.target, random_state=RNG) + num_constraints = 50 + constraints = Constraints.random_subset(y, random_state=RNG) + pairs = constraints.positive_negative_pairs(num_constraints, + same_length=True, + random_state=RNG) + return X, pairs + + +def build_classification(preprocessor): + # builds a toy classification problem + X, y = shuffle(*make_blobs(), random_state=RNG) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RNG) + return (X, X, y, X_train, X_test, y_train, y_test, preprocessor) + + +def build_regression(preprocessor): + # builds a toy regression problem + X, y = shuffle(*make_regression(n_samples=100, n_features=10), + random_state=RNG) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RNG) + return (X, X, y, X_train, X_test, y_train, y_test, preprocessor) + + +def build_pairs(preprocessor): + # builds a toy pairs problem + X, indices = build_data() + if preprocessor is not None: + # if preprocessor, we build a 2D array of pairs of indices + _, y = wrap_pairs(X, indices) + pairs = np.vstack([np.column_stack(indices[:2]), + np.column_stack(indices[2:])]) + else: + # if not, we build a 3D array of pairs of samples + pairs, y = wrap_pairs(X, indices) + pairs, y = shuffle(pairs, y, random_state=RNG) + (pairs_train, pairs_test, y_train, + y_test) = train_test_split(pairs, y, random_state=RNG) + return (X, pairs, y, pairs_train, pairs_test, + y_train, y_test, preprocessor) + + +def build_quadruplets(preprocessor): + # builds a toy quadruplets problem + X, indices = build_data() + c = 
np.column_stack(indices) + if preprocessor is not None: + # if preprocessor, we build a 2D array of quadruplets of indices + quadruplets = c + else: + # if not, we build a 3D array of quadruplets of samples + quadruplets = X[c] + quadruplets = shuffle(quadruplets, random_state=RNG) + y = y_train = y_test = None + quadruplets_train, quadruplets_test = train_test_split(quadruplets, + random_state=RNG) + return (X, quadruplets, y, quadruplets_train, quadruplets_test, + y_train, y_test, preprocessor) + + +list_estimators = [(Covariance(), build_classification), + (ITML(), build_pairs), + (LFDA(), build_classification), + (LMNN(), build_classification), + (LSML(), build_quadruplets), + (MLKR(), build_regression), + (MMC(max_iter=2), build_pairs), # max_iter=2 for faster + # testing + (NCA(), build_classification), + (RCA(), build_classification), + (SDML(), build_pairs), + (ITML_Supervised(), build_classification), + (LSML_Supervised(), build_classification), + (MMC_Supervised(), build_classification), + (RCA_Supervised(num_chunks=10), build_classification), + (SDML_Supervised(), build_classification) + ] + +ids_estimators = ['covariance', + 'itml', + 'lfda', + 'lmnn', + 'lsml', + 'mlkr', + 'mmc', + 'nca', + 'rca', + 'sdml', + 'itml_supervised', + 'lsml_supervised', + 'mmc_supervised', + 'rca_supervised', + 'sdml_supervised' + ] + + +@pytest.mark.parametrize('preprocessor', [None, build_data()[0]]) +@pytest.mark.parametrize('estimator, build_dataset', list_estimators, + ids=ids_estimators) +def test_cross_validation(estimator, build_dataset, preprocessor): + """Tests that you can do cross validation on metric-learn estimators + """ + if any(hasattr(estimator, method) for method in ["predict", "score"]): + (X, tuples, y, tuples_train, tuples_test, + y_train, y_test, preprocessor) = build_dataset(preprocessor) + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + set_random_state(estimator) + if hasattr(estimator, "score"): + assert 
np.isfinite(cross_val_score(estimator, tuples, y)).all() + if hasattr(estimator, "predict"): + assert np.isfinite(cross_val_predict(estimator, tuples, y)).all() + + +def check_score(estimator, tuples, y): + if hasattr(estimator, "score"): + score = estimator.score(tuples, y) + assert np.isfinite(score) + + +def check_predict(estimator, tuples): + if hasattr(estimator, "predict"): + y_predicted = estimator.predict(tuples) + assert len(y_predicted), len(tuples) + + +@pytest.mark.parametrize('preprocessor', [None, build_data()[0]]) +@pytest.mark.parametrize('estimator, build_dataset', list_estimators, + ids=ids_estimators) +def test_simple_estimator(estimator, build_dataset, preprocessor): + """Tests that fit, predict and scoring works. + """ + if any(hasattr(estimator, method) for method in ["predict", "score"]): + (X, tuples, y, tuples_train, tuples_test, + y_train, y_test, preprocessor) = build_dataset(preprocessor) + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + set_random_state(estimator) + + estimator.fit(tuples_train, y_train) + check_score(estimator, tuples_test, y_test) + check_predict(estimator, tuples_test) + + +@pytest.mark.parametrize('estimator', [est[0] for est in list_estimators], + ids=ids_estimators) +@pytest.mark.parametrize('preprocessor', [None, build_data()[0]]) +def test_no_attributes_set_in_init(estimator, preprocessor): + """Check setting during init. 
Adapted from scikit-learn.""" + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + if hasattr(type(estimator).__init__, "deprecated_original"): + return + + init_params = _get_args(type(estimator).__init__) + parents_init_params = [param for params_parent in + (_get_args(parent) for parent in + type(estimator).__mro__) + for param in params_parent] + + # Test for no setting apart from parameters during init + invalid_attr = (set(vars(estimator)) - set(init_params) - + set(parents_init_params)) + assert not invalid_attr, \ + ("Estimator %s should not set any attribute apart" + " from parameters during init. Found attributes %s." + % (type(estimator).__name__, sorted(invalid_attr))) + # Ensure that each parameter is set in init + invalid_attr = (set(init_params) - set(vars(estimator)) - + set(["self"])) + assert not invalid_attr, \ + ("Estimator %s should store all parameters" + " as an attribute during init. Did not find " + "attributes %s." % (type(estimator).__name__, sorted(invalid_attr))) + + +@pytest.mark.parametrize('preprocessor', [None, build_data()[0]]) +@pytest.mark.parametrize('estimator, build_dataset', list_estimators, + ids=ids_estimators) +def test_estimators_fit_returns_self(estimator, build_dataset, preprocessor): + """Check if self is returned when calling fit""" + # Adapted from scikit-learn + (X, tuples, y, tuples_train, tuples_test, + y_train, y_test, preprocessor) = build_dataset(preprocessor) + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + assert estimator.fit(tuples, y) is estimator + + +@pytest.mark.parametrize('preprocessor', [None, build_data()[0]]) +@pytest.mark.parametrize('estimator, build_dataset', list_estimators, + ids=ids_estimators) +def test_pipeline_consistency(estimator, build_dataset, preprocessor): + # Adapted from scikit learn + # check that make_pipeline(est) gives same score as est + (_, inputs, y, _, _, _, _, preprocessor) = build_dataset(preprocessor) + + 
def make_random_state(estimator, in_pipeline): + rs = {} + name_estimator = estimator.__class__.__name__ + if name_estimator[-11:] == '_Supervised': + name_param = 'random_state' + if in_pipeline: + name_param = name_estimator.lower() + '__' + name_param + rs[name_param] = check_random_state(0) + return rs + + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + pipeline = make_pipeline(estimator) + estimator.fit(inputs, y, **make_random_state(estimator, False)) + pipeline.fit(inputs, y, **make_random_state(estimator, True)) + + if hasattr(estimator, 'score'): + result = estimator.score(inputs, y) + result_pipe = pipeline.score(inputs, y) + assert_allclose_dense_sparse(result, result_pipe) + + if hasattr(estimator, 'predict'): + result = estimator.predict(inputs) + result_pipe = pipeline.predict(inputs) + assert_allclose_dense_sparse(result, result_pipe) + + if issubclass(estimator.__class__, TransformerMixin): + if hasattr(estimator, 'transform'): + result = estimator.transform(inputs) + result_pipe = pipeline.transform(inputs) + assert_allclose_dense_sparse(result, result_pipe) + + +@pytest.mark.parametrize('preprocessor', [None, build_data()[0]]) +@pytest.mark.parametrize('estimator, build_dataset', list_estimators, + ids=ids_estimators) +def test_dict_unchanged(estimator, build_dataset, preprocessor): + # Adapted from scikit-learn + (X, tuples, y, tuples_train, tuples_test, + y_train, y_test, preprocessor) = build_dataset(preprocessor) + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + if hasattr(estimator, "num_dims"): + estimator.num_dims = 1 + estimator.fit(tuples, y) + for method in ["predict", "decision_function", "predict_proba"]: + if hasattr(estimator, method): + dict_before = estimator.__dict__.copy() + getattr(estimator, method)(tuples) + assert estimator.__dict__ == dict_before, \ + ("Estimator changes __dict__ during %s" + % method) + for method in ["transform"]: + if hasattr(estimator, 
method): + dict_before = estimator.__dict__.copy() + # we transform only 2D arrays (dataset of points) + getattr(estimator, method)(X) + assert estimator.__dict__ == dict_before, \ + ("Estimator changes __dict__ during %s" + % method) + + +@pytest.mark.parametrize('preprocessor', [None, build_data()[0]]) +@pytest.mark.parametrize('estimator, build_dataset', list_estimators, + ids=ids_estimators) +def test_dont_overwrite_parameters(estimator, build_dataset, preprocessor): + # Adapted from scikit-learn + # check that fit method only changes or sets private attributes + (X, tuples, y, tuples_train, tuples_test, + y_train, y_test, preprocessor) = build_dataset(preprocessor) + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + if hasattr(estimator, "num_dims"): + estimator.num_dims = 1 + dict_before_fit = estimator.__dict__.copy() + + estimator.fit(tuples, y) + dict_after_fit = estimator.__dict__ + + public_keys_after_fit = [key for key in dict_after_fit.keys() + if is_public_parameter(key)] + + attrs_added_by_fit = [key for key in public_keys_after_fit + if key not in dict_before_fit.keys()] + + # check that fit doesn't add any public attribute + assert not attrs_added_by_fit, ( + "Estimator adds public attribute(s) during" + " the fit method." + " Estimators are only allowed to add private " + "attributes" + " either started with _ or ended" + " with _ but %s added" % ', '.join(attrs_added_by_fit)) + + # check that fit doesn't change any public attribute + attrs_changed_by_fit = [key for key in public_keys_after_fit + if (dict_before_fit[key] + is not dict_after_fit[key])] + + assert not attrs_changed_by_fit, ( + "Estimator changes public attribute(s) during" + " the fit method. 
Estimators are only allowed" + " to change attributes started" + " or ended with _, but" + " %s changed" % ', '.join(attrs_changed_by_fit)) + + +def _get_args(function, varargs=False): + """Helper to get function arguments""" + + try: + params = signature(function).parameters + except ValueError: + # Error on builtin C function + return [] + args = [key for key, param in params.items() + if param.kind not in (param.VAR_POSITIONAL, param.VAR_KEYWORD)] + if varargs: + varargs = [param.name for param in params.values() + if param.kind == param.VAR_POSITIONAL] + if len(varargs) == 0: + varargs = None + return args, varargs + else: + return args + + if __name__ == '__main__': unittest.main() diff --git a/test/test_utils.py b/test/test_utils.py index 24b34693..6fade133 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -12,6 +12,7 @@ LMNN, MLKR, NCA, ITML_Supervised, LSML_Supervised, MMC_Supervised, RCA_Supervised, SDML_Supervised) from sklearn.datasets import make_regression, make_blobs +from .test_sklearn_compat import build_pairs, build_quadruplets def mock_preprocessor(indices): @@ -792,8 +793,8 @@ def build_regression(rng): @pytest.mark.parametrize('estimator, dataset', estimators, ids=ids_estimators) -def test_same_with_or_without_preprocessor(estimator, dataset): - """test that supervised algorithms using a preprocessor behave consistently +def test_same_with_or_without_preprocessor_classic(estimator, dataset): + """Test that supervised algorithms using a preprocessor behave consistently with their no-preprocessor equivalent. 
""" (formed_points_train, formed_points_test, @@ -846,3 +847,83 @@ def make_random_state(estimator): points_indicators_test[np.array([[0, 2], [5, 3]])]) == estimator_with_prep_formed.score_pairs( formed_points_test[np.array([[0, 2], [5, 3]])])).all() + + +@pytest.mark.parametrize('estimator, build_dataset', + [(ITML(), build_pairs), + (LSML(), build_quadruplets), + (MMC(max_iter=2), build_pairs), + (SDML(), build_pairs)], + ids=['itml', 'lsml', 'mmc', 'sdml']) +def test_same_with_or_without_preprocessor_tuples(estimator, build_dataset): + """For weakly supervised algorithms, test that using a preprocessor or not + (with the appropriate corresponding inputs) give the same result. + """ + (X, tuples, y, tuples_train, tuples_test, y_train, + y_test, _) = build_dataset(preprocessor=mock_preprocessor) + formed_tuples_train = X[tuples_train] + formed_tuples_test = X[tuples_test] + + estimator_with_preprocessor = clone(estimator) + set_random_state(estimator_with_preprocessor) + estimator_with_preprocessor.set_params(preprocessor=X) + if estimator.__class__.__name__ == 'LSML': + estimator_with_preprocessor.fit(tuples_train) + else: + estimator_with_preprocessor.fit(tuples_train, y_train) + + estimator_without_preprocessor = clone(estimator) + set_random_state(estimator_without_preprocessor) + estimator_without_preprocessor.set_params(preprocessor=None) + if estimator.__class__.__name__ == 'LSML': + estimator_without_preprocessor.fit(formed_tuples_train) + else: + estimator_without_preprocessor.fit(formed_tuples_train, y_train) + + estimator_with_prep_formed = clone(estimator) + set_random_state(estimator_with_prep_formed) + estimator_with_prep_formed.set_params(preprocessor=X) + if estimator.__class__.__name__ == 'LSML': + estimator_with_prep_formed.fit(tuples_train) + else: + estimator_with_prep_formed.fit(tuples_train, y_train) + + # test prediction methods + for method in ["predict", "decision_function"]: + if hasattr(estimator, method): + output_with_prep = 
getattr(estimator_with_preprocessor, + method)(tuples_test) + output_without_prep = getattr(estimator_without_preprocessor, + method)(formed_tuples_test) + assert np.array(output_with_prep == output_without_prep).all() + output_with_prep = getattr(estimator_with_preprocessor, + method)(tuples_test) + output_with_prep_formed = getattr(estimator_with_prep_formed, + method)(formed_tuples_test) + assert np.array(output_with_prep == output_with_prep_formed).all() + + # test score_pairs + output_with_prep = estimator_with_preprocessor.score_pairs( + tuples_test[:, :2]) + output_without_prep = estimator_without_preprocessor.score_pairs( + formed_tuples_test[:, :2]) + assert np.array(output_with_prep == output_without_prep).all() + + output_with_prep = estimator_with_preprocessor.score_pairs( + tuples_test[:, :2]) + output_without_prep = estimator_with_prep_formed.score_pairs( + formed_tuples_test[:, :2]) + assert np.array(output_with_prep == output_without_prep).all() + + # test transform + output_with_prep = estimator_with_preprocessor.transform( + tuples_test[:, 0]) + output_without_prep = estimator_without_preprocessor.transform( + formed_tuples_test[:, 0]) + assert np.array(output_with_prep == output_without_prep).all() + + output_with_prep = estimator_with_preprocessor.transform( + tuples_test[:, 0]) + output_without_prep = estimator_with_prep_formed.transform( + formed_tuples_test[:, 0]) + assert np.array(output_with_prep == output_without_prep).all() diff --git a/test/test_weakly_supervised.py b/test/test_weakly_supervised.py deleted file mode 100644 index e4600233..00000000 --- a/test/test_weakly_supervised.py +++ /dev/null @@ -1,431 +0,0 @@ -import pytest -from sklearn.base import TransformerMixin -from sklearn.datasets import load_iris, make_regression, make_blobs -from sklearn.pipeline import make_pipeline -from sklearn.utils import shuffle, check_random_state -from sklearn.utils.estimator_checks import is_public_parameter -from sklearn.utils.testing import 
(assert_allclose_dense_sparse, - set_random_state) -from sklearn.utils.fixes import signature - -from metric_learn import (ITML, LFDA, LMNN, LSML, MLKR, MMC, NCA, RCA, SDML, - ITML_Supervised, LSML_Supervised, MMC_Supervised, - SDML_Supervised) -from metric_learn.constraints import wrap_pairs, Constraints -from sklearn import clone -import numpy as np -from sklearn.model_selection import (cross_val_score, cross_val_predict, - train_test_split) - -RNG = check_random_state(0) - - -def mock_preprocessor(indices): - """A preprocessor for testing purposes that returns an all ones 3D array - """ - return np.ones((indices.shape[0], 3)) - - -# ---------------------- Test scikit-learn compatibility ---------------------- - - -def build_data(): - dataset = load_iris() - X, y = shuffle(dataset.data, dataset.target, random_state=RNG) - num_constraints = 50 - constraints = Constraints.random_subset(y, random_state=RNG) - pairs = constraints.positive_negative_pairs(num_constraints, - same_length=True, - random_state=RNG) - return X, pairs - - -def build_classification(preprocessor): - # builds a toy classification problem - X, y = shuffle(*make_blobs(), random_state=RNG) - X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RNG) - return (X, X, y, X_train, X_test, y_train, y_test, preprocessor) - - -def build_regression(preprocessor): - # builds a toy regression problem - X, y = shuffle(*make_regression(n_samples=100, n_features=10), - random_state=RNG) - X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RNG) - return (X, X, y, X_train, X_test, y_train, y_test, preprocessor) - - -def build_pairs(preprocessor): - # builds a toy pairs problem - X, indices = build_data() - if preprocessor is not None: - # if preprocessor, we build a 2D array of pairs of indices - _, y = wrap_pairs(X, indices) - pairs = np.vstack([np.column_stack(indices[:2]), - np.column_stack(indices[2:])]) - else: - # if not, we build a 3D array of pairs of samples - pairs, 
y = wrap_pairs(X, indices) - pairs, y = shuffle(pairs, y, random_state=RNG) - (pairs_train, pairs_test, y_train, - y_test) = train_test_split(pairs, y, random_state=RNG) - return (X, pairs, y, pairs_train, pairs_test, - y_train, y_test, preprocessor) - - -def build_quadruplets(preprocessor): - # builds a toy quadruplets problem - X, indices = build_data() - c = np.column_stack(indices) - if preprocessor is not None: - # if preprocessor, we build a 2D array of quadruplets of indices - quadruplets = c - else: - # if not, we build a 3D array of quadruplets of samples - quadruplets = X[c] - quadruplets = shuffle(quadruplets, random_state=RNG) - y = y_train = y_test = None - quadruplets_train, quadruplets_test = train_test_split(quadruplets, - random_state=RNG) - return (X, quadruplets, y, quadruplets_train, quadruplets_test, - y_train, y_test, preprocessor) - - -list_estimators = [(ITML(), build_pairs), - (LFDA(), build_classification), - (LMNN(), build_classification), - (LSML(), build_quadruplets), - (MLKR(), build_regression), - (MMC(max_iter=2), build_pairs), # max_iter=2 for faster - # testing - (NCA(), build_classification), - (RCA(), build_classification), - (SDML(), build_pairs), - (ITML_Supervised(), build_classification), - (LSML_Supervised(), build_classification), - (MMC_Supervised(), build_classification), - (SDML_Supervised(), build_classification) - ] - -ids_estimators = ['itml', - 'lfda', - 'lmnn', - 'lsml', - 'mlkr', - 'mmc', - 'nca', - 'rca', - 'sdml', - 'itml_supervised', - 'lsml_supervised', - 'mmc_supervised', - 'sdml_supervised' - ] - - -@pytest.mark.parametrize('preprocessor', [None, build_data()[0]]) -@pytest.mark.parametrize('estimator, build_dataset', list_estimators, - ids=ids_estimators) -def test_cross_validation(estimator, build_dataset, preprocessor): - """Tests that you can do cross validation on metric-learn estimators - """ - if any(hasattr(estimator, method) for method in ["predict", "score"]): - (X, tuples, y, tuples_train, 
tuples_test, - y_train, y_test, preprocessor) = build_dataset(preprocessor) - estimator = clone(estimator) - estimator.set_params(preprocessor=preprocessor) - set_random_state(estimator) - if hasattr(estimator, "score"): - assert np.isfinite(cross_val_score(estimator, tuples, y)).all() - if hasattr(estimator, "predict"): - assert np.isfinite(cross_val_predict(estimator, tuples, y)).all() - - -def check_score(estimator, tuples, y): - if hasattr(estimator, "score"): - score = estimator.score(tuples, y) - assert np.isfinite(score) - - -def check_predict(estimator, tuples): - if hasattr(estimator, "predict"): - y_predicted = estimator.predict(tuples) - assert len(y_predicted), len(tuples) - - -@pytest.mark.parametrize('preprocessor', [None, build_data()[0]]) -@pytest.mark.parametrize('estimator, build_dataset', list_estimators, - ids=ids_estimators) -def test_simple_estimator(estimator, build_dataset, preprocessor): - """Tests that fit, predict and scoring works. - """ - if any(hasattr(estimator, method) for method in ["predict", "score"]): - (X, tuples, y, tuples_train, tuples_test, - y_train, y_test, preprocessor) = build_dataset(preprocessor) - estimator = clone(estimator) - estimator.set_params(preprocessor=preprocessor) - set_random_state(estimator) - - estimator.fit(tuples_train, y_train) - check_score(estimator, tuples_test, y_test) - check_predict(estimator, tuples_test) - - -@pytest.mark.parametrize('estimator', [est[0] for est in list_estimators], - ids=ids_estimators) -@pytest.mark.parametrize('preprocessor', [None, build_data()[0]]) -def test_no_attributes_set_in_init(estimator, preprocessor): - """Check setting during init. 
Adapted from scikit-learn.""" - estimator = clone(estimator) - estimator.set_params(preprocessor=preprocessor) - if hasattr(type(estimator).__init__, "deprecated_original"): - return - - init_params = _get_args(type(estimator).__init__) - parents_init_params = [param for params_parent in - (_get_args(parent) for parent in - type(estimator).__mro__) - for param in params_parent] - - # Test for no setting apart from parameters during init - invalid_attr = (set(vars(estimator)) - set(init_params) - - set(parents_init_params)) - assert not invalid_attr, \ - ("Estimator %s should not set any attribute apart" - " from parameters during init. Found attributes %s." - % (type(estimator).__name__, sorted(invalid_attr))) - # Ensure that each parameter is set in init - invalid_attr = (set(init_params) - set(vars(estimator)) - - set(["self"])) - assert not invalid_attr, \ - ("Estimator %s should store all parameters" - " as an attribute during init. Did not find " - "attributes %s." % (type(estimator).__name__, sorted(invalid_attr))) - - -@pytest.mark.parametrize('preprocessor', [None, build_data()[0]]) -@pytest.mark.parametrize('estimator, build_dataset', list_estimators, - ids=ids_estimators) -def test_estimators_fit_returns_self(estimator, build_dataset, preprocessor): - """Check if self is returned when calling fit""" - # Adapted from scikit-learn - (X, tuples, y, tuples_train, tuples_test, - y_train, y_test, preprocessor) = build_dataset(preprocessor) - estimator = clone(estimator) - estimator.set_params(preprocessor=preprocessor) - assert estimator.fit(tuples, y) is estimator - - -@pytest.mark.parametrize('preprocessor', [None, build_data()[0]]) -@pytest.mark.parametrize('estimator, build_dataset', list_estimators, - ids=ids_estimators) -def test_pipeline_consistency(estimator, build_dataset, preprocessor): - # Adapted from scikit learn - # check that make_pipeline(est) gives same score as est - (_, inputs, y, _, _, _, _, preprocessor) = build_dataset(preprocessor) - - 
def make_random_state(estimator, in_pipeline): - rs = {} - name_estimator = estimator.__class__.__name__ - if name_estimator[-11:] == '_Supervised': - name_param = 'random_state' - if in_pipeline: - name_param = name_estimator.lower() + '__' + name_param - rs[name_param] = check_random_state(0) - return rs - - estimator = clone(estimator) - estimator.set_params(preprocessor=preprocessor) - pipeline = make_pipeline(estimator) - estimator.fit(inputs, y, **make_random_state(estimator, False)) - pipeline.fit(inputs, y, **make_random_state(estimator, True)) - - if hasattr(estimator, 'score'): - result = estimator.score(inputs, y) - result_pipe = pipeline.score(inputs, y) - assert_allclose_dense_sparse(result, result_pipe) - - if hasattr(estimator, 'predict'): - result = estimator.predict(inputs) - result_pipe = pipeline.predict(inputs) - assert_allclose_dense_sparse(result, result_pipe) - - if issubclass(estimator.__class__, TransformerMixin): - if hasattr(estimator, 'transform'): - result = estimator.transform(inputs) - result_pipe = pipeline.transform(inputs) - assert_allclose_dense_sparse(result, result_pipe) - - -@pytest.mark.parametrize('preprocessor', [None, build_data()[0]]) -@pytest.mark.parametrize('estimator, build_dataset', list_estimators, - ids=ids_estimators) -def test_dict_unchanged(estimator, build_dataset, preprocessor): - # Adapted from scikit-learn - (X, tuples, y, tuples_train, tuples_test, - y_train, y_test, preprocessor) = build_dataset(preprocessor) - estimator = clone(estimator) - estimator.set_params(preprocessor=preprocessor) - if hasattr(estimator, "num_dims"): - estimator.num_dims = 1 - estimator.fit(tuples, y) - for method in ["predict", "decision_function", "predict_proba"]: - if hasattr(estimator, method): - dict_before = estimator.__dict__.copy() - getattr(estimator, method)(tuples) - assert estimator.__dict__ == dict_before, \ - ("Estimator changes __dict__ during %s" - % method) - for method in ["transform"]: - if hasattr(estimator, 
method): - dict_before = estimator.__dict__.copy() - # we transform only 2D arrays (dataset of points) - getattr(estimator, method)(X) - assert estimator.__dict__ == dict_before, \ - ("Estimator changes __dict__ during %s" - % method) - - -@pytest.mark.parametrize('estimator, build_dataset', - [(ITML(), build_pairs), - (LSML(), build_quadruplets), - (MMC(max_iter=2), build_pairs), - (SDML(), build_pairs)], - ids=['itml', 'lsml', 'mmc', 'sdml']) -def test_same_result_with_or_without_preprocessor(estimator, build_dataset): - """For weakly supervised algorithms, test that using a preprocessor or not - (with the appropriate corresponding inputs) give the same result. - """ - (X, tuples, y, tuples_train, tuples_test, y_train, - y_test, _) = build_dataset(preprocessor=mock_preprocessor) - formed_tuples_train = X[tuples_train] - formed_tuples_test = X[tuples_test] - - estimator_with_preprocessor = clone(estimator) - set_random_state(estimator_with_preprocessor) - estimator_with_preprocessor.set_params(preprocessor=X) - if estimator.__class__.__name__ == 'LSML': - estimator_with_preprocessor.fit(tuples_train) - else: - estimator_with_preprocessor.fit(tuples_train, y_train) - - estimator_without_preprocessor = clone(estimator) - set_random_state(estimator_without_preprocessor) - estimator_without_preprocessor.set_params(preprocessor=None) - if estimator.__class__.__name__ == 'LSML': - estimator_without_preprocessor.fit(formed_tuples_train) - else: - estimator_without_preprocessor.fit(formed_tuples_train, y_train) - - estimator_with_prep_formed = clone(estimator) - set_random_state(estimator_with_prep_formed) - estimator_with_prep_formed.set_params(preprocessor=X) - if estimator.__class__.__name__ == 'LSML': - estimator_with_prep_formed.fit(tuples_train) - else: - estimator_with_prep_formed.fit(tuples_train, y_train) - - # test prediction methods - for method in ["predict", "decision_function"]: - if hasattr(estimator, method): - output_with_prep = 
getattr(estimator_with_preprocessor, - method)(tuples_test) - output_without_prep = getattr(estimator_without_preprocessor, - method)(formed_tuples_test) - assert np.array(output_with_prep == output_without_prep).all() - output_with_prep = getattr(estimator_with_preprocessor, - method)(tuples_test) - output_with_prep_formed = getattr(estimator_with_prep_formed, - method)(formed_tuples_test) - assert np.array(output_with_prep == output_with_prep_formed).all() - - # test score_pairs - output_with_prep = estimator_with_preprocessor.score_pairs( - tuples_test[:, :2]) - output_without_prep = estimator_without_preprocessor.score_pairs( - formed_tuples_test[:, :2]) - assert np.array(output_with_prep == output_without_prep).all() - - output_with_prep = estimator_with_preprocessor.score_pairs( - tuples_test[:, :2]) - output_without_prep = estimator_with_prep_formed.score_pairs( - formed_tuples_test[:, :2]) - assert np.array(output_with_prep == output_without_prep).all() - - # test transform - output_with_prep = estimator_with_preprocessor.transform( - tuples_test[:, 0]) - output_without_prep = estimator_without_preprocessor.transform( - formed_tuples_test[:, 0]) - assert np.array(output_with_prep == output_without_prep).all() - - output_with_prep = estimator_with_preprocessor.transform( - tuples_test[:, 0]) - output_without_prep = estimator_with_prep_formed.transform( - formed_tuples_test[:, 0]) - assert np.array(output_with_prep == output_without_prep).all() - - -@pytest.mark.parametrize('preprocessor', [None, build_data()[0]]) -@pytest.mark.parametrize('estimator, build_dataset', list_estimators, - ids=ids_estimators) -def test_dont_overwrite_parameters(estimator, build_dataset, preprocessor): - # Adapted from scikit-learn - # check that fit method only changes or sets private attributes - (X, tuples, y, tuples_train, tuples_test, - y_train, y_test, preprocessor) = build_dataset(preprocessor) - estimator = clone(estimator) - estimator.set_params(preprocessor=preprocessor) 
- if hasattr(estimator, "num_dims"): - estimator.num_dims = 1 - dict_before_fit = estimator.__dict__.copy() - - estimator.fit(tuples, y) - dict_after_fit = estimator.__dict__ - - public_keys_after_fit = [key for key in dict_after_fit.keys() - if is_public_parameter(key)] - - attrs_added_by_fit = [key for key in public_keys_after_fit - if key not in dict_before_fit.keys()] - - # check that fit doesn't add any public attribute - assert not attrs_added_by_fit, \ - ("Estimator adds public attribute(s) during" - " the fit method." - " Estimators are only allowed to add private " - "attributes" - " either started with _ or ended" - " with _ but %s added" % ', '.join(attrs_added_by_fit)) - - # check that fit doesn't change any public attribute - attrs_changed_by_fit = [key for key in public_keys_after_fit - if (dict_before_fit[key] - is not dict_after_fit[key])] - - assert not attrs_changed_by_fit, \ - ("Estimator changes public attribute(s) during" - " the fit method. Estimators are only allowed" - " to change attributes started" - " or ended with _, but" - " %s changed" % ', '.join(attrs_changed_by_fit)) - - -def _get_args(function, varargs=False): - """Helper to get function arguments""" - - try: - params = signature(function).parameters - except ValueError: - # Error on builtin C function - return [] - args = [key for key, param in params.items() - if param.kind not in (param.VAR_POSITIONAL, param.VAR_KEYWORD)] - if varargs: - varargs = [param.name for param in params.values() - if param.kind == param.VAR_POSITIONAL] - if len(varargs) == 0: - varargs = None - return args, varargs - else: - return args From b29a555bbaca244fba9e1a6526afa5f5ddae06a8 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Thu, 15 Nov 2018 12:05:35 +0100 Subject: [PATCH 079/120] FIX: fix typo in LMNN shogun and clean todo for the equivalent code in python_LMNN --- metric_learn/lmnn.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/metric_learn/lmnn.py 
b/metric_learn/lmnn.py index 55f8fadf..4530d4ba 100644 --- a/metric_learn/lmnn.py +++ b/metric_learn/lmnn.py @@ -53,10 +53,8 @@ def __init__(self, k=3, min_iter=50, max_iter=1000, learn_rate=1e-7, class python_LMNN(_base_LMNN): def _process_inputs(self, X, labels): - self.X_, labels = self._prepare_inputs(X, labels, + self.X_, labels = self._prepare_inputs(X, labels, dtype=float, ensure_min_samples=2) - self.X_ = self.X_.astype(float) # todo: remove the conversion here and - # integrate it into check_input num_pts, num_dims = self.X_.shape unique_labels, self.label_inds_ = np.unique(labels, return_inverse=True) if len(self.label_inds_) != num_pts: @@ -260,7 +258,6 @@ class LMNN(_base_LMNN): def fit(self, X, y): self.X_, y = self._prepare_inputs(X, y, dtype=float) - self.X_ = self.X_, preprocessor=self.preprocessor_ labels = MulticlassLabels(y) self._lmnn = shogun_LMNN(RealFeatures(self.X_.T), labels, self.k) self._lmnn.set_maxiter(self.max_iter) From 29be5e23b53ae67bce1962b0a01d3bafd00bc6b6 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Mon, 19 Nov 2018 17:33:12 +0100 Subject: [PATCH 080/120] MAINT: Rename inputs and input into input_data --- metric_learn/_util.py | 87 +++++++++++++++++----------------- test/test_mahalanobis_mixin.py | 34 ++++++------- test/test_sklearn_compat.py | 18 +++---- 3 files changed, 70 insertions(+), 69 deletions(-) diff --git a/metric_learn/_util.py b/metric_learn/_util.py index 326719a8..d7c61ae1 100644 --- a/metric_learn/_util.py +++ b/metric_learn/_util.py @@ -14,7 +14,7 @@ def vector_norm(X): return np.linalg.norm(X, axis=1) -def check_input(input, y=None, preprocessor=None, +def check_input(input_data, y=None, preprocessor=None, type_of_inputs='classic', t=None, accept_sparse=False, dtype="numeric", order=None, copy=False, force_all_finite=True, @@ -112,83 +112,83 @@ def check_input(input, y=None, preprocessor=None, ensure_min_features=ensure_min_features, warn_on_dtype=warn_on_dtype, estimator=estimator) if y is None: - 
input = check_array(input, ensure_2d=False, allow_nd=True, - copy=False, force_all_finite=False, - accept_sparse=True, dtype=None, - ensure_min_features=0, ensure_min_samples=0) + input_data = check_array(input_data, ensure_2d=False, allow_nd=True, + copy=False, force_all_finite=False, + accept_sparse=True, dtype=None, + ensure_min_features=0, ensure_min_samples=0) else: - input, y = check_X_y(input, y, ensure_2d=False, allow_nd=True, - copy=False, force_all_finite=False, - accept_sparse=True, dtype=None, - ensure_min_features=0, ensure_min_samples=0, - multi_output=multi_output, - y_numeric=y_numeric) + input_data, y = check_X_y(input_data, y, ensure_2d=False, allow_nd=True, + copy=False, force_all_finite=False, + accept_sparse=True, dtype=None, + ensure_min_features=0, ensure_min_samples=0, + multi_output=multi_output, + y_numeric=y_numeric) # we try to allow the more possible stuff here preprocessor_has_been_applied = False if type_of_inputs == 'classic': - if input.ndim == 1: + if input_data.ndim == 1: if preprocessor is not None: - input = preprocess_points(input, preprocessor) + input_data = preprocess_points(input_data, preprocessor) preprocessor_has_been_applied = True else: - make_error_input(101, input, context) - elif input.ndim == 2: + make_error_input(101, input_data, context) + elif input_data.ndim == 2: pass # OK else: if preprocessor is not None: - make_error_input(320, input, context) + make_error_input(320, input_data, context) else: - make_error_input(100, input, context) + make_error_input(100, input_data, context) - input = check_array(input, allow_nd=True, ensure_2d=False, - **args_for_sk_checks) - if input.ndim != 2: # we have to ensure this because check_array above - # does not + input_data = check_array(input_data, allow_nd=True, ensure_2d=False, + **args_for_sk_checks) + if input_data.ndim != 2: # we have to ensure this because check_array above + # does not if preprocessor_has_been_applied: - make_error_input(111, input, context) + 
make_error_input(111, input_data, context) else: - make_error_input(101, input, context) + make_error_input(101, input_data, context) elif type_of_inputs == 'tuples': - if input.ndim == 2: + if input_data.ndim == 2: if preprocessor is not None: - input = preprocess_tuples(input, preprocessor) + input_data = preprocess_tuples(input_data, preprocessor) preprocessor_has_been_applied = True else: - make_error_input(201, input, context) - elif input.ndim == 3: # we should check_num_features which is not checked - # after + make_error_input(201, input_data, context) + elif input_data.ndim == 3: # we should check_num_features which is not checked + # after pass else: if preprocessor is not None: - make_error_input(420, input, context) + make_error_input(420, input_data, context) else: - make_error_input(200, input, context) + make_error_input(200, input_data, context) - input = check_array(input, allow_nd=True, ensure_2d=False, - **args_for_sk_checks) + input_data = check_array(input_data, allow_nd=True, ensure_2d=False, + **args_for_sk_checks) if ensure_min_features > 0: - n_features = input.shape[2] + n_features = input_data.shape[2] if n_features < ensure_min_features: raise ValueError("Found array with {} feature(s) (shape={}) while" " a minimum of {} is required{}." 
- .format(n_features, input.shape, + .format(n_features, input_data.shape, ensure_min_features, context)) # normally we don't need to check_t too because t should'nt be able to # be modified by any preprocessor - if input.ndim != 3: # we have to ensure this because check_array above - # does not + if input_data.ndim != 3: # we have to ensure this because check_array + # above does not if preprocessor_has_been_applied: - make_error_input(211, input, context) + make_error_input(211, input_data, context) else: - make_error_input(201, input, context) - check_t(input, t, context) + make_error_input(201, input_data, context) + check_t(input_data, t, context) - return input if y is None else (input, y) + return input_data if y is None else (input_data, y) -def make_error_input(code, input, context): +def make_error_input(code, input_data, context): code_str = {'expected_input': {'1': '2D array of formed points', '2': '3D array of formed tuples', '3': ('1D array of indicators or 2D array of ' @@ -208,10 +208,11 @@ def make_error_input(code, input, context): [code_list[1]], possible_preprocessor=code_str['possible_preprocessor'] [code_list[2]], - input=input, context=context, found_size=input.ndim) + input_data=input_data, context=context, + found_size=input_data.ndim) err_msg = ('{expected_input} expected' '{context}{additional_context}. Found {found_size}D array ' - 'instead:\ninput={input}. Reshape your data' + 'instead:\ninput={input_data}. Reshape your data' '{possible_preprocessor}.\n') raise ValueError(err_msg.format(**err_args)) diff --git a/test/test_mahalanobis_mixin.py b/test/test_mahalanobis_mixin.py index ee0c29e6..b8ebe0f8 100644 --- a/test/test_mahalanobis_mixin.py +++ b/test/test_mahalanobis_mixin.py @@ -76,13 +76,13 @@ def build_quadruplets(): ids=ids_estimators) def test_score_pairs_pairwise(estimator, build_dataset): # Computing pairwise scores should return a euclidean distance matrix. 
- inputs, labels = build_dataset() + input_data, labels = build_dataset() X, _ = load_iris(return_X_y=True) n_samples = 20 X = X[:n_samples] model = clone(estimator) set_random_state(model) - model.fit(inputs, labels) + model.fit(input_data, labels) pairwise = model.score_pairs(np.array(list(product(X, X))))\ .reshape(n_samples, n_samples) @@ -101,13 +101,13 @@ def test_score_pairs_pairwise(estimator, build_dataset): ids=ids_estimators) def test_score_pairs_toy_example(estimator, build_dataset): # Checks that score_pairs works on a toy example - inputs, labels = build_dataset() + input_data, labels = build_dataset() X, _ = load_iris(return_X_y=True) n_samples = 20 X = X[:n_samples] model = clone(estimator) set_random_state(model) - model.fit(inputs, labels) + model.fit(input_data, labels) pairs = np.stack([X[:10], X[10:20]], axis=1) embedded_pairs = pairs.dot(model.transformer_.T) distances = np.sqrt(np.sum((embedded_pairs[:, 1] - @@ -120,10 +120,10 @@ def test_score_pairs_toy_example(estimator, build_dataset): ids=ids_estimators) def test_score_pairs_finite(estimator, build_dataset): # tests that the score is finite - inputs, labels = build_dataset() + input_data, labels = build_dataset() model = clone(estimator) set_random_state(model) - model.fit(inputs, labels) + model.fit(input_data, labels) X, _ = load_iris(return_X_y=True) pairs = np.array(list(product(X, X))) assert np.isfinite(model.score_pairs(pairs)).all() @@ -135,10 +135,10 @@ def test_score_pairs_dim(estimator, build_dataset): # scoring of 3D arrays should return 1D array (several tuples), # and scoring of 2D arrays (one tuple) should return an error (like # scikit-learn's error when scoring 1D arrays) - inputs, labels = build_dataset() + input_data, labels = build_dataset() model = clone(estimator) set_random_state(model) - model.fit(inputs, labels) + model.fit(input_data, labels) X, _ = load_iris(return_X_y=True) tuples = np.array(list(product(X, X))) assert model.score_pairs(tuples).shape == 
(tuples.shape[0],) @@ -165,13 +165,13 @@ def check_is_distance_matrix(pairwise): ids=ids_estimators) def test_embed_toy_example(estimator, build_dataset): # Checks that embed works on a toy example - inputs, labels = build_dataset() + input_data, labels = build_dataset() X, _ = load_iris(return_X_y=True) n_samples = 20 X = X[:n_samples] model = clone(estimator) set_random_state(model) - model.fit(inputs, labels) + model.fit(input_data, labels) embedded_points = X.dot(model.transformer_.T) assert_array_almost_equal(model.transform(X), embedded_points) @@ -180,10 +180,10 @@ def test_embed_toy_example(estimator, build_dataset): ids=ids_estimators) def test_embed_dim(estimator, build_dataset): # Checks that the the dimension of the output space is as expected - inputs, labels = build_dataset() + input_data, labels = build_dataset() model = clone(estimator) set_random_state(model) - model.fit(inputs, labels) + model.fit(input_data, labels) X, _ = load_iris(return_X_y=True) assert model.transform(X).shape == X.shape @@ -198,7 +198,7 @@ def test_embed_dim(estimator, build_dataset): # we test that the shape is also OK when doing dimensionality reduction if type(model).__name__ in {'LFDA', 'MLKR', 'NCA', 'RCA'}: model.set_params(num_dims=2) - model.fit(inputs, labels) + model.fit(input_data, labels) assert model.transform(X).shape == (X.shape[0], 2) # assert that ValueError is thrown if input shape is 1D with pytest.raises(ValueError) as raised_error: @@ -210,10 +210,10 @@ def test_embed_dim(estimator, build_dataset): ids=ids_estimators) def test_embed_finite(estimator, build_dataset): # Checks that embed returns vectors with finite values - inputs, labels = build_dataset() + input_data, labels = build_dataset() model = clone(estimator) set_random_state(model) - model.fit(inputs, labels) + model.fit(input_data, labels) X, _ = load_iris(return_X_y=True) assert np.isfinite(model.transform(X)).all() @@ -222,10 +222,10 @@ def test_embed_finite(estimator, build_dataset): 
ids=ids_estimators) def test_embed_is_linear(estimator, build_dataset): # Checks that the embedding is linear - inputs, labels = build_dataset() + input_data, labels = build_dataset() model = clone(estimator) set_random_state(model) - model.fit(inputs, labels) + model.fit(input_data, labels) X, _ = load_iris(return_X_y=True) assert_array_almost_equal(model.transform(X[:10] + X[10:20]), model.transform(X[:10]) + diff --git a/test/test_sklearn_compat.py b/test/test_sklearn_compat.py index 946db78c..da6a30c8 100644 --- a/test/test_sklearn_compat.py +++ b/test/test_sklearn_compat.py @@ -290,7 +290,7 @@ def test_estimators_fit_returns_self(estimator, build_dataset, preprocessor): def test_pipeline_consistency(estimator, build_dataset, preprocessor): # Adapted from scikit learn # check that make_pipeline(est) gives same score as est - (_, inputs, y, _, _, _, _, preprocessor) = build_dataset(preprocessor) + (_, input_data, y, _, _, _, _, preprocessor) = build_dataset(preprocessor) def make_random_state(estimator, in_pipeline): rs = {} @@ -305,23 +305,23 @@ def make_random_state(estimator, in_pipeline): estimator = clone(estimator) estimator.set_params(preprocessor=preprocessor) pipeline = make_pipeline(estimator) - estimator.fit(inputs, y, **make_random_state(estimator, False)) - pipeline.fit(inputs, y, **make_random_state(estimator, True)) + estimator.fit(input_data, y, **make_random_state(estimator, False)) + pipeline.fit(input_data, y, **make_random_state(estimator, True)) if hasattr(estimator, 'score'): - result = estimator.score(inputs, y) - result_pipe = pipeline.score(inputs, y) + result = estimator.score(input_data, y) + result_pipe = pipeline.score(input_data, y) assert_allclose_dense_sparse(result, result_pipe) if hasattr(estimator, 'predict'): - result = estimator.predict(inputs) - result_pipe = pipeline.predict(inputs) + result = estimator.predict(input_data) + result_pipe = pipeline.predict(input_data) assert_allclose_dense_sparse(result, result_pipe) if 
issubclass(estimator.__class__, TransformerMixin): if hasattr(estimator, 'transform'): - result = estimator.transform(inputs) - result_pipe = pipeline.transform(inputs) + result = estimator.transform(input_data) + result_pipe = pipeline.transform(input_data) assert_allclose_dense_sparse(result, result_pipe) From 9d849f7feb3fece3564e4ff3b45bb979ce551c4a Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 20 Nov 2018 09:08:08 +0100 Subject: [PATCH 081/120] STY: add backticks to None --- metric_learn/_util.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/metric_learn/_util.py b/metric_learn/_util.py index d7c61ae1..a50f99a9 100644 --- a/metric_learn/_util.py +++ b/metric_learn/_util.py @@ -36,7 +36,7 @@ def check_input(input_data, y=None, preprocessor=None, y : array-like The input labels array to check. - preprocessor : callable (default=None) + preprocessor : callable (default=`None`) The preprocessor to use. If None, no preprocessor is used. type_of_inputs : `str` {'classic', 'tuples'} @@ -60,7 +60,7 @@ def check_input(input_data, y=None, preprocessor=None, "auto", will we be set to "numeric" if `preprocessor=True`, else to None. - order : 'F', 'C' or None (default=None) + order : 'F', 'C' or None (default=`None`) Whether an array will be forced to be fortran or c-style. copy : boolean (default=False) @@ -91,7 +91,7 @@ def check_input(input_data, y=None, preprocessor=None, Raise DataConversionWarning if the dtype of the input data structure does not match the requested dtype, causing a memory copy. - estimator : str or estimator instance (default=None) + estimator : str or estimator instance (default=`None`) If passed, include the name of the estimator in warning messages. 
Returns From d9ba29e49ee37915db7fc3630cd9888e0aa2e59b Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 20 Nov 2018 09:15:23 +0100 Subject: [PATCH 082/120] MAINT: add more detailed comment of first checks and remove old comment --- metric_learn/_util.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/metric_learn/_util.py b/metric_learn/_util.py index a50f99a9..60dd2f1a 100644 --- a/metric_learn/_util.py +++ b/metric_learn/_util.py @@ -111,6 +111,10 @@ def check_input(input_data, y=None, preprocessor=None, ensure_min_samples=ensure_min_samples, ensure_min_features=ensure_min_features, warn_on_dtype=warn_on_dtype, estimator=estimator) + + # We need to convert input_data into a numpy.ndarray if possible, before + # any further checks or conversions, and deal with y if needed. Therefore + # we use check_array/check_X_y with fixed permissive arguments. if y is None: input_data = check_array(input_data, ensure_2d=False, allow_nd=True, copy=False, force_all_finite=False, @@ -123,7 +127,6 @@ def check_input(input_data, y=None, preprocessor=None, ensure_min_features=0, ensure_min_samples=0, multi_output=multi_output, y_numeric=y_numeric) - # we try to allow the more possible stuff here preprocessor_has_been_applied = False if type_of_inputs == 'classic': From c1dcc1fe3a3605317f9c67e1fd98b2f189897d3e Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 20 Nov 2018 09:30:48 +0100 Subject: [PATCH 083/120] MAINT: improve comments for checking num_features --- metric_learn/_util.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/metric_learn/_util.py b/metric_learn/_util.py index 60dd2f1a..0d12968e 100644 --- a/metric_learn/_util.py +++ b/metric_learn/_util.py @@ -146,8 +146,8 @@ def check_input(input_data, y=None, preprocessor=None, input_data = check_array(input_data, allow_nd=True, ensure_2d=False, **args_for_sk_checks) - if input_data.ndim != 2: # we have to ensure this because check_array above - # does 
not + if input_data.ndim != 2: # we have to ensure this because check_array + # above does not if preprocessor_has_been_applied: make_error_input(111, input_data, context) else: @@ -160,8 +160,7 @@ def check_input(input_data, y=None, preprocessor=None, preprocessor_has_been_applied = True else: make_error_input(201, input_data, context) - elif input_data.ndim == 3: # we should check_num_features which is not checked - # after + elif input_data.ndim == 3: pass else: if preprocessor is not None: @@ -171,6 +170,8 @@ def check_input(input_data, y=None, preprocessor=None, input_data = check_array(input_data, allow_nd=True, ensure_2d=False, **args_for_sk_checks) + # we need to check num_features because check_array does not check it + # for 3D inputs: if ensure_min_features > 0: n_features = input_data.shape[2] if n_features < ensure_min_features: From 6ccd25de9245ba5b050ca93263c1e30074716a06 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 20 Nov 2018 10:06:49 +0100 Subject: [PATCH 084/120] MAINT: Refactor t into tuple_size --- metric_learn/_util.py | 6 +++--- metric_learn/base_metric.py | 12 ++++++------ test/test_utils.py | 6 +++--- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/metric_learn/_util.py b/metric_learn/_util.py index 0d12968e..3b80d590 100644 --- a/metric_learn/_util.py +++ b/metric_learn/_util.py @@ -15,7 +15,7 @@ def vector_norm(X): def check_input(input_data, y=None, preprocessor=None, - type_of_inputs='classic', t=None, accept_sparse=False, + type_of_inputs='classic', tuple_size=None, accept_sparse=False, dtype="numeric", order=None, copy=False, force_all_finite=True, multi_output=False, ensure_min_samples=1, @@ -49,7 +49,7 @@ def check_input(input_data, y=None, preprocessor=None, Set to true to allow sparse inputs (only works for sparse inputs with dim < 3). - t : int + tuple_size : int The number of elements in a tuple (e.g. 2 for pairs). 
dtype : string, type, list of types or None (default="auto") @@ -187,7 +187,7 @@ def check_input(input_data, y=None, preprocessor=None, make_error_input(211, input_data, context) else: make_error_input(201, input_data, context) - check_t(input_data, t, context) + check_t(input_data, tuple_size, context) return input_data if y is None else (input_data, y) diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index 227cf6e2..79da37f3 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -79,7 +79,7 @@ def _prepare_inputs(self, X, y=None, type_of_inputs='classic', type_of_inputs=type_of_inputs, preprocessor=self.preprocessor_, estimator=self, - t=self._t if hasattr(self, '_t') else None, + tuple_size=self._t if hasattr(self, '_t') else None, **kwargs) @@ -146,7 +146,7 @@ def score_pairs(self, pairs): """ pairs = check_input(pairs, type_of_inputs='tuples', preprocessor=self.preprocessor_, - estimator=self, t=2) + estimator=self, tuple_size=2) pairwise_diffs = self.transform(pairs[:, 1, :] - pairs[:, 0, :]) # (for MahalanobisMixin, the embedding is linear so we can just embed the # difference) @@ -206,7 +206,7 @@ def transformer_from_metric(self, metric): class _PairsClassifierMixin(BaseMetricLearner): - _t = 2 # number of points in a tuple, 2 for pairs + _tuple_size = 2 # number of points in a tuple, 2 for pairs def predict(self, pairs): """Predicts the learned metric between input pairs. 
@@ -228,7 +228,7 @@ def predict(self, pairs): """ pairs = check_input(pairs, type_of_inputs='tuples', preprocessor=self.preprocessor_, - estimator=self, t=self._t) + estimator=self, tuple_size=self._tuple_size) return self.score_pairs(pairs) def decision_function(self, pairs): @@ -264,7 +264,7 @@ def score(self, pairs, y): class _QuadrupletsClassifierMixin(BaseMetricLearner): - _t = 4 # number of points in a tuple, 4 for quadruplets + _tuple_size = 4 # number of points in a tuple, 4 for quadruplets def predict(self, quadruplets): """Predicts the ordering between sample distances in input quadruplets. @@ -287,7 +287,7 @@ def predict(self, quadruplets): """ quadruplets = check_input(quadruplets, type_of_inputs='tuples', preprocessor=self.preprocessor_, - estimator=self, t=self._t) + estimator=self, tuple_size=self._tuple_size) # we broadcast with ... because here we allow quadruplets to be # either a 3D array of points or 2D array of indices return np.sign(self.decision_function(quadruplets)) diff --git a/test/test_utils.py b/test/test_utils.py index 6fade133..f1f1cc86 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -70,7 +70,7 @@ def test_check_tuples_invalid_t(estimator, context, load_tuples, preprocessor): .format(context, preprocessed_tuples.shape, preprocessed_tuples)) with pytest.raises(ValueError) as raised_error: - check_input(tuples, type_of_inputs='tuples', t=3, + check_input(tuples, type_of_inputs='tuples', tuple_size=3, preprocessor=preprocessor, estimator=estimator) assert str(raised_error.value) == expected_msg @@ -210,9 +210,9 @@ def test_check_tuples_valid_t(t, tuples_prep, tuples_no_prep): """ with pytest.warns(None) as record: check_input(tuples_prep, type_of_inputs='tuples', - preprocessor=mock_preprocessor, t=t) + preprocessor=mock_preprocessor, tuple_size=t) check_input(tuples_no_prep, type_of_inputs='tuples', preprocessor=None, - t=t) + tuple_size=t) assert len(record) == 0 From 2d96edae317c8c9439ff022b90bd8cd6a2d559a5 Mon Sep 17 
00:00:00 2001 From: William de Vazelhes Date: Tue, 20 Nov 2018 10:18:19 +0100 Subject: [PATCH 085/120] MAINT: Fix small PEP8 error --- metric_learn/_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metric_learn/_util.py b/metric_learn/_util.py index 3b80d590..904c7b19 100644 --- a/metric_learn/_util.py +++ b/metric_learn/_util.py @@ -224,7 +224,7 @@ def make_error_input(code, input_data, context): def preprocess_tuples(tuples, preprocessor): print("Preprocessing tuples...") tuples = np.column_stack([preprocessor(tuples[:, i])[:, np.newaxis] for - i in range(tuples.shape[1])]) + i in range(tuples.shape[1])]) return tuples From 4b50ff1e6240a7b66254e65a7129ffbbacc763b1 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 20 Nov 2018 10:46:53 +0100 Subject: [PATCH 086/120] MAINT: FIX remaining t into tuple_size and replace hasattr if None by getattr with default None --- metric_learn/_util.py | 17 +++++++++-------- metric_learn/base_metric.py | 2 +- metric_learn/itml.py | 2 +- metric_learn/lsml.py | 2 +- metric_learn/mmc.py | 2 +- metric_learn/sdml.py | 2 +- test/test_utils.py | 18 ++++++++++-------- 7 files changed, 24 insertions(+), 21 deletions(-) diff --git a/metric_learn/_util.py b/metric_learn/_util.py index 904c7b19..f47560bc 100644 --- a/metric_learn/_util.py +++ b/metric_learn/_util.py @@ -23,7 +23,7 @@ def check_input(input_data, y=None, preprocessor=None, warn_on_dtype=False, estimator=None): """Checks that the input format is valid, and converts it if specified (this is the equivalent of scikit-learn's `check_array` or `check_X_y`). - All arguments following t are scikit-learn's `check_X_y` + All arguments following tuple_size are scikit-learn's `check_X_y` arguments that will be enforced on the data and labels array. If indicators are given as an input data array, the returned data array will be the formed points/tuples, using the given preprocessor. 
@@ -179,15 +179,15 @@ def check_input(input_data, y=None, preprocessor=None, " a minimum of {} is required{}." .format(n_features, input_data.shape, ensure_min_features, context)) - # normally we don't need to check_t too because t should'nt be able to - # be modified by any preprocessor + # normally we don't need to check_tuple_size too because tuple_size + # should'nt be able to be modified by any preprocessor if input_data.ndim != 3: # we have to ensure this because check_array # above does not if preprocessor_has_been_applied: make_error_input(211, input_data, context) else: make_error_input(201, input_data, context) - check_t(input_data, tuple_size, context) + check_tuple_size(input_data, tuple_size, context) return input_data if y is None else (input_data, y) @@ -263,13 +263,14 @@ def make_name(estimator): return estimator_name -def check_t(tuples, t, context): +def check_tuple_size(tuples, tuple_size, context): """Helper function to check that the number of points in each tuple is - equal to t (e.g. 2 for pairs), and raise a `ValueError` otherwise""" - if t is not None and tuples.shape[1] != t: + equal to tuple_size (e.g. 2 for pairs), and raise a `ValueError` otherwise""" + if tuple_size is not None and tuples.shape[1] != tuple_size: msg_t = (("Tuples of {} element(s) expected{}. 
Got tuples of {} " "element(s) instead (shape={}):\ninput={}.\n") - .format(t, context, tuples.shape[1], tuples.shape, tuples)) + .format(tuple_size, context, tuples.shape[1], tuples.shape, + tuples)) raise ValueError(msg_t) diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index 79da37f3..049e3b6a 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -79,7 +79,7 @@ def _prepare_inputs(self, X, y=None, type_of_inputs='classic', type_of_inputs=type_of_inputs, preprocessor=self.preprocessor_, estimator=self, - tuple_size=self._t if hasattr(self, '_t') else None, + tuple_size=getattr(self, '_tuple_size', None), **kwargs) diff --git a/metric_learn/itml.py b/metric_learn/itml.py index 990b3c78..c450ee71 100644 --- a/metric_learn/itml.py +++ b/metric_learn/itml.py @@ -27,7 +27,7 @@ class _BaseITML(MahalanobisMixin): """Information Theoretic Metric Learning (ITML)""" - _t = 2 # constraints are pairs + _tuple_size = 2 # constraints are pairs def __init__(self, gamma=1., max_iter=1000, convergence_threshold=1e-3, A0=None, verbose=False, preprocessor=None): diff --git a/metric_learn/lsml.py b/metric_learn/lsml.py index 65b7b2a8..11147965 100644 --- a/metric_learn/lsml.py +++ b/metric_learn/lsml.py @@ -19,7 +19,7 @@ class _BaseLSML(MahalanobisMixin): - _t = 4 # constraints are quadruplets + _tuple_size = 4 # constraints are quadruplets def __init__(self, tol=1e-3, max_iter=1000, prior=None, verbose=False, preprocessor=None): diff --git a/metric_learn/mmc.py b/metric_learn/mmc.py index 5ee11fdb..937baaad 100644 --- a/metric_learn/mmc.py +++ b/metric_learn/mmc.py @@ -30,7 +30,7 @@ class _BaseMMC(MahalanobisMixin): """Mahalanobis Metric for Clustering (MMC)""" - _t = 2 # constraints are pairs + _tuple_size = 2 # constraints are pairs def __init__(self, max_iter=100, max_proj=10000, convergence_threshold=1e-3, A0=None, diagonal=False, diagonal_c=1.0, verbose=False, diff --git a/metric_learn/sdml.py b/metric_learn/sdml.py index 
651b3aa4..d55f01fe 100644 --- a/metric_learn/sdml.py +++ b/metric_learn/sdml.py @@ -20,7 +20,7 @@ class _BaseSDML(MahalanobisMixin): - _t = 2 # constraints are pairs + _tuple_size = 2 # constraints are pairs def __init__(self, balance_param=0.5, sparsity_param=0.01, use_cov=True, verbose=False, preprocessor=None): diff --git a/test/test_utils.py b/test/test_utils.py index f1f1cc86..96c96e0d 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -59,8 +59,10 @@ def test_make_name(estimator, expected): [(tuples_prep, mock_preprocessor), (tuples_no_prep, None), (tuples_no_prep, mock_preprocessor)]) -def test_check_tuples_invalid_t(estimator, context, load_tuples, preprocessor): - """Checks that the exception are raised if t is not the one expected""" +def test_check_tuples_invalid_tuple_size(estimator, context, load_tuples, + preprocessor): + """Checks that the exception are raised if tuple_size is not the one + expected""" tuples = load_tuples() preprocessed_tuples = (preprocess_tuples(tuples, preprocessor) if (preprocessor is not None and @@ -202,17 +204,17 @@ def test_check_tuples_invalid_dtype_not_convertible_without_preprocessor( preprocessor=None, dtype=np.float64) -@pytest.mark.parametrize('t', [2, None]) -def test_check_tuples_valid_t(t, tuples_prep, tuples_no_prep): +@pytest.mark.parametrize('tuple_size', [2, None]) +def test_check_tuples_valid_tuple_size(tuple_size, tuples_prep, tuples_no_prep): """For inputs that have the right matrix dimension (2D or 3D for instance), checks that checking the number of tuples (pairs, quadruplets, etc) raises no warning if there is the right number of points in a tuple. 
""" with pytest.warns(None) as record: check_input(tuples_prep, type_of_inputs='tuples', - preprocessor=mock_preprocessor, tuple_size=t) + preprocessor=mock_preprocessor, tuple_size=tuple_size) check_input(tuples_no_prep, type_of_inputs='tuples', preprocessor=None, - tuple_size=t) + tuple_size=tuple_size) assert len(record) == 0 @@ -661,7 +663,7 @@ def fun(row): @pytest.mark.parametrize('estimator', [ITML(), LSML(), MMC(), SDML()], ids=['ITML', 'LSML', 'MMC', 'SDML']) -def test_error_message_t(estimator): +def test_error_message_tuple_size(estimator): """Tests that if a tuples learner is not given the good number of points per tuple, it throws an error message""" estimator = clone(estimator) @@ -673,7 +675,7 @@ def test_error_message_t(estimator): estimator.fit(invalid_pairs, y) expected_msg = ("Tuples of {} element(s) expected{}. Got tuples of 3 " "element(s) instead (shape=(2, 3, 2)):\ninput={}.\n" - .format(estimator._t, make_context(estimator), + .format(estimator._tuple_size, make_context(estimator), invalid_pairs)) assert str(raised_err.value) == expected_msg From f69f135f095bf943f131edb0b9b6e64b57932d20 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 20 Nov 2018 10:49:52 +0100 Subject: [PATCH 087/120] MAINT: remove misplaced comment --- metric_learn/base_metric.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index 049e3b6a..808511b8 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -288,8 +288,6 @@ def predict(self, quadruplets): quadruplets = check_input(quadruplets, type_of_inputs='tuples', preprocessor=self.preprocessor_, estimator=self, tuple_size=self._tuple_size) - # we broadcast with ... 
because here we allow quadruplets to be - # either a 3D array of points or 2D array of indices return np.sign(self.decision_function(quadruplets)) def decision_function(self, quadruplets): From 192d208adb368e3a35e139067fd5176023c47b97 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 20 Nov 2018 11:14:12 +0100 Subject: [PATCH 088/120] MAINT: Put back/add docstrings for decision_function/predict --- metric_learn/base_metric.py | 41 +++++++++++++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 4 deletions(-) diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index 808511b8..50f88c59 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -209,7 +209,28 @@ class _PairsClassifierMixin(BaseMetricLearner): _tuple_size = 2 # number of points in a tuple, 2 for pairs def predict(self, pairs): - """Predicts the learned metric between input pairs. + """Predicts the learned metric between input pairs. (For now it just + calls decision function). + + Returns the learned metric value between samples in every pair. It should + ideally be low for similar samples and high for dissimilar samples. + + Parameters + ---------- + pairs : array-like, shape=(n_pairs, 2, n_features) or (n_pairs, 2) + 3D Array of pairs to predict, with each row corresponding to two + points, or 2D array of indices of pairs if the metric learner uses a + preprocessor. + + Returns + ------- + y_predicted : `numpy.ndarray` of floats, shape=(n_constraints,) + The predicted learned metric value between samples in every pair. + """ + return self.decision_function(pairs) + + def decision_function(self, pairs): + """Returns the learned metric between input pairs. Returns the learned metric value between samples in every pair. It should ideally be low for similar samples and high for dissimilar samples. 
@@ -231,9 +252,6 @@ def predict(self, pairs): estimator=self, tuple_size=self._tuple_size) return self.score_pairs(pairs) - def decision_function(self, pairs): - return self.predict(pairs) - def score(self, pairs, y): """Computes score of pairs similarity prediction. @@ -291,6 +309,21 @@ def predict(self, quadruplets): return np.sign(self.decision_function(quadruplets)) def decision_function(self, quadruplets): + """Predicts differences between sample distances in input quadruplets. + + For each quadruplet of samples, computes the difference between the learned + metric of the first pair minus the learned metric of the second pair. + + Parameters + ---------- + quadruplets : array-like, shape=(n_constraints, 4, n_features) + Input quadruplets. + + Returns + ------- + decision_function : `numpy.ndarray` of floats, shape=(n_constraints,) + Metric differences. + """ # we broadcast with ... because here we allow quadruplets to be # either a 3D array of points or 2D array of indices return (self.score_pairs(quadruplets[:, :2, ...]) - From 2c09d9a9dbbd97020702f30c7f54698d17e119cf Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 20 Nov 2018 11:19:56 +0100 Subject: [PATCH 089/120] MAINT: remove unnecessary ellipsis and upadate docstring of decision_function --- metric_learn/base_metric.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index 50f88c59..a4214bd8 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -316,18 +316,19 @@ def decision_function(self, quadruplets): Parameters ---------- - quadruplets : array-like, shape=(n_constraints, 4, n_features) - Input quadruplets. + quadruplets : array-like, shape=(n_quadruplets, 4, n_features) or + (n_quadruplets, 4) + 3D Array of quadruplets to predict, with each row corresponding to four + points, or 2D array of indices of quadruplets if the metric learner + uses a preprocessor. 
Returns ------- decision_function : `numpy.ndarray` of floats, shape=(n_constraints,) Metric differences. """ - # we broadcast with ... because here we allow quadruplets to be - # either a 3D array of points or 2D array of indices - return (self.score_pairs(quadruplets[:, :2, ...]) - - self.score_pairs(quadruplets[:, 2:, ...])) + return (self.score_pairs(quadruplets[:, :2]) - + self.score_pairs(quadruplets[:, 2:])) def score(self, quadruplets, y=None): """Computes score on input quadruplets From 1b7e55f2627c11e959fa2e6c83d6289820d2b233 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 20 Nov 2018 12:12:19 +0100 Subject: [PATCH 090/120] Add comments in LMNN for arguments useful for the shogun version that are not used in the python version --- metric_learn/lmnn.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py index 4530d4ba..5a465a37 100644 --- a/metric_learn/lmnn.py +++ b/metric_learn/lmnn.py @@ -52,7 +52,10 @@ def __init__(self, k=3, min_iter=50, max_iter=1000, learn_rate=1e-7, # slower Python version class python_LMNN(_base_LMNN): - def _process_inputs(self, X, labels): + def _check_inputs_params_compatibility(self, X, labels): + """Process inputs and raise appropriate error messages if input + parameters are not those expected with respect to inputs. 
+ """ self.X_, labels = self._prepare_inputs(X, labels, dtype=float, ensure_min_samples=2) num_pts, num_dims = self.X_.shape @@ -72,7 +75,7 @@ def fit(self, X, y): k = self.k reg = self.regularization learn_rate = self.learn_rate - self._process_inputs(X, y) + self._check_inputs_params_compatibility(X, y) target_neighbors = self._select_targets() impostors = self._find_impostors(target_neighbors[:,-1]) From 8b600bee225c7c962425ce832352fb3c28c01be6 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 20 Nov 2018 17:23:09 +0100 Subject: [PATCH 091/120] MAINT: Remove useless mock_preprocessor --- test/test_sklearn_compat.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/test/test_sklearn_compat.py b/test/test_sklearn_compat.py index da6a30c8..6aa61e59 100644 --- a/test/test_sklearn_compat.py +++ b/test/test_sklearn_compat.py @@ -84,12 +84,6 @@ def test_mmc(self): RNG = check_random_state(0) -def mock_preprocessor(indices): - """A preprocessor for testing purposes that returns an all ones 3D array - """ - return np.ones((indices.shape[0], 3)) - - # ---------------------- Test scikit-learn compatibility ---------------------- From e4468d047b159c2a192264c5e2cb8589e2b2ddfa Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 20 Nov 2018 17:29:01 +0100 Subject: [PATCH 092/120] MAINT: Remove useless loop --- test/test_sklearn_compat.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/test/test_sklearn_compat.py b/test/test_sklearn_compat.py index 6aa61e59..294b79ea 100644 --- a/test/test_sklearn_compat.py +++ b/test/test_sklearn_compat.py @@ -338,14 +338,13 @@ def test_dict_unchanged(estimator, build_dataset, preprocessor): assert estimator.__dict__ == dict_before, \ ("Estimator changes __dict__ during %s" % method) - for method in ["transform"]: - if hasattr(estimator, method): - dict_before = estimator.__dict__.copy() - # we transform only 2D arrays (dataset of points) - getattr(estimator, method)(X) - assert 
estimator.__dict__ == dict_before, \ - ("Estimator changes __dict__ during %s" - % method) + if hasattr(estimator, "transform"): + dict_before = estimator.__dict__.copy() + # we transform only 2D arrays (dataset of points) + estimator.transform(X) + assert estimator.__dict__ == dict_before, \ + ("Estimator changes __dict__ during %s" + % method) @pytest.mark.parametrize('preprocessor', [None, build_data()[0]]) From a1c95fa8cb57cc6286ab760bd82ffbfd438c45dc Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 20 Nov 2018 17:46:25 +0100 Subject: [PATCH 093/120] MAINT: refactor test_dict_unchanged --- test/test_sklearn_compat.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/test/test_sklearn_compat.py b/test/test_sklearn_compat.py index 294b79ea..d9bfe235 100644 --- a/test/test_sklearn_compat.py +++ b/test/test_sklearn_compat.py @@ -331,20 +331,20 @@ def test_dict_unchanged(estimator, build_dataset, preprocessor): if hasattr(estimator, "num_dims"): estimator.num_dims = 1 estimator.fit(tuples, y) + + def check_dict(): + assert estimator.__dict__ == dict_before, ( + "Estimator changes __dict__ during %s" % method) for method in ["predict", "decision_function", "predict_proba"]: if hasattr(estimator, method): dict_before = estimator.__dict__.copy() getattr(estimator, method)(tuples) - assert estimator.__dict__ == dict_before, \ - ("Estimator changes __dict__ during %s" - % method) + check_dict() if hasattr(estimator, "transform"): dict_before = estimator.__dict__.copy() # we transform only 2D arrays (dataset of points) estimator.transform(X) - assert estimator.__dict__ == dict_before, \ - ("Estimator changes __dict__ during %s" - % method) + check_dict() @pytest.mark.parametrize('preprocessor', [None, build_data()[0]]) From d95b22a3d4500959285dbc564fc379257a2b1675 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 20 Nov 2018 18:00:34 +0100 Subject: [PATCH 094/120] MAINT: remove _get_args copied from scikit-learn and 
replace it by an import --- test/test_sklearn_compat.py | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/test/test_sklearn_compat.py b/test/test_sklearn_compat.py index d9bfe235..eb542567 100644 --- a/test/test_sklearn_compat.py +++ b/test/test_sklearn_compat.py @@ -18,6 +18,7 @@ import numpy as np from sklearn.model_selection import (cross_val_score, cross_val_predict, train_test_split) +from sklearn.utils.testing import _get_args # Wrap the _Supervised methods with a deterministic wrapper for testing. @@ -392,25 +393,5 @@ def test_dont_overwrite_parameters(estimator, build_dataset, preprocessor): " %s changed" % ', '.join(attrs_changed_by_fit)) -def _get_args(function, varargs=False): - """Helper to get function arguments""" - - try: - params = signature(function).parameters - except ValueError: - # Error on builtin C function - return [] - args = [key for key, param in params.items() - if param.kind not in (param.VAR_POSITIONAL, param.VAR_KEYWORD)] - if varargs: - varargs = [param.name for param in params.values() - if param.kind == param.VAR_POSITIONAL] - if len(varargs) == 0: - varargs = None - return args, varargs - else: - return args - - if __name__ == '__main__': unittest.main() From bc06f8fba2a5cdbd61d7a3999115b2e32cd0c2f9 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Wed, 21 Nov 2018 11:30:39 +0100 Subject: [PATCH 095/120] MAINT: Fragment check_input by extracting blocks into check_input_classic and check_input_tuples --- metric_learn/_util.py | 128 +++++++++++++++++++++++------------------- 1 file changed, 71 insertions(+), 57 deletions(-) diff --git a/metric_learn/_util.py b/metric_learn/_util.py index f47560bc..1b79ddd1 100644 --- a/metric_learn/_util.py +++ b/metric_learn/_util.py @@ -127,71 +127,85 @@ def check_input(input_data, y=None, preprocessor=None, ensure_min_features=0, ensure_min_samples=0, multi_output=multi_output, y_numeric=y_numeric) - preprocessor_has_been_applied = False if 
type_of_inputs == 'classic': - if input_data.ndim == 1: - if preprocessor is not None: - input_data = preprocess_points(input_data, preprocessor) - preprocessor_has_been_applied = True - else: - make_error_input(101, input_data, context) - elif input_data.ndim == 2: - pass # OK - else: - if preprocessor is not None: - make_error_input(320, input_data, context) - else: - make_error_input(100, input_data, context) - - input_data = check_array(input_data, allow_nd=True, ensure_2d=False, - **args_for_sk_checks) - if input_data.ndim != 2: # we have to ensure this because check_array - # above does not - if preprocessor_has_been_applied: - make_error_input(111, input_data, context) - else: - make_error_input(101, input_data, context) + input_data = check_input_classic(input_data, context, preprocessor, + args_for_sk_checks) elif type_of_inputs == 'tuples': - if input_data.ndim == 2: - if preprocessor is not None: - input_data = preprocess_tuples(input_data, preprocessor) - preprocessor_has_been_applied = True - else: - make_error_input(201, input_data, context) - elif input_data.ndim == 3: - pass - else: - if preprocessor is not None: - make_error_input(420, input_data, context) - else: - make_error_input(200, input_data, context) - - input_data = check_array(input_data, allow_nd=True, ensure_2d=False, - **args_for_sk_checks) - # we need to check num_features because check_array does not check it - # for 3D inputs: - if ensure_min_features > 0: - n_features = input_data.shape[2] - if n_features < ensure_min_features: - raise ValueError("Found array with {} feature(s) (shape={}) while" - " a minimum of {} is required{}." 
- .format(n_features, input_data.shape, - ensure_min_features, context)) - # normally we don't need to check_tuple_size too because tuple_size - # should'nt be able to be modified by any preprocessor - if input_data.ndim != 3: # we have to ensure this because check_array - # above does not - if preprocessor_has_been_applied: - make_error_input(211, input_data, context) - else: - make_error_input(201, input_data, context) - check_tuple_size(input_data, tuple_size, context) + input_data = check_input_tuples(input_data, context, preprocessor, + args_for_sk_checks, tuple_size) return input_data if y is None else (input_data, y) +def check_input_tuples(input_data, context, preprocessor, args_for_sk_checks, + tuple_size): + preprocessor_has_been_applied = False + if input_data.ndim == 2: + if preprocessor is not None: + input_data = preprocess_tuples(input_data, preprocessor) + preprocessor_has_been_applied = True + else: + make_error_input(201, input_data, context) + elif input_data.ndim == 3: + pass + else: + if preprocessor is not None: + make_error_input(420, input_data, context) + else: + make_error_input(200, input_data, context) + input_data = check_array(input_data, allow_nd=True, ensure_2d=False, + **args_for_sk_checks) + # we need to check num_features because check_array does not check it + # for 3D inputs: + if args_for_sk_checks['ensure_min_features'] > 0: + n_features = input_data.shape[2] + if n_features < args_for_sk_checks['ensure_min_features']: + raise ValueError("Found array with {} feature(s) (shape={}) while" + " a minimum of {} is required{}." 
+ .format(n_features, input_data.shape, + args_for_sk_checks['ensure_min_features'], + context)) + # normally we don't need to check_tuple_size too because tuple_size + # should'nt be able to be modified by any preprocessor + if input_data.ndim != 3: # we have to ensure this because check_array + # above does not + if preprocessor_has_been_applied: + make_error_input(211, input_data, context) + else: + make_error_input(201, input_data, context) + check_tuple_size(input_data, tuple_size, context) + return input_data + + +def check_input_classic(input_data, context, preprocessor, args_for_sk_checks): + preprocessor_has_been_applied = False + if input_data.ndim == 1: + if preprocessor is not None: + input_data = preprocess_points(input_data, preprocessor) + preprocessor_has_been_applied = True + else: + make_error_input(101, input_data, context) + elif input_data.ndim == 2: + pass # OK + else: + if preprocessor is not None: + make_error_input(320, input_data, context) + else: + make_error_input(100, input_data, context) + + input_data = check_array(input_data, allow_nd=True, ensure_2d=False, + **args_for_sk_checks) + if input_data.ndim != 2: # we have to ensure this because check_array + # above does not + if preprocessor_has_been_applied: + make_error_input(111, input_data, context) + else: + make_error_input(101, input_data, context) + return input_data + + def make_error_input(code, input_data, context): code_str = {'expected_input': {'1': '2D array of formed points', '2': '3D array of formed tuples', From 9260a8e8c33f377a88c111acfd416323c2a8d7ca Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Wed, 21 Nov 2018 17:41:02 +0100 Subject: [PATCH 096/120] MAINT: ensure min_samples=2 for supervised learning algorithms (we should at least have two datapoints to learn something) --- metric_learn/lfda.py | 2 +- metric_learn/lmnn.py | 3 ++- metric_learn/mlkr.py | 3 ++- metric_learn/nca.py | 2 +- metric_learn/rca.py | 4 ++-- metric_learn/sdml.py | 2 +- 6 files 
changed, 9 insertions(+), 7 deletions(-) diff --git a/metric_learn/lfda.py b/metric_learn/lfda.py index 4e485f39..bd6e57aa 100644 --- a/metric_learn/lfda.py +++ b/metric_learn/lfda.py @@ -64,7 +64,7 @@ def __init__(self, num_dims=None, k=None, embedding_type='weighted', def _process_inputs(self, X, y): unique_classes, y = np.unique(y, return_inverse=True) - self.X_, y = self._prepare_inputs(X, y) + self.X_, y = self._prepare_inputs(X, y, ensure_min_samples=2) n, d = self.X_.shape num_classes = len(unique_classes) diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py index 5a465a37..eb3f129c 100644 --- a/metric_learn/lmnn.py +++ b/metric_learn/lmnn.py @@ -260,7 +260,8 @@ class LMNN(_base_LMNN): """ def fit(self, X, y): - self.X_, y = self._prepare_inputs(X, y, dtype=float) + self.X_, y = self._prepare_inputs(X, y, dtype=float, + ensure_min_samples=2) labels = MulticlassLabels(y) self._lmnn = shogun_LMNN(RealFeatures(self.X_.T), labels, self.k) self._lmnn.set_maxiter(self.max_iter) diff --git a/metric_learn/mlkr.py b/metric_learn/mlkr.py index 0a92c49a..5dc57026 100644 --- a/metric_learn/mlkr.py +++ b/metric_learn/mlkr.py @@ -62,7 +62,8 @@ def __init__(self, num_dims=None, A0=None, epsilon=0.01, alpha=0.0001, super(MLKR, self).__init__(preprocessor) def _process_inputs(self, X, y): - self.X_, y = self._prepare_inputs(X, y, y_numeric=True) + self.X_, y = self._prepare_inputs(X, y, y_numeric=True, + ensure_min_samples=2) n, d = self.X_.shape if y.shape[0] != n: raise ValueError('Data and label lengths mismatch: %d != %d' diff --git a/metric_learn/nca.py b/metric_learn/nca.py index 94bb9bfe..791617c5 100644 --- a/metric_learn/nca.py +++ b/metric_learn/nca.py @@ -34,7 +34,7 @@ def fit(self, X, y): X: data matrix, (n x d) y: scalar labels, (n) """ - X, labels = self._prepare_inputs(X, y) + X, labels = self._prepare_inputs(X, y, ensure_min_samples=2) n, d = X.shape num_dims = self.num_dims if num_dims is None: diff --git a/metric_learn/rca.py b/metric_learn/rca.py index 
f2e3937c..32a4ceee 100644 --- a/metric_learn/rca.py +++ b/metric_learn/rca.py @@ -68,7 +68,7 @@ def __init__(self, num_dims=None, pca_comps=None, preprocessor=None): super(RCA, self).__init__(preprocessor) def _process_data(self, X): - X = self._prepare_inputs(X) + X = self._prepare_inputs(X, ensure_min_samples=2) # PCA projection to remove noise and redundant information. if self.pca_comps is not None: @@ -183,7 +183,7 @@ def fit(self, X, y, random_state=np.random): y : (n) data labels random_state : a random.seed object to fix the random_state if needed. """ - X, y = self._prepare_inputs(X, y) + X, y = self._prepare_inputs(X, y, ensure_min_samples=2) chunks = Constraints(y).chunks(num_chunks=self.num_chunks, chunk_size=self.chunk_size, random_state=random_state) diff --git a/metric_learn/sdml.py b/metric_learn/sdml.py index d55f01fe..dca083d0 100644 --- a/metric_learn/sdml.py +++ b/metric_learn/sdml.py @@ -161,7 +161,7 @@ def fit(self, X, y, random_state=np.random): self : object Returns the instance. 
""" - X, y = self._prepare_inputs(X, y) + X, y = self._prepare_inputs(X, y, ensure_min_samples=2) num_constraints = self.num_constraints if num_constraints is None: num_classes = len(np.unique(y)) From 93c3e341134f1a4c7a29f32e746c453b9195b3a3 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 27 Nov 2018 11:28:57 +0100 Subject: [PATCH 097/120] ENH: Return custom error when some error is due to the preprocessor --- metric_learn/_util.py | 13 ++++++++++--- metric_learn/exceptions.py | 12 ++++++++++++ test/test_utils.py | 20 ++++++++++++++++++++ 3 files changed, 42 insertions(+), 3 deletions(-) create mode 100644 metric_learn/exceptions.py diff --git a/metric_learn/_util.py b/metric_learn/_util.py index 1b79ddd1..cff70cfd 100644 --- a/metric_learn/_util.py +++ b/metric_learn/_util.py @@ -2,6 +2,7 @@ import six from sklearn.utils import check_array from sklearn.utils.validation import check_X_y +from metric_learn.exceptions import PreprocessorError # hack around lack of axis kwarg in older numpy versions try: @@ -237,8 +238,11 @@ def make_error_input(code, input_data, context): def preprocess_tuples(tuples, preprocessor): print("Preprocessing tuples...") - tuples = np.column_stack([preprocessor(tuples[:, i])[:, np.newaxis] for - i in range(tuples.shape[1])]) + try: + tuples = np.column_stack([preprocessor(tuples[:, i])[:, np.newaxis] for + i in range(tuples.shape[1])]) + except Exception as e: + raise PreprocessorError(e) return tuples @@ -246,7 +250,10 @@ def preprocess_points(points, preprocessor): """form points if there is a preprocessor else keep them as such (assumes that check_points has already been called)""" print("Preprocessing points...") - points = preprocessor(points) + try: + points = preprocessor(points) + except Exception as e: + raise PreprocessorError(e) return points diff --git a/metric_learn/exceptions.py b/metric_learn/exceptions.py new file mode 100644 index 00000000..424d2c4f --- /dev/null +++ b/metric_learn/exceptions.py @@ -0,0 
+1,12 @@ +""" +The :mod:`metric_learn.exceptions` module includes all custom warnings and +error classes used across metric-learn. +""" + + +class PreprocessorError(Exception): + + def __init__(self, original_error): + err_msg = ("An error occurred when trying to use the " + "preprocessor: {}").format(repr(original_error)) + super(PreprocessorError, self).__init__(err_msg) diff --git a/test/test_utils.py b/test/test_utils.py index 96c96e0d..78f93111 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -11,6 +11,8 @@ from metric_learn import (ITML, LSML, MMC, RCA, SDML, Covariance, LFDA, LMNN, MLKR, NCA, ITML_Supervised, LSML_Supervised, MMC_Supervised, RCA_Supervised, SDML_Supervised) +from metric_learn.base_metric import ArrayIndexer +from metric_learn.exceptions import PreprocessorError from sklearn.datasets import make_regression, make_blobs from .test_sklearn_compat import build_pairs, build_quadruplets @@ -661,6 +663,24 @@ def fun(row): assert out == "Preprocessing tuples...\n" +def test_preprocessor_error_message(): + """Tests whether the preprocessor returns a preprocessor error when there + is a problem using the preprocessor + """ + preprocessor = ArrayIndexer(np.array([[1.2, 3.3], [3.1, 3.2]])) + + # with tuples + X = np.array([[[2, 3], [3, 3]], [[2, 3], [3, 2]]]) + # There are less samples than the max index we want to preprocess + with pytest.raises(PreprocessorError): + preprocess_tuples(X, preprocessor) + + # with points + X = np.array([[1], [2], [3], [3]]) + with pytest.raises(PreprocessorError): + preprocess_points(X, preprocessor) + + @pytest.mark.parametrize('estimator', [ITML(), LSML(), MMC(), SDML()], ids=['ITML', 'LSML', 'MMC', 'SDML']) def test_error_message_tuple_size(estimator): From f2d0cd7f989cc1c41e34cb41b69b2f6982fbe413 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Wed, 28 Nov 2018 11:59:54 +0100 Subject: [PATCH 098/120] MAINT: Refactor algorithms preprocessing steps - extract preprocessing steps in main fit function - 
remove self.X_ when found and replace it by X (Fixes #134) - extract the function to test collapsed pairs as _utils.check_collapsed_pairs and test it - note that the function is now not used where it was used before but the user should be responsible for having coherent input (if he wants he can use the helper function as a preprocessing step) --- metric_learn/_util.py | 8 +++++++ metric_learn/itml.py | 15 +----------- metric_learn/lfda.py | 35 ++++++++++++---------------- metric_learn/lmnn.py | 53 +++++++++++++++++++++++++++---------------- metric_learn/lsml.py | 4 +--- metric_learn/mlkr.py | 27 ++++++++++++++++++---- metric_learn/mmc.py | 25 ++++---------------- metric_learn/rca.py | 35 ++++++++++++---------------- metric_learn/sdml.py | 7 ++---- test/test_utils.py | 25 +++++++++++++++++++- 10 files changed, 126 insertions(+), 108 deletions(-) diff --git a/metric_learn/_util.py b/metric_learn/_util.py index cff70cfd..a1d8fbf0 100644 --- a/metric_learn/_util.py +++ b/metric_learn/_util.py @@ -302,3 +302,11 @@ def __init__(self, X): def __call__(self, indices): return self.X[indices] + + +def check_collapsed_pairs(pairs): + num_ident = (vector_norm(pairs[:, 0] - pairs[:, 1]) < 1e-9).sum() + if num_ident: + raise ValueError("{} collapsed pairs found (where the left element is " + "the same as the right element), out of {} pairs " + "in total.".format(num_ident, pairs.shape[0])) diff --git a/metric_learn/itml.py b/metric_learn/itml.py index c450ee71..4ce550fb 100644 --- a/metric_learn/itml.py +++ b/metric_learn/itml.py @@ -59,16 +59,9 @@ def __init__(self, gamma=1., max_iter=1000, convergence_threshold=1e-3, self.verbose = verbose super(_BaseITML, self).__init__(preprocessor) - def _process_pairs(self, pairs, y, bounds): + def _fit(self, pairs, y, bounds=None): pairs, y = self._prepare_inputs(pairs, y, type_of_inputs='tuples') - - # check to make sure that no two constrained vectors are identical - pos_pairs, neg_pairs = pairs[y == 1], pairs[y == -1] - pos_no_ident = 
vector_norm(pos_pairs[:, 0, :] - pos_pairs[:, 1, :]) > 1e-9 - pos_pairs = pos_pairs[pos_no_ident] - neg_no_ident = vector_norm(neg_pairs[:, 0, :] - neg_pairs[:, 1, :]) > 1e-9 - neg_pairs = neg_pairs[neg_no_ident] # init bounds if bounds is None: X = np.vstack({tuple(row) for row in pairs.reshape(-1, pairs.shape[2])}) @@ -82,12 +75,6 @@ def _process_pairs(self, pairs, y, bounds): self.A_ = np.identity(pairs.shape[2]) else: self.A_ = check_array(self.A0) - pairs = np.vstack([pos_pairs, neg_pairs]) - y = np.hstack([np.ones(len(pos_pairs)), - np.ones(len(neg_pairs))]) - return pairs, y - - def _fit(self, pairs, y, bounds=None): - pairs, y = self._process_pairs(pairs, y, bounds) gamma = self.gamma pos_pairs, neg_pairs = pairs[y == 1], pairs[y == -1] num_pos = len(pos_pairs) diff --git a/metric_learn/lfda.py b/metric_learn/lfda.py index bd6e57aa..2feff211 100644 --- a/metric_learn/lfda.py +++ b/metric_learn/lfda.py @@ -62,10 +62,20 @@ def __init__(self, num_dims=None, k=None, embedding_type='weighted', self.k = k super(LFDA, self).__init__(preprocessor) - def _process_inputs(self, X, y): + def fit(self, X, y): + '''Fit the LFDA model. + + Parameters + ---------- + X : (n, d) array-like + Input data. + + y : (n,) array-like + Class labels, one per point of data. + ''' + X, y = self._prepare_inputs(X, y, ensure_min_samples=2) unique_classes, y = np.unique(y, return_inverse=True) - self.X_, y = self._prepare_inputs(X, y, ensure_min_samples=2) - n, d = self.X_.shape + n, d = X.shape num_classes = len(unique_classes) if self.num_dims is None: @@ -82,21 +92,6 @@ def _process_inputs(self, X, y): k = d - 1 else: k = int(self.k) - - return self.X_, y, num_classes, n, d, dim, k - - def fit(self, X, y): - '''Fit the LFDA model. - - Parameters - ---------- - X : (n, d) array-like - Input data. - - y : (n,) array-like - Class labels, one per point of data. 
- ''' - X, y, num_classes, n, d, dim, k_ = self._process_inputs(X, y) tSb = np.zeros((d,d)) tSw = np.zeros((d,d)) @@ -107,8 +102,8 @@ def fit(self, X, y): # classwise affinity matrix dist = pairwise_distances(Xc, metric='l2', squared=True) # distances to k-th nearest neighbor - k = min(k_, nc-1) - sigma = np.sqrt(np.partition(dist, k, axis=0)[:,k]) + k = min(k, nc - 1) + sigma = np.sqrt(np.partition(dist, k, axis=0)[:, k]) local_scale = np.outer(sigma, sigma) with np.errstate(divide='ignore', invalid='ignore'): diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py index eb3f129c..81fccd52 100644 --- a/metric_learn/lmnn.py +++ b/metric_learn/lmnn.py @@ -77,15 +77,30 @@ def fit(self, X, y): learn_rate = self.learn_rate self._check_inputs_params_compatibility(X, y) - target_neighbors = self._select_targets() - impostors = self._find_impostors(target_neighbors[:,-1]) + X, y = self._prepare_inputs(X, y, dtype=float, + ensure_min_samples=2) + num_pts, num_dims = self.X_.shape + unique_labels, self.label_inds_ = np.unique(y, return_inverse=True) + if len(self.label_inds_) != num_pts: + raise ValueError('Must have one label per point.') + self.labels_ = np.arange(len(unique_labels)) + if self.use_pca: + warnings.warn('use_pca does nothing for the python_LMNN implementation') + self.transformer_ = np.eye(num_dims) + required_k = np.bincount(self.label_inds_).min() + if self.k > required_k: + raise ValueError('not enough class labels for specified k' + ' (smallest class has %d)' % required_k) + + target_neighbors = self._select_targets(X) + impostors = self._find_impostors(target_neighbors[:, -1], X) if len(impostors) == 0: # L has already been initialized to an identity matrix return # sum outer products - dfG = _sum_outer_products(self.X_, target_neighbors.flatten(), - np.repeat(np.arange(self.X_.shape[0]), k)) + dfG = _sum_outer_products(X, target_neighbors.flatten(), + np.repeat(np.arange(X.shape[0]), k)) df = np.zeros_like(dfG) # storage @@ -107,7 +122,7 @@ def 
fit(self, X, y): a2_old = [a.copy() for a in a2] objective_old = objective # Compute pairwise distances under current metric - Lx = L.dot(self.X_.T).T + Lx = L.dot(X.T).T g0 = _inplace_paired_L2(*Lx[impostors]) Ni = 1 + _inplace_paired_L2(Lx[target_neighbors], Lx[:,None,:]) g1,g2 = Ni[impostors] @@ -132,16 +147,16 @@ def fit(self, X, y): targets = target_neighbors[:,nn_idx] PLUS, pweight = _count_edges(plus1, plus2, impostors, targets) - df += _sum_outer_products(self.X_, PLUS[:,0], PLUS[:,1], pweight) + df += _sum_outer_products(X, PLUS[:, 0], PLUS[:, 1], pweight) MINUS, mweight = _count_edges(minus1, minus2, impostors, targets) - df -= _sum_outer_products(self.X_, MINUS[:,0], MINUS[:,1], mweight) + df -= _sum_outer_products(X, MINUS[:, 0], MINUS[:, 1], mweight) in_imp, out_imp = impostors - df += _sum_outer_products(self.X_, in_imp[minus1], out_imp[minus1]) - df += _sum_outer_products(self.X_, in_imp[minus2], out_imp[minus2]) + df += _sum_outer_products(X, in_imp[minus1], out_imp[minus1]) + df += _sum_outer_products(X, in_imp[minus2], out_imp[minus2]) - df -= _sum_outer_products(self.X_, in_imp[plus1], out_imp[plus1]) - df -= _sum_outer_products(self.X_, in_imp[plus2], out_imp[plus2]) + df -= _sum_outer_products(X, in_imp[plus1], out_imp[plus1]) + df -= _sum_outer_products(X, in_imp[plus2], out_imp[plus2]) a1[nn_idx] = act1 a2[nn_idx] = act2 @@ -186,18 +201,18 @@ def fit(self, X, y): self.n_iter_ = it return self - def _select_targets(self): - target_neighbors = np.empty((self.X_.shape[0], self.k), dtype=int) + def _select_targets(self, X): + target_neighbors = np.empty((X.shape[0], self.k), dtype=int) for label in self.labels_: inds, = np.nonzero(self.label_inds_ == label) - dd = euclidean_distances(self.X_[inds], squared=True) + dd = euclidean_distances(X[inds], squared=True) np.fill_diagonal(dd, np.inf) nn = np.argsort(dd)[..., :self.k] target_neighbors[inds] = inds[nn] return target_neighbors - def _find_impostors(self, furthest_neighbors): - Lx = 
self.transform(self.X_) + def _find_impostors(self, furthest_neighbors, X): + Lx = self.transform(X) margin_radii = 1 + _inplace_paired_L2(Lx[furthest_neighbors], Lx) impostors = [] for label in self.labels_[:-1]: @@ -260,10 +275,10 @@ class LMNN(_base_LMNN): """ def fit(self, X, y): - self.X_, y = self._prepare_inputs(X, y, dtype=float, - ensure_min_samples=2) + X, y = self._prepare_inputs(X, y, dtype=float, + ensure_min_samples=2) labels = MulticlassLabels(y) - self._lmnn = shogun_LMNN(RealFeatures(self.X_.T), labels, self.k) + self._lmnn = shogun_LMNN(RealFeatures(X.T), labels, self.k) self._lmnn.set_maxiter(self.max_iter) self._lmnn.set_obj_threshold(self.convergence_tol) self._lmnn.set_regularization(self.regularization) diff --git a/metric_learn/lsml.py b/metric_learn/lsml.py index 11147965..cd28d4b9 100644 --- a/metric_learn/lsml.py +++ b/metric_learn/lsml.py @@ -43,7 +43,7 @@ def __init__(self, tol=1e-3, max_iter=1000, prior=None, verbose=False, self.verbose = verbose super(_BaseLSML, self).__init__(preprocessor) - def _prepare_quadruplets(self, quadruplets, weights): + def _fit(self, quadruplets, weights=None): quadruplets = self._prepare_inputs(quadruplets, type_of_inputs='tuples') @@ -66,8 +66,6 @@ def _prepare_quadruplets(self, quadruplets, weights): self.M_ = self.prior self.prior_inv_ = np.linalg.inv(self.prior) - def _fit(self, quadruplets, weights=None): - self._prepare_quadruplets(quadruplets, weights) step_sizes = np.logspace(-10, 0, 10) # Keep track of the best step size and the loss at that step. 
l_best = 0 diff --git a/metric_learn/mlkr.py b/metric_learn/mlkr.py index 5dc57026..1839f533 100644 --- a/metric_learn/mlkr.py +++ b/metric_learn/mlkr.py @@ -62,8 +62,8 @@ def __init__(self, num_dims=None, A0=None, epsilon=0.01, alpha=0.0001, super(MLKR, self).__init__(preprocessor) def _process_inputs(self, X, y): - self.X_, y = self._prepare_inputs(X, y, y_numeric=True, - ensure_min_samples=2) + X, y = self._prepare_inputs(X, y, y_numeric=True, + ensure_min_samples=2) n, d = self.X_.shape if y.shape[0] != n: raise ValueError('Data and label lengths mismatch: %d != %d' @@ -76,11 +76,11 @@ def _process_inputs(self, X, y): if A is None: # initialize to PCA transformation matrix # note: not the same as n_components=m ! - A = PCA().fit(self.X_).components_.T[:m] + A = PCA().fit(X).components_.T[:m] elif A.shape != (m, d): raise ValueError('A0 needs shape (%d,%d) but got %s' % ( m, d, A.shape)) - return self.X_, y, A + return X, y, A def fit(self, X, y): """ @@ -91,7 +91,24 @@ def fit(self, X, y): X : (n x d) array of samples y : (n) data labels """ - X, y, A = self._process_inputs(X, y) + X, y = self._prepare_inputs(X, y, y_numeric=True, + ensure_min_samples=2) + n, d = X.shape + if y.shape[0] != n: + raise ValueError('Data and label lengths mismatch: %d != %d' + % (n, y.shape[0])) + + A = self.A0 + m = self.num_dims + if m is None: + m = d + if A is None: + # initialize to PCA transformation matrix + # note: not the same as n_components=m ! + A = PCA().fit(X).components_.T[:m] + elif A.shape != (m, d): + raise ValueError('A0 needs shape (%d,%d) but got %s' % ( + m, d, A.shape)) # note: this line takes (n*n*d) memory! 
# for larger datasets, we'll need to compute dX as we go diff --git a/metric_learn/mmc.py b/metric_learn/mmc.py index 937baaad..89c18b58 100644 --- a/metric_learn/mmc.py +++ b/metric_learn/mmc.py @@ -66,27 +66,9 @@ def __init__(self, max_iter=100, max_proj=10000, convergence_threshold=1e-3, super(_BaseMMC, self).__init__(preprocessor) def _fit(self, pairs, y): - pairs, y = self._process_pairs(pairs, y) - if self.diagonal: - return self._fit_diag(pairs, y) - else: - return self._fit_full(pairs, y) - - def _process_pairs(self, pairs, y): pairs, y = self._prepare_inputs(pairs, y, type_of_inputs='tuples') - # check to make sure that no two constrained vectors are identical - pos_pairs, neg_pairs = pairs[y == 1], pairs[y == -1] - pos_no_ident = vector_norm(pos_pairs[:, 0, :] - pos_pairs[:, 1, :]) > 1e-9 - pos_pairs = pos_pairs[pos_no_ident] - neg_no_ident = vector_norm(neg_pairs[:, 0, :] - neg_pairs[:, 1, :]) > 1e-9 - neg_pairs = neg_pairs[neg_no_ident] - if len(pos_pairs) == 0: - raise ValueError('No non-trivial similarity constraints given for MMC.') - if len(neg_pairs) == 0: - raise ValueError('No non-trivial dissimilarity constraints given for MMC.') - # init metric if self.A0 is None: self.A_ = np.identity(pairs.shape[2]) @@ -97,9 +79,10 @@ def _process_pairs(self, pairs, y): else: self.A_ = check_array(self.A0) - pairs = np.vstack([pos_pairs, neg_pairs]) - y = np.hstack([np.ones(len(pos_pairs)), - np.ones(len(neg_pairs))]) - return pairs, y + if self.diagonal: + return self._fit_diag(pairs, y) + else: + return self._fit_full(pairs, y) def _fit_full(self, pairs, y): """Learn full metric using MMC. 
diff --git a/metric_learn/rca.py b/metric_learn/rca.py index 32a4ceee..290ea941 100644 --- a/metric_learn/rca.py +++ b/metric_learn/rca.py @@ -67,20 +67,6 @@ def __init__(self, num_dims=None, pca_comps=None, preprocessor=None): self.pca_comps = pca_comps super(RCA, self).__init__(preprocessor) - def _process_data(self, X): - X = self._prepare_inputs(X, ensure_min_samples=2) - - # PCA projection to remove noise and redundant information. - if self.pca_comps is not None: - pca = decomposition.PCA(n_components=self.pca_comps) - X_transformed = pca.fit_transform(X) - M_pca = pca.components_ - else: - X_transformed = X - X.mean(axis=0) - M_pca = None - - return X_transformed, M_pca - def _check_dimension(self, rank, X): d = X.shape[1] if rank < d: @@ -101,7 +87,7 @@ def _check_dimension(self, rank, X): dim = self.num_dims return dim - def fit(self, data, chunks): + def fit(self, X, chunks): """Learn the RCA model. Parameters @@ -112,17 +98,26 @@ def fit(self, data, chunks): When ``chunks[i] == -1``, point i doesn't belong to any chunklet. When ``chunks[i] == j``, point i belongs to chunklet j. """ - data, M_pca = self._process_data(data) + X = self._prepare_inputs(X, ensure_min_samples=2) + + # PCA projection to remove noise and redundant information. 
+ if self.pca_comps is not None: + pca = decomposition.PCA(n_components=self.pca_comps) + X_t = pca.fit_transform(X) + M_pca = pca.components_ + else: + X_t = X - X.mean(axis=0) + M_pca = None chunks = np.asanyarray(chunks, dtype=int) - chunk_mask, chunked_data = _chunk_mean_centering(data, chunks) + chunk_mask, chunked_data = _chunk_mean_centering(X_t, chunks) inner_cov = np.cov(chunked_data, rowvar=0, bias=1) - dim = self._check_dimension(np.linalg.matrix_rank(inner_cov), data) + dim = self._check_dimension(np.linalg.matrix_rank(inner_cov), X_t) # Fisher Linear Discriminant projection - if dim < data.shape[1]: - total_cov = np.cov(data[chunk_mask], rowvar=0) + if dim < X_t.shape[1]: + total_cov = np.cov(X_t[chunk_mask], rowvar=0) tmp = np.linalg.lstsq(total_cov, inner_cov)[0] vals, vecs = np.linalg.eig(tmp) inds = np.argsort(vals)[:dim] diff --git a/metric_learn/sdml.py b/metric_learn/sdml.py index dca083d0..b7f36bf4 100644 --- a/metric_learn/sdml.py +++ b/metric_learn/sdml.py @@ -49,7 +49,7 @@ def __init__(self, balance_param=0.5, sparsity_param=0.01, use_cov=True, self.verbose = verbose super(_BaseSDML, self).__init__(preprocessor) - def _prepare_pairs(self, pairs, y): + def _fit(self, pairs, y): pairs, y = self._prepare_inputs(pairs, y, type_of_inputs='tuples') @@ -60,10 +60,7 @@ def _prepare_pairs(self, pairs, y): else: self.M_ = np.identity(pairs.shape[2]) diff = pairs[:, 0] - pairs[:, 1] - return (diff.T * y).dot(diff) - - def _fit(self, pairs, y): - loss_matrix = self._prepare_pairs(pairs, y) + loss_matrix = (diff.T * y).dot(diff) P = self.M_ + self.balance_param * loss_matrix emp_cov = pinvh(P) # hack: ensure positive semidefinite diff --git a/test/test_utils.py b/test/test_utils.py index 78f93111..4b9497a2 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -7,7 +7,8 @@ from sklearn.utils.testing import set_random_state from sklearn.base import clone from metric_learn._util import (check_input, make_context, preprocess_tuples, - make_name, 
preprocess_points) + make_name, preprocess_points, + check_collapsed_pairs) from metric_learn import (ITML, LSML, MMC, RCA, SDML, Covariance, LFDA, LMNN, MLKR, NCA, ITML_Supervised, LSML_Supervised, MMC_Supervised, RCA_Supervised, SDML_Supervised) @@ -949,3 +950,25 @@ def test_same_with_or_without_preprocessor_tuples(estimator, build_dataset): output_without_prep = estimator_with_prep_formed.transform( formed_tuples_test[:, 0]) assert np.array(output_with_prep == output_without_prep).all() + + +def test_check_collapsed_pairs_raises_no_error(): + """Checks that check_collapsed_pairs raises no error if no collapsed pairs + is present""" + pairs_ok = np.array([[[0.1, 3.3], [3.3, 0.1]], + [[0.1, 3.3], [3.3, 0.1]], + [[2.5, 8.1], [0.1, 3.3]]]) + check_collapsed_pairs(pairs_ok) + + +def test_check_collapsed_pairs_raises_error(): + """Checks that check_collapsed_pairs raises no error if no collapsed pairs + is present""" + pairs_not_ok = np.array([[[0.1, 3.3], [0.1, 3.3]], + [[0.1, 3.3], [3.3, 0.1]], + [[2.5, 8.1], [2.5, 8.1]]]) + with pytest.raises(ValueError) as e: + check_collapsed_pairs(pairs_not_ok) + assert str(e.value) == ("2 collapsed pairs found (where the left element is " + "the same as the right element), out of 3 pairs in" + " total.") From 02e82ffbdb790cff2565cb6f22249130c1388ce1 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Wed, 28 Nov 2018 12:08:17 +0100 Subject: [PATCH 099/120] MAINT: finish the work of the previous commit --- metric_learn/lmnn.py | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/metric_learn/lmnn.py b/metric_learn/lmnn.py index 81fccd52..d78cf6b6 100644 --- a/metric_learn/lmnn.py +++ b/metric_learn/lmnn.py @@ -52,34 +52,14 @@ def __init__(self, k=3, min_iter=50, max_iter=1000, learn_rate=1e-7, # slower Python version class python_LMNN(_base_LMNN): - def _check_inputs_params_compatibility(self, X, labels): - """Process inputs and raise appropriate error messages if input - parameters are 
not those expected with respect to inputs. - """ - self.X_, labels = self._prepare_inputs(X, labels, dtype=float, - ensure_min_samples=2) - num_pts, num_dims = self.X_.shape - unique_labels, self.label_inds_ = np.unique(labels, return_inverse=True) - if len(self.label_inds_) != num_pts: - raise ValueError('Must have one label per point.') - self.labels_ = np.arange(len(unique_labels)) - if self.use_pca: - warnings.warn('use_pca does nothing for the python_LMNN implementation') - self.transformer_ = np.eye(num_dims) - required_k = np.bincount(self.label_inds_).min() - if self.k > required_k: - raise ValueError('not enough class labels for specified k' - ' (smallest class has %d)' % required_k) - def fit(self, X, y): k = self.k reg = self.regularization learn_rate = self.learn_rate - self._check_inputs_params_compatibility(X, y) X, y = self._prepare_inputs(X, y, dtype=float, ensure_min_samples=2) - num_pts, num_dims = self.X_.shape + num_pts, num_dims = X.shape unique_labels, self.label_inds_ = np.unique(y, return_inverse=True) if len(self.label_inds_) != num_pts: raise ValueError('Must have one label per point.') From 99206b38a162244352d86231c50514603531a191 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Wed, 28 Nov 2018 18:41:22 +0100 Subject: [PATCH 100/120] TST: add test for cross-validation: comparison of manual cross-val and scikit-learn cross-val --- test/test_sklearn_compat.py | 60 +++++++++++++++++++++++++++++++++++-- 1 file changed, 57 insertions(+), 3 deletions(-) diff --git a/test/test_sklearn_compat.py b/test/test_sklearn_compat.py index eb542567..d2bb5f49 100644 --- a/test/test_sklearn_compat.py +++ b/test/test_sklearn_compat.py @@ -17,7 +17,7 @@ from sklearn import clone import numpy as np from sklearn.model_selection import (cross_val_score, cross_val_predict, - train_test_split) + train_test_split, KFold) from sklearn.utils.testing import _get_args @@ -189,8 +189,8 @@ def build_quadruplets(preprocessor): 
@pytest.mark.parametrize('preprocessor', [None, build_data()[0]]) @pytest.mark.parametrize('estimator, build_dataset', list_estimators, ids=ids_estimators) -def test_cross_validation(estimator, build_dataset, preprocessor): - """Tests that you can do cross validation on metric-learn estimators +def test_cross_validation_is_finite(estimator, build_dataset, preprocessor): + """Tests that validation on metric-learn estimators returns something finite """ if any(hasattr(estimator, method) for method in ["predict", "score"]): (X, tuples, y, tuples_train, tuples_test, @@ -204,6 +204,60 @@ def test_cross_validation(estimator, build_dataset, preprocessor): assert np.isfinite(cross_val_predict(estimator, tuples, y)).all() +@pytest.mark.parametrize('preprocessor', [None, build_data()[0]]) +@pytest.mark.parametrize('estimator, build_dataset', list_estimators, + ids=ids_estimators) +def test_cross_validation_manual_vs_scikit(estimator, build_dataset, + preprocessor): + """Tests that if we make a manual cross-validation, the result will be the + same as scikit-learn's cross-validation (some code for generating the + folds is taken from scikit-learn). 
+ """ + if any(hasattr(estimator, method) for method in ["predict", "score"]): + (X, tuples, y, tuples_train, tuples_test, + y_train, y_test, preprocessor) = build_dataset(preprocessor) + y = y.ravel() if y is not None else None # The build dataset functions + # returned a vertical vector + estimator = clone(estimator) + estimator.set_params(preprocessor=preprocessor) + set_random_state(estimator) + n_splits = 3 + kfold = KFold(shuffle=False, n_splits=n_splits) + n_samples = tuples.shape[0] + fold_sizes = (n_samples // n_splits) * np.ones(n_splits, dtype=np.int) + fold_sizes[:n_samples % n_splits] += 1 + current = 0 + scores, predictions = [], np.zeros(tuples.shape[0]) + if hasattr(estimator, "score"): + for fold_size in fold_sizes: + start, stop = current, current + fold_size + current = stop + test_slice = slice(start, stop) + train_mask = np.ones(tuples.shape[0], bool) + train_mask[test_slice] = False + if y is not None: # for now SDML has no default for y + estimator.fit(tuples[train_mask], y[train_mask]) + if hasattr(estimator, "score"): + scores.append(estimator.score(tuples[test_slice], y[test_slice])) + if hasattr(estimator, "predict"): + predictions[test_slice] = estimator.predict(tuples[test_slice]) + else: + estimator.fit(tuples[train_mask]) + if hasattr(estimator, "score"): + scores.append(estimator.score(tuples[test_slice])) + if hasattr(estimator, "predict"): + predictions[test_slice] = estimator.predict(tuples[test_slice]) + if hasattr(estimator, "score"): + assert all(scores == cross_val_score(estimator, tuples, y, cv=kfold)) + if hasattr(estimator, "predict"): + if y is not None: + assert all(predictions == cross_val_predict(estimator, tuples, y, + cv=kfold)) + else: + assert all(predictions == cross_val_predict(estimator, tuples, + cv=kfold)) + + def check_score(estimator, tuples, y): if hasattr(estimator, "score"): score = estimator.score(tuples, y) From 8ee08b80ca7d7778d451340d4bcca0c2bc52643d Mon Sep 17 00:00:00 2001 From: William de Vazelhes 
Date: Wed, 28 Nov 2018 19:05:38 +0100 Subject: [PATCH 101/120] ENH: put y=None by default in LSML for better compatibility. This also allows to simplify some tests by removing the need for a separate case for lsml --- metric_learn/lsml.py | 2 +- test/test_sklearn_compat.py | 27 +++++++++------------------ test/test_utils.py | 15 +++------------ 3 files changed, 13 insertions(+), 31 deletions(-) diff --git a/metric_learn/lsml.py b/metric_learn/lsml.py index cd28d4b9..cb2c1f18 100644 --- a/metric_learn/lsml.py +++ b/metric_learn/lsml.py @@ -43,7 +43,7 @@ def __init__(self, tol=1e-3, max_iter=1000, prior=None, verbose=False, self.verbose = verbose super(_BaseLSML, self).__init__(preprocessor) - def _fit(self, quadruplets, weights=None): + def _fit(self, quadruplets, y=None, weights=None): quadruplets = self._prepare_inputs(quadruplets, type_of_inputs='tuples') diff --git a/test/test_sklearn_compat.py b/test/test_sklearn_compat.py index d2bb5f49..e5ff3f8a 100644 --- a/test/test_sklearn_compat.py +++ b/test/test_sklearn_compat.py @@ -235,27 +235,18 @@ def test_cross_validation_manual_vs_scikit(estimator, build_dataset, test_slice = slice(start, stop) train_mask = np.ones(tuples.shape[0], bool) train_mask[test_slice] = False - if y is not None: # for now SDML has no default for y - estimator.fit(tuples[train_mask], y[train_mask]) - if hasattr(estimator, "score"): - scores.append(estimator.score(tuples[test_slice], y[test_slice])) - if hasattr(estimator, "predict"): - predictions[test_slice] = estimator.predict(tuples[test_slice]) - else: - estimator.fit(tuples[train_mask]) - if hasattr(estimator, "score"): - scores.append(estimator.score(tuples[test_slice])) - if hasattr(estimator, "predict"): - predictions[test_slice] = estimator.predict(tuples[test_slice]) + (y_train, y_test) = ((y[train_mask], y[test_slice]) if y is not None + else (None, None)) + estimator.fit(tuples[train_mask], y_train) + if hasattr(estimator, "score"): + 
scores.append(estimator.score(tuples[test_slice], y_test)) + if hasattr(estimator, "predict"): + predictions[test_slice] = estimator.predict(tuples[test_slice]) if hasattr(estimator, "score"): assert all(scores == cross_val_score(estimator, tuples, y, cv=kfold)) if hasattr(estimator, "predict"): - if y is not None: - assert all(predictions == cross_val_predict(estimator, tuples, y, - cv=kfold)) - else: - assert all(predictions == cross_val_predict(estimator, tuples, - cv=kfold)) + assert all(predictions == cross_val_predict(estimator, tuples, y, + cv=kfold)) def check_score(estimator, tuples, y): diff --git a/test/test_utils.py b/test/test_utils.py index 4b9497a2..184bcff1 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -890,26 +890,17 @@ def test_same_with_or_without_preprocessor_tuples(estimator, build_dataset): estimator_with_preprocessor = clone(estimator) set_random_state(estimator_with_preprocessor) estimator_with_preprocessor.set_params(preprocessor=X) - if estimator.__class__.__name__ == 'LSML': - estimator_with_preprocessor.fit(tuples_train) - else: - estimator_with_preprocessor.fit(tuples_train, y_train) + estimator_with_preprocessor.fit(tuples_train, y_train) estimator_without_preprocessor = clone(estimator) set_random_state(estimator_without_preprocessor) estimator_without_preprocessor.set_params(preprocessor=None) - if estimator.__class__.__name__ == 'LSML': - estimator_without_preprocessor.fit(formed_tuples_train) - else: - estimator_without_preprocessor.fit(formed_tuples_train, y_train) + estimator_without_preprocessor.fit(formed_tuples_train, y_train) estimator_with_prep_formed = clone(estimator) set_random_state(estimator_with_prep_formed) estimator_with_prep_formed.set_params(preprocessor=X) - if estimator.__class__.__name__ == 'LSML': - estimator_with_prep_formed.fit(tuples_train) - else: - estimator_with_prep_formed.fit(tuples_train, y_train) + estimator_with_prep_formed.fit(tuples_train, y_train) # test prediction methods for method 
in ["predict", "decision_function"]: From 784f697d6c290edc655dd621904a0100c9e7d32e Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Wed, 28 Nov 2018 19:18:01 +0100 Subject: [PATCH 102/120] ENH: add error message when type of inputs is not some expected type --- metric_learn/_util.py | 4 ++++ test/test_utils.py | 12 ++++++++++++ 2 files changed, 16 insertions(+) diff --git a/metric_learn/_util.py b/metric_learn/_util.py index a1d8fbf0..2f16c3ed 100644 --- a/metric_learn/_util.py +++ b/metric_learn/_util.py @@ -137,6 +137,10 @@ def check_input(input_data, y=None, preprocessor=None, input_data = check_input_tuples(input_data, context, preprocessor, args_for_sk_checks, tuple_size) + else: + raise ValueError("Unknown value {} for type_of_inputs. Valid values are " + "'classic' or 'tuples'.".format(type_of_inputs)) + return input_data if y is None else (input_data, y) diff --git a/test/test_utils.py b/test/test_utils.py index 184bcff1..7e46897f 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -23,6 +23,18 @@ def mock_preprocessor(indices): """ return np.ones((indices.shape[0], 3)) + +@pytest.mark.parametrize('type_of_inputs', ['other', 'tuple', 'classics', 2, + int, NCA()]) +def test_check_input_invalid_type_of_inputs(type_of_inputs): + """Tests that an invalid type of inputs in check_inputs raises an error.""" + with pytest.raises(ValueError) as e: + check_input([[0.2, 2.1], [0.2, .8]], type_of_inputs=type_of_inputs) + msg = ("Unknown value {} for type_of_inputs. 
Valid values are " + "'classic' or 'tuples'.".format(type_of_inputs)) + assert str(e.value) == msg + + # ---------------- test check_input with 'tuples' type_of_input' ------------ From c46bbe1c6837edecced25868a7bc1677817c12c9 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Wed, 28 Nov 2018 19:25:19 +0100 Subject: [PATCH 103/120] TST: add test that checks that 'classic' is the default behaviour --- test/test_utils.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/test_utils.py b/test/test_utils.py index 7e46897f..c6124287 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -480,6 +480,12 @@ def test_check_classic_valid_without_preprocessor(points): assert len(record) == 0 +def test_check_classic_by_default(): + """Checks that 'classic' is the default behaviour of check_input""" + assert (check_input([[2, 3], [3, 2]]) == + check_input([[2, 3], [3, 2]], type_of_inputs='classic')).all() + + def test_check_classic_behaviour_auto_dtype(points_no_prep): """Checks that check_input (for points) allows by default every type if using a preprocessor, and numeric types if using no preprocessor""" From 39a725619882c850dfe6b0105c010d7e608b1590 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Wed, 28 Nov 2018 19:33:03 +0100 Subject: [PATCH 104/120] TST: remove unnecessary conversion to vertical vector of y --- test/test_sklearn_compat.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/test_sklearn_compat.py b/test/test_sklearn_compat.py index e5ff3f8a..a5bd664b 100644 --- a/test/test_sklearn_compat.py +++ b/test/test_sklearn_compat.py @@ -216,8 +216,6 @@ def test_cross_validation_manual_vs_scikit(estimator, build_dataset, if any(hasattr(estimator, method) for method in ["predict", "score"]): (X, tuples, y, tuples_train, tuples_test, y_train, y_test, preprocessor) = build_dataset(preprocessor) - y = y.ravel() if y is not None else None # The build dataset functions - # returned a vertical vector estimator = clone(estimator) 
estimator.set_params(preprocessor=preprocessor) set_random_state(estimator) From 082bca51bad940961a879e7bf277ce25cb756156 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Wed, 28 Nov 2018 19:34:47 +0100 Subject: [PATCH 105/120] FIX: remove wrong condition hasattr 'score' at top of loop --- test/test_sklearn_compat.py | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/test/test_sklearn_compat.py b/test/test_sklearn_compat.py index a5bd664b..eb4129a5 100644 --- a/test/test_sklearn_compat.py +++ b/test/test_sklearn_compat.py @@ -226,25 +226,24 @@ def test_cross_validation_manual_vs_scikit(estimator, build_dataset, fold_sizes[:n_samples % n_splits] += 1 current = 0 scores, predictions = [], np.zeros(tuples.shape[0]) - if hasattr(estimator, "score"): - for fold_size in fold_sizes: - start, stop = current, current + fold_size - current = stop - test_slice = slice(start, stop) - train_mask = np.ones(tuples.shape[0], bool) - train_mask[test_slice] = False - (y_train, y_test) = ((y[train_mask], y[test_slice]) if y is not None - else (None, None)) - estimator.fit(tuples[train_mask], y_train) - if hasattr(estimator, "score"): - scores.append(estimator.score(tuples[test_slice], y_test)) - if hasattr(estimator, "predict"): - predictions[test_slice] = estimator.predict(tuples[test_slice]) + for fold_size in fold_sizes: + start, stop = current, current + fold_size + current = stop + test_slice = slice(start, stop) + train_mask = np.ones(tuples.shape[0], bool) + train_mask[test_slice] = False + (y_train, y_test) = ((y[train_mask], y[test_slice]) if y is not None + else (None, None)) + estimator.fit(tuples[train_mask], y_train) if hasattr(estimator, "score"): - assert all(scores == cross_val_score(estimator, tuples, y, cv=kfold)) + scores.append(estimator.score(tuples[test_slice], y_test)) if hasattr(estimator, "predict"): - assert all(predictions == cross_val_predict(estimator, tuples, y, - cv=kfold)) + 
predictions[test_slice] = estimator.predict(tuples[test_slice]) + if hasattr(estimator, "score"): + assert all(scores == cross_val_score(estimator, tuples, y, cv=kfold)) + if hasattr(estimator, "predict"): + assert all(predictions == cross_val_predict(estimator, tuples, y, + cv=kfold)) def check_score(estimator, tuples, y): From 6abbcd6bf1de160185fbb5c415025d4d90f47f4b Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Thu, 29 Nov 2018 09:58:07 +0100 Subject: [PATCH 106/120] MAINT: Add comment to explain why we return twice X for build_regression and build_classification --- test/test_sklearn_compat.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/test_sklearn_compat.py b/test/test_sklearn_compat.py index eb4129a5..37952268 100644 --- a/test/test_sklearn_compat.py +++ b/test/test_sklearn_compat.py @@ -103,6 +103,9 @@ def build_classification(preprocessor): # builds a toy classification problem X, y = shuffle(*make_blobs(), random_state=RNG) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RNG) + # we will return twice X here for compatibility with the rest of the tests + # (the first X is the original dataset and the second X represent the + # tuples given to the models (which are in fact the original dataset here) return (X, X, y, X_train, X_test, y_train, y_test, preprocessor) @@ -111,6 +114,9 @@ def build_regression(preprocessor): X, y = shuffle(*make_regression(n_samples=100, n_features=10), random_state=RNG) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RNG) + # we will return twice X here for compatibility with the rest of the tests + # (the first X is the original dataset and the second X represent the + # tuples given to the models (which are in fact the original dataset here) return (X, X, y, X_train, X_test, y_train, y_test, preprocessor) From f0a1dc2513896fcdeae596deb658123e3f8de074 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Thu, 29 Nov 2018 13:38:03 +0100 Subject: [PATCH 
107/120] ENH: improve test for preprocessor and return error message if the given preprocessor has the bad type --- metric_learn/_util.py | 12 ++++++++++ metric_learn/base_metric.py | 6 ++++- test/test_utils.py | 45 ++++++++++++++++++++++++++++++++++++- 3 files changed, 61 insertions(+), 2 deletions(-) diff --git a/metric_learn/_util.py b/metric_learn/_util.py index 2f16c3ed..55203fc8 100644 --- a/metric_learn/_util.py +++ b/metric_learn/_util.py @@ -302,6 +302,18 @@ def check_tuple_size(tuples, tuple_size, context): class ArrayIndexer: def __init__(self, X): + # we check the array-like preprocessor here, and we as much permissive + # as possible (because the user will check for the desired + # format with arguments in check_input, and only this latter function + # should return the appropriate errors). We do this only to have a numpy + # array object which can be indexed by another numpy array object. + X = check_array(X, + accept_sparse=True, dtype=None, + force_all_finite=False, + ensure_2d=False, allow_nd=True, + ensure_min_samples=0, + ensure_min_features=0, + warn_on_dtype=False, estimator=None) self.X = X def __call__(self, indices): diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index a4214bd8..613613b1 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -41,8 +41,12 @@ def check_preprocessor(self): """Initializes the preprocessor""" if _is_arraylike(self.preprocessor): self.preprocessor_ = ArrayIndexer(self.preprocessor) - else: + elif callable(self.preprocessor): self.preprocessor_ = self.preprocessor + else: + raise ValueError("Invalid type for the preprocessor: {}. 
You should " + "provide either an array-like object, " + "or a callable.".format(type(self.preprocessor))) def _prepare_inputs(self, X, y=None, type_of_inputs='classic', **kwargs): diff --git a/test/test_utils.py b/test/test_utils.py index c6124287..0ff09800 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -12,7 +12,7 @@ from metric_learn import (ITML, LSML, MMC, RCA, SDML, Covariance, LFDA, LMNN, MLKR, NCA, ITML_Supervised, LSML_Supervised, MMC_Supervised, RCA_Supervised, SDML_Supervised) -from metric_learn.base_metric import ArrayIndexer +from metric_learn.base_metric import ArrayIndexer, MahalanobisMixin from metric_learn.exceptions import PreprocessorError from sklearn.datasets import make_regression, make_blobs from .test_sklearn_compat import build_pairs, build_quadruplets @@ -700,6 +700,49 @@ def test_preprocessor_error_message(): preprocess_points(X, preprocessor) +@pytest.mark.parametrize('input_data', [[[5, 3], [3, 2]], + ((5, 3), (3, 2)) + ]) +@pytest.mark.parametrize('indices', [[0, 1], (1, 0)]) +def test_array_like_indexer_array_like_valid_classic(input_data, indices): + """Checks that any array-like is valid in the 'preprocessor' argument, + and in the indices, for a classic input""" + class MockMetricLearner(MahalanobisMixin): + pass + + mock_algo = MockMetricLearner(preprocessor=input_data) + mock_algo._prepare_inputs(indices, type_of_inputs='classic') + + +@pytest.mark.parametrize('input_data', [[[5, 3], [3, 2]], + ((5, 3), (3, 2)) + ]) +@pytest.mark.parametrize('indices', [[[0, 1], [1, 0]], ((1, 0), (1, 0))]) +def test_array_like_indexer_array_like_valid_tuples(input_data, indices): + """Checks that any array-like is valid in the 'preprocessor' argument, + and in the indices, for a classic input""" + class MockMetricLearner(MahalanobisMixin): + pass + + mock_algo = MockMetricLearner(preprocessor=input_data) + mock_algo._prepare_inputs(indices, type_of_inputs='tuples') + + +@pytest.mark.parametrize('preprocessor', [4, NCA()]) +def 
test_error_message_check_preprocessor(preprocessor): + """Checks that if the preprocessor given is not an array-like or a + callable, the right error message is returned""" + class MockMetricLearner(MahalanobisMixin): + pass + + mock_algo = MockMetricLearner(preprocessor=preprocessor) + with pytest.raises(ValueError) as e: + mock_algo.check_preprocessor() + assert str(e.value) == ("Invalid type for the preprocessor: {}. You should " + "provide either an array-like object, " + "or a callable.".format(type(preprocessor))) + + @pytest.mark.parametrize('estimator', [ITML(), LSML(), MMC(), SDML()], ids=['ITML', 'LSML', 'MMC', 'SDML']) def test_error_message_tuple_size(estimator): From ab5f2e311dab5fc98852f2cb41578971266aded3 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Thu, 29 Nov 2018 13:42:42 +0100 Subject: [PATCH 108/120] FIX: fix wrong type_of_inputs in a test --- test/test_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_utils.py b/test/test_utils.py index 0ff09800..9d21ab72 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -496,7 +496,7 @@ def test_check_classic_behaviour_auto_dtype(points_no_prep): assert len(record) == 0 with pytest.warns(None) as record: - check_input(points_no_prep, type_of_inputs='input') # numeric type + check_input(points_no_prep, type_of_inputs='classic') # numeric type assert len(record) == 0 # not numeric type From 5324e852ed68d2624f077d4b37c1c9d35b6145a2 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Thu, 29 Nov 2018 14:04:37 +0100 Subject: [PATCH 109/120] FIX: deal with the case where preprocessor is None --- metric_learn/base_metric.py | 4 ++-- test/test_utils.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index 613613b1..9af79ecc 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -41,11 +41,11 @@ def check_preprocessor(self): """Initializes the 
preprocessor""" if _is_arraylike(self.preprocessor): self.preprocessor_ = ArrayIndexer(self.preprocessor) - elif callable(self.preprocessor): + elif callable(self.preprocessor) or self.preprocessor is None: self.preprocessor_ = self.preprocessor else: raise ValueError("Invalid type for the preprocessor: {}. You should " - "provide either an array-like object, " + "provide either None, an array-like object, " "or a callable.".format(type(self.preprocessor))) def _prepare_inputs(self, X, y=None, type_of_inputs='classic', diff --git a/test/test_utils.py b/test/test_utils.py index 9d21ab72..ee160097 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -739,7 +739,7 @@ class MockMetricLearner(MahalanobisMixin): with pytest.raises(ValueError) as e: mock_algo.check_preprocessor() assert str(e.value) == ("Invalid type for the preprocessor: {}. You should " - "provide either an array-like object, " + "provide either None, an array-like object, " "or a callable.".format(type(preprocessor))) From 48bce7da92dc42d15493fe732a30a201d2429eda Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Mon, 3 Dec 2018 16:13:10 +0100 Subject: [PATCH 110/120] WIP refactor build_dataset --- test/test_mahalanobis_mixin.py | 87 ++--------- test/test_sklearn_compat.py | 160 ++++---------------- test/test_utils.py | 264 +++++++++++++++++++++------------ 3 files changed, 204 insertions(+), 307 deletions(-) diff --git a/test/test_mahalanobis_mixin.py b/test/test_mahalanobis_mixin.py index b8ebe0f8..bcf3ec11 100644 --- a/test/test_mahalanobis_mixin.py +++ b/test/test_mahalanobis_mixin.py @@ -5,79 +5,21 @@ from numpy.testing import assert_array_almost_equal from scipy.spatial.distance import pdist, squareform from sklearn import clone -from sklearn.datasets import load_iris -from sklearn.utils import check_random_state, shuffle +from sklearn.utils import check_random_state from sklearn.utils.testing import set_random_state -from metric_learn import (Constraints, ITML, LSML, MMC, SDML, 
Covariance, LFDA, - LMNN, MLKR, NCA, RCA) from metric_learn._util import make_context -from metric_learn.constraints import wrap_pairs -from functools import partial -RNG = check_random_state(0) +from test.test_utils import ids_estimators, list_estimators -def build_data(): - dataset = load_iris() - X, y = shuffle(dataset.data, dataset.target, random_state=RNG) - num_constraints = 20 - constraints = Constraints.random_subset(y, random_state=RNG) - pairs = constraints.positive_negative_pairs(num_constraints, - same_length=True, - random_state=RNG) - return X, pairs - - -def build_pairs(): - # test that you can do cross validation on tuples of points with - # a WeaklySupervisedMetricLearner - X, pairs = build_data() - pairs, y = wrap_pairs(X, pairs) - pairs, y = shuffle(pairs, y, random_state=RNG) - return pairs, y - - -def build_quadruplets(): - # test that you can do cross validation on a tuples of points with - # a WeaklySupervisedMetricLearner - X, pairs = build_data() - c = np.column_stack(pairs) - quadruplets = X[c] - quadruplets = shuffle(quadruplets, random_state=RNG) - return quadruplets, None - - -list_estimators = [(Covariance(), build_data), - (ITML(), build_pairs), - (LFDA(), partial(load_iris, return_X_y=True)), - (LMNN(), partial(load_iris, return_X_y=True)), - (LSML(), build_quadruplets), - (MLKR(), partial(load_iris, return_X_y=True)), - (MMC(), build_pairs), - (NCA(), partial(load_iris, return_X_y=True)), - (RCA(), partial(load_iris, return_X_y=True)), - (SDML(), build_pairs) - ] - -ids_estimators = ['covariance', - 'itml', - 'lfda', - 'lmnn', - 'lsml', - 'mlkr', - 'mmc', - 'nca', - 'rca', - 'sdml', - ] +RNG = check_random_state(0) @pytest.mark.parametrize('estimator, build_dataset', list_estimators, ids=ids_estimators) def test_score_pairs_pairwise(estimator, build_dataset): # Computing pairwise scores should return a euclidean distance matrix. 
- input_data, labels = build_dataset() - X, _ = load_iris(return_X_y=True) + input_data, labels, _, X = build_dataset() n_samples = 20 X = X[:n_samples] model = clone(estimator) @@ -101,8 +43,7 @@ def test_score_pairs_pairwise(estimator, build_dataset): ids=ids_estimators) def test_score_pairs_toy_example(estimator, build_dataset): # Checks that score_pairs works on a toy example - input_data, labels = build_dataset() - X, _ = load_iris(return_X_y=True) + input_data, labels, _, X = build_dataset() n_samples = 20 X = X[:n_samples] model = clone(estimator) @@ -120,11 +61,10 @@ def test_score_pairs_toy_example(estimator, build_dataset): ids=ids_estimators) def test_score_pairs_finite(estimator, build_dataset): # tests that the score is finite - input_data, labels = build_dataset() + input_data, labels, _, X = build_dataset() model = clone(estimator) set_random_state(model) model.fit(input_data, labels) - X, _ = load_iris(return_X_y=True) pairs = np.array(list(product(X, X))) assert np.isfinite(model.score_pairs(pairs)).all() @@ -135,11 +75,10 @@ def test_score_pairs_dim(estimator, build_dataset): # scoring of 3D arrays should return 1D array (several tuples), # and scoring of 2D arrays (one tuple) should return an error (like # scikit-learn's error when scoring 1D arrays) - input_data, labels = build_dataset() + input_data, labels, _, X = build_dataset() model = clone(estimator) set_random_state(model) model.fit(input_data, labels) - X, _ = load_iris(return_X_y=True) tuples = np.array(list(product(X, X))) assert model.score_pairs(tuples).shape == (tuples.shape[0],) context = make_context(estimator) @@ -165,8 +104,7 @@ def check_is_distance_matrix(pairwise): ids=ids_estimators) def test_embed_toy_example(estimator, build_dataset): # Checks that embed works on a toy example - input_data, labels = build_dataset() - X, _ = load_iris(return_X_y=True) + input_data, labels, _, X = build_dataset() n_samples = 20 X = X[:n_samples] model = clone(estimator) @@ -180,11 +118,10 @@ 
def test_embed_toy_example(estimator, build_dataset): ids=ids_estimators) def test_embed_dim(estimator, build_dataset): # Checks that the the dimension of the output space is as expected - input_data, labels = build_dataset() + input_data, labels, _, X = build_dataset() model = clone(estimator) set_random_state(model) model.fit(input_data, labels) - X, _ = load_iris(return_X_y=True) assert model.transform(X).shape == X.shape # assert that ValueError is thrown if input shape is 1D @@ -210,11 +147,10 @@ def test_embed_dim(estimator, build_dataset): ids=ids_estimators) def test_embed_finite(estimator, build_dataset): # Checks that embed returns vectors with finite values - input_data, labels = build_dataset() + input_data, labels, _, X = build_dataset() model = clone(estimator) set_random_state(model) model.fit(input_data, labels) - X, _ = load_iris(return_X_y=True) assert np.isfinite(model.transform(X)).all() @@ -222,11 +158,10 @@ def test_embed_finite(estimator, build_dataset): ids=ids_estimators) def test_embed_is_linear(estimator, build_dataset): # Checks that the embedding is linear - input_data, labels = build_dataset() + input_data, labels, _, X = build_dataset() model = clone(estimator) set_random_state(model) model.fit(input_data, labels) - X, _ = load_iris(return_X_y=True) assert_array_almost_equal(model.transform(X[:10] + X[10:20]), model.transform(X[:10]) + model.transform(X[10:20])) diff --git a/test/test_sklearn_compat.py b/test/test_sklearn_compat.py index 37952268..89100054 100644 --- a/test/test_sklearn_compat.py +++ b/test/test_sklearn_compat.py @@ -2,23 +2,22 @@ import unittest from sklearn.utils.estimator_checks import check_estimator from sklearn.base import TransformerMixin -from sklearn.datasets import load_iris, make_regression, make_blobs from sklearn.pipeline import make_pipeline -from sklearn.utils import shuffle, check_random_state +from sklearn.utils import check_random_state from sklearn.utils.estimator_checks import is_public_parameter 
from sklearn.utils.testing import (assert_allclose_dense_sparse, set_random_state) -from sklearn.utils.fixes import signature -from metric_learn import (Covariance, ITML, LFDA, LMNN, LSML, MLKR, MMC, NCA, - RCA, SDML, ITML_Supervised, LSML_Supervised, +from metric_learn import (Covariance, LFDA, LMNN, MLKR, NCA, + ITML_Supervised, LSML_Supervised, MMC_Supervised, RCA_Supervised, SDML_Supervised) -from metric_learn.constraints import wrap_pairs, Constraints from sklearn import clone import numpy as np from sklearn.model_selection import (cross_val_score, cross_val_predict, train_test_split, KFold) from sklearn.utils.testing import _get_args +from test.test_utils import (list_estimators, ids_estimators, + mock_preprocessor) # Wrap the _Supervised methods with a deterministic wrapper for testing. @@ -88,119 +87,14 @@ def test_mmc(self): # ---------------------- Test scikit-learn compatibility ---------------------- -def build_data(): - dataset = load_iris() - X, y = shuffle(dataset.data, dataset.target, random_state=RNG) - num_constraints = 50 - constraints = Constraints.random_subset(y, random_state=RNG) - pairs = constraints.positive_negative_pairs(num_constraints, - same_length=True, - random_state=RNG) - return X, pairs - - -def build_classification(preprocessor): - # builds a toy classification problem - X, y = shuffle(*make_blobs(), random_state=RNG) - X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RNG) - # we will return twice X here for compatibility with the rest of the tests - # (the first X is the original dataset and the second X represent the - # tuples given to the models (which are in fact the original dataset here) - return (X, X, y, X_train, X_test, y_train, y_test, preprocessor) - - -def build_regression(preprocessor): - # builds a toy regression problem - X, y = shuffle(*make_regression(n_samples=100, n_features=10), - random_state=RNG) - X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RNG) - # we will 
return twice X here for compatibility with the rest of the tests - # (the first X is the original dataset and the second X represent the - # tuples given to the models (which are in fact the original dataset here) - return (X, X, y, X_train, X_test, y_train, y_test, preprocessor) - - -def build_pairs(preprocessor): - # builds a toy pairs problem - X, indices = build_data() - if preprocessor is not None: - # if preprocessor, we build a 2D array of pairs of indices - _, y = wrap_pairs(X, indices) - pairs = np.vstack([np.column_stack(indices[:2]), - np.column_stack(indices[2:])]) - else: - # if not, we build a 3D array of pairs of samples - pairs, y = wrap_pairs(X, indices) - pairs, y = shuffle(pairs, y, random_state=RNG) - (pairs_train, pairs_test, y_train, - y_test) = train_test_split(pairs, y, random_state=RNG) - return (X, pairs, y, pairs_train, pairs_test, - y_train, y_test, preprocessor) - - -def build_quadruplets(preprocessor): - # builds a toy quadruplets problem - X, indices = build_data() - c = np.column_stack(indices) - if preprocessor is not None: - # if preprocessor, we build a 2D array of quadruplets of indices - quadruplets = c - else: - # if not, we build a 3D array of quadruplets of samples - quadruplets = X[c] - quadruplets = shuffle(quadruplets, random_state=RNG) - y = y_train = y_test = None - quadruplets_train, quadruplets_test = train_test_split(quadruplets, - random_state=RNG) - return (X, quadruplets, y, quadruplets_train, quadruplets_test, - y_train, y_test, preprocessor) - - -list_estimators = [(Covariance(), build_classification), - (ITML(), build_pairs), - (LFDA(), build_classification), - (LMNN(), build_classification), - (LSML(), build_quadruplets), - (MLKR(), build_regression), - (MMC(max_iter=2), build_pairs), # max_iter=2 for faster - # testing - (NCA(), build_classification), - (RCA(), build_classification), - (SDML(), build_pairs), - (ITML_Supervised(), build_classification), - (LSML_Supervised(), build_classification), - 
(MMC_Supervised(), build_classification), - (RCA_Supervised(num_chunks=10), build_classification), - (SDML_Supervised(), build_classification) - ] - -ids_estimators = ['covariance', - 'itml', - 'lfda', - 'lmnn', - 'lsml', - 'mlkr', - 'mmc', - 'nca', - 'rca', - 'sdml', - 'itml_supervised', - 'lsml_supervised', - 'mmc_supervised', - 'rca_supervised', - 'sdml_supervised' - ] - - -@pytest.mark.parametrize('preprocessor', [None, build_data()[0]]) +@pytest.mark.parametrize('preprocessor', [True, False]) @pytest.mark.parametrize('estimator, build_dataset', list_estimators, ids=ids_estimators) def test_cross_validation_is_finite(estimator, build_dataset, preprocessor): """Tests that validation on metric-learn estimators returns something finite """ if any(hasattr(estimator, method) for method in ["predict", "score"]): - (X, tuples, y, tuples_train, tuples_test, - y_train, y_test, preprocessor) = build_dataset(preprocessor) + tuples, y, preprocessor, _ = build_dataset(preprocessor) estimator = clone(estimator) estimator.set_params(preprocessor=preprocessor) set_random_state(estimator) @@ -210,7 +104,7 @@ def test_cross_validation_is_finite(estimator, build_dataset, preprocessor): assert np.isfinite(cross_val_predict(estimator, tuples, y)).all() -@pytest.mark.parametrize('preprocessor', [None, build_data()[0]]) +@pytest.mark.parametrize('preprocessor', [True, False]) @pytest.mark.parametrize('estimator, build_dataset', list_estimators, ids=ids_estimators) def test_cross_validation_manual_vs_scikit(estimator, build_dataset, @@ -220,8 +114,7 @@ def test_cross_validation_manual_vs_scikit(estimator, build_dataset, folds is taken from scikit-learn). 
""" if any(hasattr(estimator, method) for method in ["predict", "score"]): - (X, tuples, y, tuples_train, tuples_test, - y_train, y_test, preprocessor) = build_dataset(preprocessor) + tuples, y, preprocessor, _ = build_dataset(preprocessor) estimator = clone(estimator) estimator.set_params(preprocessor=preprocessor) set_random_state(estimator) @@ -238,8 +131,7 @@ def test_cross_validation_manual_vs_scikit(estimator, build_dataset, test_slice = slice(start, stop) train_mask = np.ones(tuples.shape[0], bool) train_mask[test_slice] = False - (y_train, y_test) = ((y[train_mask], y[test_slice]) if y is not None - else (None, None)) + y_train, y_test = y[train_mask], y[test_slice] estimator.fit(tuples[train_mask], y_train) if hasattr(estimator, "score"): scores.append(estimator.score(tuples[test_slice], y_test)) @@ -264,15 +156,16 @@ def check_predict(estimator, tuples): assert len(y_predicted), len(tuples) -@pytest.mark.parametrize('preprocessor', [None, build_data()[0]]) +@pytest.mark.parametrize('preprocessor', [True, False]) @pytest.mark.parametrize('estimator, build_dataset', list_estimators, ids=ids_estimators) def test_simple_estimator(estimator, build_dataset, preprocessor): """Tests that fit, predict and scoring works. 
""" if any(hasattr(estimator, method) for method in ["predict", "score"]): - (X, tuples, y, tuples_train, tuples_test, - y_train, y_test, preprocessor) = build_dataset(preprocessor) + tuples, y, preprocessor, _ = build_dataset(preprocessor) + (tuples_train, tuples_test, y_train, + y_test) = train_test_split(tuples, y, random_state=RNG) estimator = clone(estimator) estimator.set_params(preprocessor=preprocessor) set_random_state(estimator) @@ -284,7 +177,7 @@ def test_simple_estimator(estimator, build_dataset, preprocessor): @pytest.mark.parametrize('estimator', [est[0] for est in list_estimators], ids=ids_estimators) -@pytest.mark.parametrize('preprocessor', [None, build_data()[0]]) +@pytest.mark.parametrize('preprocessor', [None, mock_preprocessor]) def test_no_attributes_set_in_init(estimator, preprocessor): """Check setting during init. Adapted from scikit-learn.""" estimator = clone(estimator) @@ -314,26 +207,25 @@ def test_no_attributes_set_in_init(estimator, preprocessor): "attributes %s." 
% (type(estimator).__name__, sorted(invalid_attr))) -@pytest.mark.parametrize('preprocessor', [None, build_data()[0]]) +@pytest.mark.parametrize('preprocessor', [True, False]) @pytest.mark.parametrize('estimator, build_dataset', list_estimators, ids=ids_estimators) def test_estimators_fit_returns_self(estimator, build_dataset, preprocessor): """Check if self is returned when calling fit""" # Adapted from scikit-learn - (X, tuples, y, tuples_train, tuples_test, - y_train, y_test, preprocessor) = build_dataset(preprocessor) + tuples, y, preprocessor, _ = build_dataset(preprocessor) estimator = clone(estimator) estimator.set_params(preprocessor=preprocessor) assert estimator.fit(tuples, y) is estimator -@pytest.mark.parametrize('preprocessor', [None, build_data()[0]]) +@pytest.mark.parametrize('preprocessor', [True, False]) @pytest.mark.parametrize('estimator, build_dataset', list_estimators, ids=ids_estimators) def test_pipeline_consistency(estimator, build_dataset, preprocessor): # Adapted from scikit learn # check that make_pipeline(est) gives same score as est - (_, input_data, y, _, _, _, _, preprocessor) = build_dataset(preprocessor) + input_data, y, preprocessor, _ = build_dataset(preprocessor) def make_random_state(estimator, in_pipeline): rs = {} @@ -368,13 +260,12 @@ def make_random_state(estimator, in_pipeline): assert_allclose_dense_sparse(result, result_pipe) -@pytest.mark.parametrize('preprocessor', [None, build_data()[0]]) +@pytest.mark.parametrize('preprocessor',[True, False]) @pytest.mark.parametrize('estimator, build_dataset', list_estimators, ids=ids_estimators) def test_dict_unchanged(estimator, build_dataset, preprocessor): # Adapted from scikit-learn - (X, tuples, y, tuples_train, tuples_test, - y_train, y_test, preprocessor) = build_dataset(preprocessor) + tuples, y, preprocessor, to_transform = build_dataset(preprocessor) estimator = clone(estimator) estimator.set_params(preprocessor=preprocessor) if hasattr(estimator, "num_dims"): @@ -391,19 
+282,18 @@ def check_dict(): check_dict() if hasattr(estimator, "transform"): dict_before = estimator.__dict__.copy() - # we transform only 2D arrays (dataset of points) - estimator.transform(X) + # we transform only dataset of points + estimator.transform(to_transform) check_dict() -@pytest.mark.parametrize('preprocessor', [None, build_data()[0]]) +@pytest.mark.parametrize('preprocessor',[True, False]) @pytest.mark.parametrize('estimator, build_dataset', list_estimators, ids=ids_estimators) def test_dont_overwrite_parameters(estimator, build_dataset, preprocessor): # Adapted from scikit-learn # check that fit method only changes or sets private attributes - (X, tuples, y, tuples_train, tuples_test, - y_train, y_test, preprocessor) = build_dataset(preprocessor) + tuples, y, preprocessor, _ = build_dataset(preprocessor) estimator = clone(estimator) estimator.set_params(preprocessor=preprocessor) if hasattr(estimator, "num_dims"): diff --git a/test/test_utils.py b/test/test_utils.py index ee160097..4b7326c1 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -11,11 +11,133 @@ check_collapsed_pairs) from metric_learn import (ITML, LSML, MMC, RCA, SDML, Covariance, LFDA, LMNN, MLKR, NCA, ITML_Supervised, LSML_Supervised, - MMC_Supervised, RCA_Supervised, SDML_Supervised) -from metric_learn.base_metric import ArrayIndexer, MahalanobisMixin + MMC_Supervised, RCA_Supervised, SDML_Supervised, + Constraints) +from metric_learn.base_metric import (ArrayIndexer, MahalanobisMixin, + _PairsClassifierMixin, + _QuadrupletsClassifierMixin) from metric_learn.exceptions import PreprocessorError -from sklearn.datasets import make_regression, make_blobs -from .test_sklearn_compat import build_pairs, build_quadruplets +from sklearn.datasets import make_regression, make_blobs, load_iris + + +SEED = 42 +RNG = check_random_state(SEED) + +Dataset = namedtuple('Dataset', ('data target preprocessor to_transform')) +# Data and target are what we will fit on. 
Preprocessor is the additional +# data if we use a preprocessor (which should be the default ArrayIndexer), +# and to_transform is some additional data that we would want to transform + + +@pytest.fixture +def build_classification(preprocessor=False): + """Basic array for testing when using a preprocessor""" + X, y = shuffle(*make_blobs(random_state=SEED), + random_state=SEED) + indices = shuffle(np.arange(X.shape[0]), random_state=SEED).astype(int) + if preprocessor: + return Dataset(indices, y[indices], X, indices) + else: + return Dataset(X[indices], y[indices], None, X[indices]) + + +@pytest.fixture +def build_regression(preprocessor=False): + """Basic array for testing when using a preprocessor""" + X, y = shuffle(*make_regression(n_samples=100, n_features=5, + random_state=SEED), + random_state=SEED) + indices = shuffle(np.arange(X.shape[0]), random_state=SEED).astype(int) + if preprocessor: + return Dataset(indices, y[indices], X, indices) + else: + return Dataset(X[indices], y[indices], None, X[indices]) + + +def build_data(): + input_data, labels = load_iris(return_X_y=True) + X, y = shuffle(input_data, labels, random_state=SEED) + num_constraints = 50 + constraints = ( + Constraints.random_subset(y, random_state=check_random_state(SEED))) + pairs = ( + constraints + .positive_negative_pairs(num_constraints, same_length=True, + random_state=check_random_state(SEED))) + return X, pairs + + +def build_pairs(preprocessor=False): + # builds a toy pairs problem + X, indices = build_data() + c = np.vstack([np.column_stack(indices[:2]), np.column_stack(indices[2:])]) + target = np.concatenate([np.ones(indices[0].shape[0]), + - np.ones(indices[0].shape[0])]) + if preprocessor: + # if preprocessor, we build a 2D array of pairs of indices + return Dataset(*shuffle(c, target, random_state=SEED), X, c[:, 0]) + else: + # if not, we build a 3D array of pairs of samples + return Dataset(*shuffle(X[c], target, random_state=SEED), None, X[c[:, 0]]) + + +def 
build_quadruplets(preprocessor=False): + # builds a toy quadruplets problem + X, indices = build_data() + c = np.column_stack(indices) + target = np.ones(c.shape[0]) # quadruplets targets are not used + # anyways + if preprocessor: + # if preprocessor, we build a 2D array of quadruplets of indices + return Dataset(*shuffle(c, target, random_state=SEED), X, c[:, 0]) + else: + # if not, we build a 3D array of quadruplets of samples + return Dataset(*shuffle(X[c], target, random_state=SEED), None, X[c[:, 0]]) + + +quadruplets_learners = [(LSML(), build_quadruplets)] +ids_quadruplets_learners = list(map(lambda x: x.__class__.__name__, + [learner for (learner, _) in + quadruplets_learners])) + +pairs_learners = [(ITML(), build_pairs), + (MMC(max_iter=2), build_pairs), # max_iter=2 for faster + (SDML(), build_pairs), + ] +ids_pairs_learners = list(map(lambda x: x.__class__.__name__, + [learner for (learner, _) in + pairs_learners])) + +classifiers = [(Covariance(), build_classification), + (LFDA(), build_classification), + (LMNN(), build_classification), + (NCA(), build_classification), + (RCA(), build_classification), + (ITML_Supervised(max_iter=5), build_classification), + (LSML_Supervised(), build_classification), + (MMC_Supervised(max_iter=5), build_classification), + (RCA_Supervised(num_chunks=10), build_classification), + (SDML_Supervised(), build_classification) + ] +ids_classifiers = list(map(lambda x: x.__class__.__name__, + [learner for (learner, _) in + classifiers])) + +regressors = [(MLKR(), build_regression)] +ids_regressors = list(map(lambda x: x.__class__.__name__, + [learner for (learner, _) in regressors])) + +WeaklySupervisedClasses = (_PairsClassifierMixin, + _QuadrupletsClassifierMixin) + +tuples_learners = pairs_learners + quadruplets_learners +ids_tuples_learners = ids_pairs_learners + ids_quadruplets_learners + +supervised_learners = classifiers + regressors +ids_supervised_learners = ids_classifiers + ids_regressors + +list_estimators = 
tuples_learners + supervised_learners +ids_estimators = ids_tuples_learners + ids_supervised_learners def mock_preprocessor(indices): @@ -762,17 +884,9 @@ def test_error_message_tuple_size(estimator): assert str(raised_err.value) == expected_msg -@pytest.mark.parametrize('estimator', [ITML(), LSML(), MMC(), RCA(), SDML(), - Covariance(), LFDA(), LMNN(), MLKR(), - NCA(), ITML_Supervised(), - LSML_Supervised(), MMC_Supervised(), - RCA_Supervised(), SDML_Supervised()], - ids=['ITML', 'LSML', 'MMC', 'RCA', 'SDML', - 'Covariance', 'LFDA', 'LMNN', 'MLKR', 'NCA', - 'ITML_Supervised', 'LSML_Supervised', - 'MMC_Supervised', 'RCA_Supervised', - 'SDML_Supervised']) -def test_error_message_t_score_pairs(estimator): +@pytest.mark.parametrize('estimator, _', list_estimators, + ids=ids_estimators) +def test_error_message_t_score_pairs(estimator, _): """tests that if you want to score_pairs on triplets for instance, it returns the right error message """ @@ -826,95 +940,54 @@ def fun(row): # with their no-preprocessor equivalent -Dataset = namedtuple('Dataset', 'formed_points points_indicators labels data') - - -@pytest.fixture -def build_classification(rng): - """Basic array for testing when using a preprocessor""" - X, y = shuffle(*make_blobs(random_state=rng), - random_state=rng) - indices = shuffle(np.arange(X.shape[0]), random_state=rng) - indices = indices.astype(int) - return Dataset(X[indices], indices, y, X) - - -@pytest.fixture -def build_regression(rng): - """Basic array for testing when using a preprocessor""" - X, y = shuffle(*make_regression(n_samples=100, n_features=5, - random_state=rng), - random_state=rng) - indices = shuffle(np.arange(X.shape[0]), random_state=rng) - indices = indices.astype(int) - return Dataset(X[indices], indices, y, X) - - -RNG = check_random_state(0) - -classifiers = [Covariance(), - LFDA(), - LMNN(), - NCA(), - RCA(), - ITML_Supervised(max_iter=5), - LSML_Supervised(), - MMC_Supervised(max_iter=5), - RCA_Supervised(num_chunks=10), # less 
chunks because we only - # have a few data in the test - SDML_Supervised()] - -regressors = [MLKR()] - -estimators = [(classifier, build_classification(RNG)) for classifier in - classifiers] -estimators += [(regressor, build_regression(RNG)) for regressor in - regressors] - -ids_estimators = list(map(lambda x: x.__class__.__name__, classifiers + - regressors)) - - -@pytest.mark.parametrize('estimator, dataset', estimators, - ids=ids_estimators) -def test_same_with_or_without_preprocessor_classic(estimator, dataset): +@pytest.mark.parametrize('estimator, build_dataset', supervised_learners, + ids=ids_supervised_learners) +def test_same_with_or_without_preprocessor_classic(estimator, build_dataset): """Test that supervised algorithms using a preprocessor behave consistently with their no-preprocessor equivalent. """ - (formed_points_train, formed_points_test, - y_train, y_test, points_indicators_train, - points_indicators_test) = train_test_split(dataset.formed_points, - dataset.labels, - dataset.points_indicators, - random_state=RNG) + dataset_with_preprocessor = build_dataset(preprocessor=True) + dataset_without_preprocessor = build_dataset(preprocessor=False) + preprocessor = dataset_with_preprocessor.preprocessor + (points_indicators_train, points_indicators_test, y_train, + y_test, formed_points_train, + formed_points_test) = train_test_split( + dataset_with_preprocessor.data, + dataset_with_preprocessor.target, + dataset_without_preprocessor.data, + random_state=SEED) + formed_points_to_transform = dataset_without_preprocessor.to_transform + points_indicators_to_transform = dataset_with_preprocessor.to_transform def make_random_state(estimator): rs = {} if estimator.__class__.__name__[-11:] == '_Supervised': - rs['random_state'] = check_random_state(0) + rs['random_state'] = check_random_state(SEED) return rs estimator_without_prep = clone(estimator) - set_random_state(estimator_without_prep) + set_random_state(estimator_without_prep, SEED) 
estimator_without_prep.set_params(preprocessor=None) estimator_without_prep.fit(formed_points_train, y_train, **make_random_state(estimator)) - embedding_without_prep = estimator_without_prep.transform(formed_points_test) + embedding_without_prep = estimator_without_prep.transform( + formed_points_to_transform) estimator_with_prep = clone(estimator) - set_random_state(estimator_with_prep) - estimator_with_prep.set_params(preprocessor=dataset.data) + set_random_state(estimator_with_prep, SEED) + estimator_with_prep.set_params(preprocessor=preprocessor) estimator_with_prep.fit(points_indicators_train, y_train, **make_random_state(estimator)) - embedding_with_prep = estimator_with_prep.transform(points_indicators_test) + embedding_with_prep = estimator_with_prep.transform( + points_indicators_to_transform) estimator_with_prep_formed = clone(estimator) - set_random_state(estimator_with_prep_formed) - estimator_with_prep_formed.set_params(preprocessor=dataset.data) + set_random_state(estimator_with_prep_formed, SEED) + estimator_with_prep_formed.set_params(preprocessor=preprocessor) estimator_with_prep_formed.fit(formed_points_train, y_train, **make_random_state(estimator)) embedding_with_prep_formed = estimator_with_prep_formed.transform( - formed_points_test) + formed_points_to_transform) # test transform assert (embedding_with_prep == embedding_without_prep).all() @@ -922,31 +995,30 @@ def make_random_state(estimator): # test score_pairs assert (estimator_without_prep.score_pairs( - formed_points_test[np.array([[0, 2], [5, 3]])]) == + formed_points_to_transform[np.array([[0, 2], [5, 3]])]) == estimator_with_prep.score_pairs( - points_indicators_test[np.array([[0, 2], [5, 3]])])).all() + (points_indicators_to_transform)[np.array([[0, 2], [5, 3]])])).all() assert ( estimator_with_prep.score_pairs( - points_indicators_test[np.array([[0, 2], [5, 3]])]) == + (points_indicators_to_transform)[np.array([[0, 2], [5, 3]])]) == estimator_with_prep_formed.score_pairs( - 
formed_points_test[np.array([[0, 2], [5, 3]])])).all() + (formed_points_to_transform)[np.array([[0, 2], [5, 3]])])).all() -@pytest.mark.parametrize('estimator, build_dataset', - [(ITML(), build_pairs), - (LSML(), build_quadruplets), - (MMC(max_iter=2), build_pairs), - (SDML(), build_pairs)], - ids=['itml', 'lsml', 'mmc', 'sdml']) +@pytest.mark.parametrize('estimator, build_dataset', tuples_learners, + ids=ids_tuples_learners) def test_same_with_or_without_preprocessor_tuples(estimator, build_dataset): """For weakly supervised algorithms, test that using a preprocessor or not (with the appropriate corresponding inputs) give the same result. """ - (X, tuples, y, tuples_train, tuples_test, y_train, - y_test, _) = build_dataset(preprocessor=mock_preprocessor) - formed_tuples_train = X[tuples_train] - formed_tuples_test = X[tuples_test] + dataset = build_dataset(preprocessor=True) + dataset_formed = build_dataset(preprocessor=False) + X = dataset.preprocessor + (tuples_train, tuples_test, y_train, y_test, formed_tuples_train, + formed_tuples_test) = train_test_split(dataset.data, dataset.target, + dataset_formed.data, + random_state=SEED) estimator_with_preprocessor = clone(estimator) set_random_state(estimator_with_preprocessor) From c0cc882a89fb18b0de486b9545f7d3dea15b2560 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Wed, 5 Dec 2018 14:32:39 +0100 Subject: [PATCH 111/120] MAINT: refactor bool preprocessor to with_preprocessor --- test/test_sklearn_compat.py | 46 ++++++++++++++++++++----------------- test/test_utils.py | 16 ++++++------- 2 files changed, 33 insertions(+), 29 deletions(-) diff --git a/test/test_sklearn_compat.py b/test/test_sklearn_compat.py index 89100054..ae8d91b5 100644 --- a/test/test_sklearn_compat.py +++ b/test/test_sklearn_compat.py @@ -87,14 +87,15 @@ def test_mmc(self): # ---------------------- Test scikit-learn compatibility ---------------------- -@pytest.mark.parametrize('preprocessor', [True, False]) 
+@pytest.mark.parametrize('with_preprocessor', [True, False]) @pytest.mark.parametrize('estimator, build_dataset', list_estimators, ids=ids_estimators) -def test_cross_validation_is_finite(estimator, build_dataset, preprocessor): +def test_cross_validation_is_finite(estimator, build_dataset, + with_preprocessor): """Tests that validation on metric-learn estimators returns something finite """ if any(hasattr(estimator, method) for method in ["predict", "score"]): - tuples, y, preprocessor, _ = build_dataset(preprocessor) + tuples, y, preprocessor, _ = build_dataset(with_preprocessor) estimator = clone(estimator) estimator.set_params(preprocessor=preprocessor) set_random_state(estimator) @@ -104,17 +105,17 @@ def test_cross_validation_is_finite(estimator, build_dataset, preprocessor): assert np.isfinite(cross_val_predict(estimator, tuples, y)).all() -@pytest.mark.parametrize('preprocessor', [True, False]) +@pytest.mark.parametrize('with_preprocessor', [True, False]) @pytest.mark.parametrize('estimator, build_dataset', list_estimators, ids=ids_estimators) def test_cross_validation_manual_vs_scikit(estimator, build_dataset, - preprocessor): + with_preprocessor): """Tests that if we make a manual cross-validation, the result will be the same as scikit-learn's cross-validation (some code for generating the folds is taken from scikit-learn). 
""" if any(hasattr(estimator, method) for method in ["predict", "score"]): - tuples, y, preprocessor, _ = build_dataset(preprocessor) + tuples, y, preprocessor, _ = build_dataset(with_preprocessor) estimator = clone(estimator) estimator.set_params(preprocessor=preprocessor) set_random_state(estimator) @@ -156,14 +157,14 @@ def check_predict(estimator, tuples): assert len(y_predicted), len(tuples) -@pytest.mark.parametrize('preprocessor', [True, False]) +@pytest.mark.parametrize('with_preprocessor', [True, False]) @pytest.mark.parametrize('estimator, build_dataset', list_estimators, ids=ids_estimators) -def test_simple_estimator(estimator, build_dataset, preprocessor): +def test_simple_estimator(estimator, build_dataset, with_preprocessor): """Tests that fit, predict and scoring works. """ if any(hasattr(estimator, method) for method in ["predict", "score"]): - tuples, y, preprocessor, _ = build_dataset(preprocessor) + tuples, y, preprocessor, _ = build_dataset(with_preprocessor) (tuples_train, tuples_test, y_train, y_test) = train_test_split(tuples, y, random_state=RNG) estimator = clone(estimator) @@ -207,25 +208,27 @@ def test_no_attributes_set_in_init(estimator, preprocessor): "attributes %s." 
% (type(estimator).__name__, sorted(invalid_attr))) -@pytest.mark.parametrize('preprocessor', [True, False]) +@pytest.mark.parametrize('with_preprocessor', [True, False]) @pytest.mark.parametrize('estimator, build_dataset', list_estimators, ids=ids_estimators) -def test_estimators_fit_returns_self(estimator, build_dataset, preprocessor): +def test_estimators_fit_returns_self(estimator, build_dataset, + with_preprocessor): """Check if self is returned when calling fit""" # Adapted from scikit-learn - tuples, y, preprocessor, _ = build_dataset(preprocessor) + tuples, y, preprocessor, _ = build_dataset(with_preprocessor) estimator = clone(estimator) estimator.set_params(preprocessor=preprocessor) assert estimator.fit(tuples, y) is estimator -@pytest.mark.parametrize('preprocessor', [True, False]) +@pytest.mark.parametrize('with_preprocessor', [True, False]) @pytest.mark.parametrize('estimator, build_dataset', list_estimators, ids=ids_estimators) -def test_pipeline_consistency(estimator, build_dataset, preprocessor): +def test_pipeline_consistency(estimator, build_dataset, + with_preprocessor): # Adapted from scikit learn # check that make_pipeline(est) gives same score as est - input_data, y, preprocessor, _ = build_dataset(preprocessor) + input_data, y, preprocessor, _ = build_dataset(with_preprocessor) def make_random_state(estimator, in_pipeline): rs = {} @@ -260,12 +263,12 @@ def make_random_state(estimator, in_pipeline): assert_allclose_dense_sparse(result, result_pipe) -@pytest.mark.parametrize('preprocessor',[True, False]) +@pytest.mark.parametrize('with_preprocessor',[True, False]) @pytest.mark.parametrize('estimator, build_dataset', list_estimators, ids=ids_estimators) -def test_dict_unchanged(estimator, build_dataset, preprocessor): +def test_dict_unchanged(estimator, build_dataset, with_preprocessor): # Adapted from scikit-learn - tuples, y, preprocessor, to_transform = build_dataset(preprocessor) + tuples, y, preprocessor, to_transform = 
build_dataset(with_preprocessor) estimator = clone(estimator) estimator.set_params(preprocessor=preprocessor) if hasattr(estimator, "num_dims"): @@ -287,13 +290,14 @@ def check_dict(): check_dict() -@pytest.mark.parametrize('preprocessor',[True, False]) +@pytest.mark.parametrize('with_preprocessor',[True, False]) @pytest.mark.parametrize('estimator, build_dataset', list_estimators, ids=ids_estimators) -def test_dont_overwrite_parameters(estimator, build_dataset, preprocessor): +def test_dont_overwrite_parameters(estimator, build_dataset, + with_preprocessor): # Adapted from scikit-learn # check that fit method only changes or sets private attributes - tuples, y, preprocessor, _ = build_dataset(preprocessor) + tuples, y, preprocessor, _ = build_dataset(with_preprocessor) estimator = clone(estimator) estimator.set_params(preprocessor=preprocessor) if hasattr(estimator, "num_dims"): diff --git a/test/test_utils.py b/test/test_utils.py index 4b7326c1..728edc35 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -30,25 +30,25 @@ @pytest.fixture -def build_classification(preprocessor=False): +def build_classification(with_preprocessor=False): """Basic array for testing when using a preprocessor""" X, y = shuffle(*make_blobs(random_state=SEED), random_state=SEED) indices = shuffle(np.arange(X.shape[0]), random_state=SEED).astype(int) - if preprocessor: + if with_preprocessor: return Dataset(indices, y[indices], X, indices) else: return Dataset(X[indices], y[indices], None, X[indices]) @pytest.fixture -def build_regression(preprocessor=False): +def build_regression(with_preprocessor=False): """Basic array for testing when using a preprocessor""" X, y = shuffle(*make_regression(n_samples=100, n_features=5, random_state=SEED), random_state=SEED) indices = shuffle(np.arange(X.shape[0]), random_state=SEED).astype(int) - if preprocessor: + if with_preprocessor: return Dataset(indices, y[indices], X, indices) else: return Dataset(X[indices], y[indices], None, X[indices]) 
@@ -67,13 +67,13 @@ def build_data(): return X, pairs -def build_pairs(preprocessor=False): +def build_pairs(with_preprocessor=False): # builds a toy pairs problem X, indices = build_data() c = np.vstack([np.column_stack(indices[:2]), np.column_stack(indices[2:])]) target = np.concatenate([np.ones(indices[0].shape[0]), - np.ones(indices[0].shape[0])]) - if preprocessor: + if with_preprocessor: # if preprocessor, we build a 2D array of pairs of indices return Dataset(*shuffle(c, target, random_state=SEED), X, c[:, 0]) else: @@ -81,13 +81,13 @@ def build_pairs(preprocessor=False): return Dataset(*shuffle(X[c], target, random_state=SEED), None, X[c[:, 0]]) -def build_quadruplets(preprocessor=False): +def build_quadruplets(with_preprocessor=False): # builds a toy quadruplets problem X, indices = build_data() c = np.column_stack(indices) target = np.ones(c.shape[0]) # quadruplets targets are not used # anyways - if preprocessor: + if with_preprocessor: # if preprocessor, we build a 2D array of quadruplets of indices return Dataset(*shuffle(c, target, random_state=SEED), X, c[:, 0]) else: From 0b3e58a339134d44e5b8d967c008e307b377ef24 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Wed, 5 Dec 2018 14:35:41 +0100 Subject: [PATCH 112/120] FIX: fix build_pairs and build_quadruplets because 'only named arguments should follow expression' --- test/test_utils.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 728edc35..d5f67461 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -75,10 +75,11 @@ def build_pairs(with_preprocessor=False): - np.ones(indices[0].shape[0])]) if with_preprocessor: # if preprocessor, we build a 2D array of pairs of indices - return Dataset(*shuffle(c, target, random_state=SEED), X, c[:, 0]) + return Dataset(*(*shuffle(c, target, random_state=SEED), X, c[:, 0])) else: # if not, we build a 3D array of pairs of samples - return Dataset(*shuffle(X[c], target, 
random_state=SEED), None, X[c[:, 0]]) + return Dataset(*(*shuffle(X[c], target, random_state=SEED), + None, X[c[:, 0]])) def build_quadruplets(with_preprocessor=False): @@ -89,10 +90,11 @@ def build_quadruplets(with_preprocessor=False): # anyways if with_preprocessor: # if preprocessor, we build a 2D array of quadruplets of indices - return Dataset(*shuffle(c, target, random_state=SEED), X, c[:, 0]) + return Dataset(*(*shuffle(c, target, random_state=SEED), X, c[:, 0])) else: # if not, we build a 3D array of quadruplets of samples - return Dataset(*shuffle(X[c], target, random_state=SEED), None, X[c[:, 0]]) + return Dataset(*(*shuffle(X[c], target, random_state=SEED), + None, X[c[:, 0]])) quadruplets_learners = [(LSML(), build_quadruplets)] From fbd72429ae533819d36e7988ecf3a24e4aba9484 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Wed, 5 Dec 2018 14:37:57 +0100 Subject: [PATCH 113/120] STY: fix PEP8 error --- test/test_mahalanobis_mixin.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_mahalanobis_mixin.py b/test/test_mahalanobis_mixin.py index bcf3ec11..27c9cb6f 100644 --- a/test/test_mahalanobis_mixin.py +++ b/test/test_mahalanobis_mixin.py @@ -96,8 +96,8 @@ def check_is_distance_matrix(pairwise): assert (pairwise.diagonal() == 0).all() # identity # triangular inequality tol = 1e-15 - assert (pairwise <= pairwise[:, :, np.newaxis] - + pairwise[:, np.newaxis, :] + tol).all() + assert (pairwise <= pairwise[:, :, np.newaxis] + + pairwise[:, np.newaxis, :] + tol).all() @pytest.mark.parametrize('estimator, build_dataset', list_estimators, From 148012e0cbc1a3c317f58bae7577c7c5bca68d21 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Wed, 5 Dec 2018 14:53:21 +0100 Subject: [PATCH 114/120] MAINT: mututalize test_same_with_or_without_preprocessor_tuples and test_same_with_or_without_preprocessor_classic --- test/test_utils.py | 114 +++++++++++---------------------------------- 1 file changed, 27 insertions(+), 87 
deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index d5f67461..6f44f112 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -937,29 +937,21 @@ def fun(row): assert (preprocess_points(array, fun) == expected_result).all() -# ---------------------------------------------------------------------------- -# test that supervised algorithms using a preprocessor behave consistently +@pytest.mark.parametrize('estimator, build_dataset', list_estimators, + ids=ids_estimators) +def test_same_with_or_without_preprocessor(estimator, build_dataset): + """Test that algorithms using a preprocessor behave consistently # with their no-preprocessor equivalent - - -@pytest.mark.parametrize('estimator, build_dataset', supervised_learners, - ids=ids_supervised_learners) -def test_same_with_or_without_preprocessor_classic(estimator, build_dataset): - """Test that supervised algorithms using a preprocessor behave consistently - with their no-preprocessor equivalent. """ - dataset_with_preprocessor = build_dataset(preprocessor=True) - dataset_without_preprocessor = build_dataset(preprocessor=False) - preprocessor = dataset_with_preprocessor.preprocessor - (points_indicators_train, points_indicators_test, y_train, - y_test, formed_points_train, - formed_points_test) = train_test_split( - dataset_with_preprocessor.data, - dataset_with_preprocessor.target, - dataset_without_preprocessor.data, - random_state=SEED) - formed_points_to_transform = dataset_without_preprocessor.to_transform - points_indicators_to_transform = dataset_with_preprocessor.to_transform + dataset = build_dataset(with_preprocessor=True) + dataset_formed = build_dataset(with_preprocessor=False) + X = dataset.preprocessor + indicators_to_transform = dataset.to_transform + formed_points_to_transform = dataset_formed.to_transform + (tuples_train, tuples_test, y_train, y_test, formed_tuples_train, + formed_tuples_test) = train_test_split(dataset.data, dataset.target, + dataset_formed.data, + 
random_state=SEED) def make_random_state(estimator): rs = {} @@ -967,75 +959,23 @@ def make_random_state(estimator): rs['random_state'] = check_random_state(SEED) return rs - estimator_without_prep = clone(estimator) - set_random_state(estimator_without_prep, SEED) - estimator_without_prep.set_params(preprocessor=None) - estimator_without_prep.fit(formed_points_train, y_train, - **make_random_state(estimator)) - embedding_without_prep = estimator_without_prep.transform( - formed_points_to_transform) - - estimator_with_prep = clone(estimator) - set_random_state(estimator_with_prep, SEED) - estimator_with_prep.set_params(preprocessor=preprocessor) - estimator_with_prep.fit(points_indicators_train, y_train, - **make_random_state(estimator)) - embedding_with_prep = estimator_with_prep.transform( - points_indicators_to_transform) - - estimator_with_prep_formed = clone(estimator) - set_random_state(estimator_with_prep_formed, SEED) - estimator_with_prep_formed.set_params(preprocessor=preprocessor) - estimator_with_prep_formed.fit(formed_points_train, y_train, - **make_random_state(estimator)) - embedding_with_prep_formed = estimator_with_prep_formed.transform( - formed_points_to_transform) - - # test transform - assert (embedding_with_prep == embedding_without_prep).all() - assert (embedding_with_prep == embedding_with_prep_formed).all() - - # test score_pairs - assert (estimator_without_prep.score_pairs( - formed_points_to_transform[np.array([[0, 2], [5, 3]])]) == - estimator_with_prep.score_pairs( - (points_indicators_to_transform)[np.array([[0, 2], [5, 3]])])).all() - - assert ( - estimator_with_prep.score_pairs( - (points_indicators_to_transform)[np.array([[0, 2], [5, 3]])]) == - estimator_with_prep_formed.score_pairs( - (formed_points_to_transform)[np.array([[0, 2], [5, 3]])])).all() - - -@pytest.mark.parametrize('estimator, build_dataset', tuples_learners, - ids=ids_tuples_learners) -def test_same_with_or_without_preprocessor_tuples(estimator, build_dataset): - 
"""For weakly supervised algorithms, test that using a preprocessor or not - (with the appropriate corresponding inputs) give the same result. - """ - dataset = build_dataset(preprocessor=True) - dataset_formed = build_dataset(preprocessor=False) - X = dataset.preprocessor - (tuples_train, tuples_test, y_train, y_test, formed_tuples_train, - formed_tuples_test) = train_test_split(dataset.data, dataset.target, - dataset_formed.data, - random_state=SEED) - estimator_with_preprocessor = clone(estimator) set_random_state(estimator_with_preprocessor) estimator_with_preprocessor.set_params(preprocessor=X) - estimator_with_preprocessor.fit(tuples_train, y_train) + estimator_with_preprocessor.fit(tuples_train, y_train, + **make_random_state(estimator)) estimator_without_preprocessor = clone(estimator) set_random_state(estimator_without_preprocessor) estimator_without_preprocessor.set_params(preprocessor=None) - estimator_without_preprocessor.fit(formed_tuples_train, y_train) + estimator_without_preprocessor.fit(formed_tuples_train, y_train, + **make_random_state(estimator)) estimator_with_prep_formed = clone(estimator) set_random_state(estimator_with_prep_formed) estimator_with_prep_formed.set_params(preprocessor=X) - estimator_with_prep_formed.fit(tuples_train, y_train) + estimator_with_prep_formed.fit(tuples_train, y_train, + **make_random_state(estimator)) # test prediction methods for method in ["predict", "decision_function"]: @@ -1053,28 +993,28 @@ def test_same_with_or_without_preprocessor_tuples(estimator, build_dataset): # test score_pairs output_with_prep = estimator_with_preprocessor.score_pairs( - tuples_test[:, :2]) + indicators_to_transform[[[[0, 2], [5, 3]]]]) output_without_prep = estimator_without_preprocessor.score_pairs( - formed_tuples_test[:, :2]) + formed_points_to_transform[[[[0, 2], [5, 3]]]]) assert np.array(output_with_prep == output_without_prep).all() output_with_prep = estimator_with_preprocessor.score_pairs( - tuples_test[:, :2]) + 
indicators_to_transform[[[[0, 2], [5, 3]]]]) output_without_prep = estimator_with_prep_formed.score_pairs( - formed_tuples_test[:, :2]) + formed_points_to_transform[[[[0, 2], [5, 3]]]]) assert np.array(output_with_prep == output_without_prep).all() # test transform output_with_prep = estimator_with_preprocessor.transform( - tuples_test[:, 0]) + indicators_to_transform) output_without_prep = estimator_without_preprocessor.transform( - formed_tuples_test[:, 0]) + formed_points_to_transform) assert np.array(output_with_prep == output_without_prep).all() output_with_prep = estimator_with_preprocessor.transform( - tuples_test[:, 0]) + indicators_to_transform) output_without_prep = estimator_with_prep_formed.transform( - formed_tuples_test[:, 0]) + formed_points_to_transform) assert np.array(output_with_prep == output_without_prep).all() From 8c5675b34851a4bc1ba6530fdf694fdc8a9a6dd5 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Wed, 5 Dec 2018 14:55:24 +0100 Subject: [PATCH 115/120] TST: give better names in test_same_with_or_without_preprocessor --- test/test_utils.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 6f44f112..e2b7131e 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -943,13 +943,14 @@ def test_same_with_or_without_preprocessor(estimator, build_dataset): """Test that algorithms using a preprocessor behave consistently # with their no-preprocessor equivalent """ - dataset = build_dataset(with_preprocessor=True) + dataset_indices = build_dataset(with_preprocessor=True) dataset_formed = build_dataset(with_preprocessor=False) - X = dataset.preprocessor - indicators_to_transform = dataset.to_transform + X = dataset_indices.preprocessor + indicators_to_transform = dataset_indices.to_transform formed_points_to_transform = dataset_formed.to_transform - (tuples_train, tuples_test, y_train, y_test, formed_tuples_train, - formed_tuples_test) = 
train_test_split(dataset.data, dataset.target, + (indices_train, indices_test, y_train, y_test, formed_train, + formed_test) = train_test_split(dataset_indices.data, + dataset_indices.target, dataset_formed.data, random_state=SEED) @@ -962,33 +963,33 @@ def make_random_state(estimator): estimator_with_preprocessor = clone(estimator) set_random_state(estimator_with_preprocessor) estimator_with_preprocessor.set_params(preprocessor=X) - estimator_with_preprocessor.fit(tuples_train, y_train, + estimator_with_preprocessor.fit(indices_train, y_train, **make_random_state(estimator)) estimator_without_preprocessor = clone(estimator) set_random_state(estimator_without_preprocessor) estimator_without_preprocessor.set_params(preprocessor=None) - estimator_without_preprocessor.fit(formed_tuples_train, y_train, + estimator_without_preprocessor.fit(formed_train, y_train, **make_random_state(estimator)) estimator_with_prep_formed = clone(estimator) set_random_state(estimator_with_prep_formed) estimator_with_prep_formed.set_params(preprocessor=X) - estimator_with_prep_formed.fit(tuples_train, y_train, + estimator_with_prep_formed.fit(indices_train, y_train, **make_random_state(estimator)) # test prediction methods for method in ["predict", "decision_function"]: if hasattr(estimator, method): output_with_prep = getattr(estimator_with_preprocessor, - method)(tuples_test) + method)(indices_test) output_without_prep = getattr(estimator_without_preprocessor, - method)(formed_tuples_test) + method)(formed_test) assert np.array(output_with_prep == output_without_prep).all() output_with_prep = getattr(estimator_with_preprocessor, - method)(tuples_test) + method)(indices_test) output_with_prep_formed = getattr(estimator_with_prep_formed, - method)(formed_tuples_test) + method)(formed_test) assert np.array(output_with_prep == output_with_prep_formed).all() # test score_pairs From 30061e40f55d24bc9edbdd150206b1b705ea8f6a Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Wed, 5 Dec 
2018 14:57:49 +0100 Subject: [PATCH 116/120] MAINT: refactor list_estimators into metric_learners --- test/test_mahalanobis_mixin.py | 34 +++++++++++++++++----------------- test/test_sklearn_compat.py | 34 +++++++++++++++++----------------- test/test_utils.py | 12 ++++++------ 3 files changed, 40 insertions(+), 40 deletions(-) diff --git a/test/test_mahalanobis_mixin.py b/test/test_mahalanobis_mixin.py index 27c9cb6f..0d834f10 100644 --- a/test/test_mahalanobis_mixin.py +++ b/test/test_mahalanobis_mixin.py @@ -10,13 +10,13 @@ from metric_learn._util import make_context -from test.test_utils import ids_estimators, list_estimators +from test.test_utils import ids_metric_learners, metric_learners RNG = check_random_state(0) -@pytest.mark.parametrize('estimator, build_dataset', list_estimators, - ids=ids_estimators) +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) def test_score_pairs_pairwise(estimator, build_dataset): # Computing pairwise scores should return a euclidean distance matrix. 
input_data, labels, _, X = build_dataset() @@ -39,8 +39,8 @@ def test_score_pairs_pairwise(estimator, build_dataset): assert_array_almost_equal(squareform(pairwise), pdist(model.transform(X))) -@pytest.mark.parametrize('estimator, build_dataset', list_estimators, - ids=ids_estimators) +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) def test_score_pairs_toy_example(estimator, build_dataset): # Checks that score_pairs works on a toy example input_data, labels, _, X = build_dataset() @@ -57,8 +57,8 @@ def test_score_pairs_toy_example(estimator, build_dataset): assert_array_almost_equal(model.score_pairs(pairs), distances) -@pytest.mark.parametrize('estimator, build_dataset', list_estimators, - ids=ids_estimators) +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) def test_score_pairs_finite(estimator, build_dataset): # tests that the score is finite input_data, labels, _, X = build_dataset() @@ -69,8 +69,8 @@ def test_score_pairs_finite(estimator, build_dataset): assert np.isfinite(model.score_pairs(pairs)).all() -@pytest.mark.parametrize('estimator, build_dataset', list_estimators, - ids=ids_estimators) +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) def test_score_pairs_dim(estimator, build_dataset): # scoring of 3D arrays should return 1D array (several tuples), # and scoring of 2D arrays (one tuple) should return an error (like @@ -100,8 +100,8 @@ def check_is_distance_matrix(pairwise): pairwise[:, np.newaxis, :] + tol).all() -@pytest.mark.parametrize('estimator, build_dataset', list_estimators, - ids=ids_estimators) +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) def test_embed_toy_example(estimator, build_dataset): # Checks that embed works on a toy example input_data, labels, _, X = build_dataset() @@ -114,8 +114,8 @@ def test_embed_toy_example(estimator, 
build_dataset): assert_array_almost_equal(model.transform(X), embedded_points) -@pytest.mark.parametrize('estimator, build_dataset', list_estimators, - ids=ids_estimators) +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) def test_embed_dim(estimator, build_dataset): # Checks that the the dimension of the output space is as expected input_data, labels, _, X = build_dataset() @@ -143,8 +143,8 @@ def test_embed_dim(estimator, build_dataset): assert str(raised_error.value) == err_msg -@pytest.mark.parametrize('estimator, build_dataset', list_estimators, - ids=ids_estimators) +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) def test_embed_finite(estimator, build_dataset): # Checks that embed returns vectors with finite values input_data, labels, _, X = build_dataset() @@ -154,8 +154,8 @@ def test_embed_finite(estimator, build_dataset): assert np.isfinite(model.transform(X)).all() -@pytest.mark.parametrize('estimator, build_dataset', list_estimators, - ids=ids_estimators) +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) def test_embed_is_linear(estimator, build_dataset): # Checks that the embedding is linear input_data, labels, _, X = build_dataset() diff --git a/test/test_sklearn_compat.py b/test/test_sklearn_compat.py index ae8d91b5..7319935f 100644 --- a/test/test_sklearn_compat.py +++ b/test/test_sklearn_compat.py @@ -16,7 +16,7 @@ from sklearn.model_selection import (cross_val_score, cross_val_predict, train_test_split, KFold) from sklearn.utils.testing import _get_args -from test.test_utils import (list_estimators, ids_estimators, +from test.test_utils import (metric_learners, ids_metric_learners, mock_preprocessor) @@ -88,8 +88,8 @@ def test_mmc(self): @pytest.mark.parametrize('with_preprocessor', [True, False]) -@pytest.mark.parametrize('estimator, build_dataset', list_estimators, - ids=ids_estimators) 
+@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) def test_cross_validation_is_finite(estimator, build_dataset, with_preprocessor): """Tests that validation on metric-learn estimators returns something finite @@ -106,8 +106,8 @@ def test_cross_validation_is_finite(estimator, build_dataset, @pytest.mark.parametrize('with_preprocessor', [True, False]) -@pytest.mark.parametrize('estimator, build_dataset', list_estimators, - ids=ids_estimators) +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) def test_cross_validation_manual_vs_scikit(estimator, build_dataset, with_preprocessor): """Tests that if we make a manual cross-validation, the result will be the @@ -158,8 +158,8 @@ def check_predict(estimator, tuples): @pytest.mark.parametrize('with_preprocessor', [True, False]) -@pytest.mark.parametrize('estimator, build_dataset', list_estimators, - ids=ids_estimators) +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) def test_simple_estimator(estimator, build_dataset, with_preprocessor): """Tests that fit, predict and scoring works. """ @@ -176,8 +176,8 @@ def test_simple_estimator(estimator, build_dataset, with_preprocessor): check_predict(estimator, tuples_test) -@pytest.mark.parametrize('estimator', [est[0] for est in list_estimators], - ids=ids_estimators) +@pytest.mark.parametrize('estimator', [est[0] for est in metric_learners], + ids=ids_metric_learners) @pytest.mark.parametrize('preprocessor', [None, mock_preprocessor]) def test_no_attributes_set_in_init(estimator, preprocessor): """Check setting during init. 
Adapted from scikit-learn.""" @@ -209,8 +209,8 @@ def test_no_attributes_set_in_init(estimator, preprocessor): @pytest.mark.parametrize('with_preprocessor', [True, False]) -@pytest.mark.parametrize('estimator, build_dataset', list_estimators, - ids=ids_estimators) +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) def test_estimators_fit_returns_self(estimator, build_dataset, with_preprocessor): """Check if self is returned when calling fit""" @@ -222,8 +222,8 @@ def test_estimators_fit_returns_self(estimator, build_dataset, @pytest.mark.parametrize('with_preprocessor', [True, False]) -@pytest.mark.parametrize('estimator, build_dataset', list_estimators, - ids=ids_estimators) +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) def test_pipeline_consistency(estimator, build_dataset, with_preprocessor): # Adapted from scikit learn @@ -264,8 +264,8 @@ def make_random_state(estimator, in_pipeline): @pytest.mark.parametrize('with_preprocessor',[True, False]) -@pytest.mark.parametrize('estimator, build_dataset', list_estimators, - ids=ids_estimators) +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) def test_dict_unchanged(estimator, build_dataset, with_preprocessor): # Adapted from scikit-learn tuples, y, preprocessor, to_transform = build_dataset(with_preprocessor) @@ -291,8 +291,8 @@ def check_dict(): @pytest.mark.parametrize('with_preprocessor',[True, False]) -@pytest.mark.parametrize('estimator, build_dataset', list_estimators, - ids=ids_estimators) +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) def test_dont_overwrite_parameters(estimator, build_dataset, with_preprocessor): # Adapted from scikit-learn diff --git a/test/test_utils.py b/test/test_utils.py index e2b7131e..6e28807c 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -138,8 +138,8 @@ def 
build_quadruplets(with_preprocessor=False): supervised_learners = classifiers + regressors ids_supervised_learners = ids_classifiers + ids_regressors -list_estimators = tuples_learners + supervised_learners -ids_estimators = ids_tuples_learners + ids_supervised_learners +metric_learners = tuples_learners + supervised_learners +ids_metric_learners = ids_tuples_learners + ids_supervised_learners def mock_preprocessor(indices): @@ -886,8 +886,8 @@ def test_error_message_tuple_size(estimator): assert str(raised_err.value) == expected_msg -@pytest.mark.parametrize('estimator, _', list_estimators, - ids=ids_estimators) +@pytest.mark.parametrize('estimator, _', metric_learners, + ids=ids_metric_learners) def test_error_message_t_score_pairs(estimator, _): """tests that if you want to score_pairs on triplets for instance, it returns the right error message @@ -937,8 +937,8 @@ def fun(row): assert (preprocess_points(array, fun) == expected_result).all() -@pytest.mark.parametrize('estimator, build_dataset', list_estimators, - ids=ids_estimators) +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) def test_same_with_or_without_preprocessor(estimator, build_dataset): """Test that algorithms using a preprocessor behave consistently # with their no-preprocessor equivalent From e241c287e0ced9220f84c924fd568a0de328247b Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Wed, 5 Dec 2018 15:03:56 +0100 Subject: [PATCH 117/120] TST: uniformize names input_data - tuples, labels - y --- test/test_sklearn_compat.py | 48 ++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/test/test_sklearn_compat.py b/test/test_sklearn_compat.py index 7319935f..d9dce685 100644 --- a/test/test_sklearn_compat.py +++ b/test/test_sklearn_compat.py @@ -95,14 +95,15 @@ def test_cross_validation_is_finite(estimator, build_dataset, """Tests that validation on metric-learn estimators returns something finite """ if 
any(hasattr(estimator, method) for method in ["predict", "score"]): - tuples, y, preprocessor, _ = build_dataset(with_preprocessor) + input_data, labels, preprocessor, _ = build_dataset(with_preprocessor) estimator = clone(estimator) estimator.set_params(preprocessor=preprocessor) set_random_state(estimator) if hasattr(estimator, "score"): - assert np.isfinite(cross_val_score(estimator, tuples, y)).all() + assert np.isfinite(cross_val_score(estimator, input_data, labels)).all() if hasattr(estimator, "predict"): - assert np.isfinite(cross_val_predict(estimator, tuples, y)).all() + assert np.isfinite(cross_val_predict(estimator, + input_data, labels)).all() @pytest.mark.parametrize('with_preprocessor', [True, False]) @@ -115,33 +116,35 @@ def test_cross_validation_manual_vs_scikit(estimator, build_dataset, folds is taken from scikit-learn). """ if any(hasattr(estimator, method) for method in ["predict", "score"]): - tuples, y, preprocessor, _ = build_dataset(with_preprocessor) + input_data, labels, preprocessor, _ = build_dataset(with_preprocessor) estimator = clone(estimator) estimator.set_params(preprocessor=preprocessor) set_random_state(estimator) n_splits = 3 kfold = KFold(shuffle=False, n_splits=n_splits) - n_samples = tuples.shape[0] + n_samples = input_data.shape[0] fold_sizes = (n_samples // n_splits) * np.ones(n_splits, dtype=np.int) fold_sizes[:n_samples % n_splits] += 1 current = 0 - scores, predictions = [], np.zeros(tuples.shape[0]) + scores, predictions = [], np.zeros(input_data.shape[0]) for fold_size in fold_sizes: start, stop = current, current + fold_size current = stop test_slice = slice(start, stop) - train_mask = np.ones(tuples.shape[0], bool) + train_mask = np.ones(input_data.shape[0], bool) train_mask[test_slice] = False - y_train, y_test = y[train_mask], y[test_slice] - estimator.fit(tuples[train_mask], y_train) + y_train, y_test = labels[train_mask], labels[test_slice] + estimator.fit(input_data[train_mask], y_train) if hasattr(estimator, 
"score"): - scores.append(estimator.score(tuples[test_slice], y_test)) + scores.append(estimator.score(input_data[test_slice], y_test)) if hasattr(estimator, "predict"): - predictions[test_slice] = estimator.predict(tuples[test_slice]) + predictions[test_slice] = estimator.predict(input_data[test_slice]) if hasattr(estimator, "score"): - assert all(scores == cross_val_score(estimator, tuples, y, cv=kfold)) + assert all(scores == cross_val_score(estimator, input_data, labels, + cv=kfold)) if hasattr(estimator, "predict"): - assert all(predictions == cross_val_predict(estimator, tuples, y, + assert all(predictions == cross_val_predict(estimator, input_data, + labels, cv=kfold)) @@ -164,9 +167,9 @@ def test_simple_estimator(estimator, build_dataset, with_preprocessor): """Tests that fit, predict and scoring works. """ if any(hasattr(estimator, method) for method in ["predict", "score"]): - tuples, y, preprocessor, _ = build_dataset(with_preprocessor) + input_data, labels, preprocessor, _ = build_dataset(with_preprocessor) (tuples_train, tuples_test, y_train, - y_test) = train_test_split(tuples, y, random_state=RNG) + y_test) = train_test_split(input_data, labels, random_state=RNG) estimator = clone(estimator) estimator.set_params(preprocessor=preprocessor) set_random_state(estimator) @@ -215,10 +218,10 @@ def test_estimators_fit_returns_self(estimator, build_dataset, with_preprocessor): """Check if self is returned when calling fit""" # Adapted from scikit-learn - tuples, y, preprocessor, _ = build_dataset(with_preprocessor) + input_data, labels, preprocessor, _ = build_dataset(with_preprocessor) estimator = clone(estimator) estimator.set_params(preprocessor=preprocessor) - assert estimator.fit(tuples, y) is estimator + assert estimator.fit(input_data, labels) is estimator @pytest.mark.parametrize('with_preprocessor', [True, False]) @@ -268,12 +271,13 @@ def make_random_state(estimator, in_pipeline): ids=ids_metric_learners) def test_dict_unchanged(estimator, 
build_dataset, with_preprocessor): # Adapted from scikit-learn - tuples, y, preprocessor, to_transform = build_dataset(with_preprocessor) + (input_data, labels, preprocessor, + to_transform) = build_dataset(with_preprocessor) estimator = clone(estimator) estimator.set_params(preprocessor=preprocessor) if hasattr(estimator, "num_dims"): estimator.num_dims = 1 - estimator.fit(tuples, y) + estimator.fit(input_data, labels) def check_dict(): assert estimator.__dict__ == dict_before, ( @@ -281,7 +285,7 @@ def check_dict(): for method in ["predict", "decision_function", "predict_proba"]: if hasattr(estimator, method): dict_before = estimator.__dict__.copy() - getattr(estimator, method)(tuples) + getattr(estimator, method)(input_data) check_dict() if hasattr(estimator, "transform"): dict_before = estimator.__dict__.copy() @@ -297,14 +301,14 @@ def test_dont_overwrite_parameters(estimator, build_dataset, with_preprocessor): # Adapted from scikit-learn # check that fit method only changes or sets private attributes - tuples, y, preprocessor, _ = build_dataset(with_preprocessor) + input_data, labels, preprocessor, _ = build_dataset(with_preprocessor) estimator = clone(estimator) estimator.set_params(preprocessor=preprocessor) if hasattr(estimator, "num_dims"): estimator.num_dims = 1 dict_before_fit = estimator.__dict__.copy() - estimator.fit(tuples, y) + estimator.fit(input_data, labels) dict_after_fit = estimator.__dict__ public_keys_after_fit = [key for key in dict_after_fit.keys() From b6d7de70f08e02e7fd7698396be57eb0a84e7677 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Wed, 5 Dec 2018 15:21:47 +0100 Subject: [PATCH 118/120] FIX: fix build_pairs and build_quadruplets --- test/test_utils.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 6e28807c..149e22b7 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -73,13 +73,13 @@ def build_pairs(with_preprocessor=False): c = 
np.vstack([np.column_stack(indices[:2]), np.column_stack(indices[2:])]) target = np.concatenate([np.ones(indices[0].shape[0]), - np.ones(indices[0].shape[0])]) + c, target = shuffle(c, target, random_state=SEED) if with_preprocessor: # if preprocessor, we build a 2D array of pairs of indices - return Dataset(*(*shuffle(c, target, random_state=SEED), X, c[:, 0])) + return Dataset(c, target, X, c[:, 0]) else: # if not, we build a 3D array of pairs of samples - return Dataset(*(*shuffle(X[c], target, random_state=SEED), - None, X[c[:, 0]])) + return Dataset(X[c], target, None, X[c[:, 0]]) def build_quadruplets(with_preprocessor=False): @@ -88,13 +88,13 @@ def build_quadruplets(with_preprocessor=False): c = np.column_stack(indices) target = np.ones(c.shape[0]) # quadruplets targets are not used # anyways + c, target = shuffle(c, target, random_state=SEED) if with_preprocessor: # if preprocessor, we build a 2D array of quadruplets of indices - return Dataset(*(*shuffle(c, target, random_state=SEED), X, c[:, 0])) + return Dataset(c, target, X, c[:, 0]) else: # if not, we build a 3D array of quadruplets of samples - return Dataset(*(*shuffle(X[c], target, random_state=SEED), - None, X[c[:, 0]])) + return Dataset(X[c], target, None, X[c[:, 0]]) quadruplets_learners = [(LSML(), build_quadruplets)] From a44e29adf8f01e30f43ac2e1e7a2c2e6a5536820 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 11 Dec 2018 11:30:06 +0100 Subject: [PATCH 119/120] MAINT: remove forgotten code duplication --- metric_learn/mlkr.py | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/metric_learn/mlkr.py b/metric_learn/mlkr.py index 1839f533..8e8af9f2 100644 --- a/metric_learn/mlkr.py +++ b/metric_learn/mlkr.py @@ -61,27 +61,6 @@ def __init__(self, num_dims=None, A0=None, epsilon=0.01, alpha=0.0001, self.max_iter = max_iter super(MLKR, self).__init__(preprocessor) - def _process_inputs(self, X, y): - X, y = self._prepare_inputs(X, y, y_numeric=True, - 
ensure_min_samples=2) - n, d = self.X_.shape - if y.shape[0] != n: - raise ValueError('Data and label lengths mismatch: %d != %d' - % (n, y.shape[0])) - - A = self.A0 - m = self.num_dims - if m is None: - m = d - if A is None: - # initialize to PCA transformation matrix - # note: not the same as n_components=m ! - A = PCA().fit(X).components_.T[:m] - elif A.shape != (m, d): - raise ValueError('A0 needs shape (%d,%d) but got %s' % ( - m, d, A.shape)) - return X, y, A - def fit(self, X, y): """ Fit MLKR model From 2db6410194cc569b0e1b2fdbb919504a42a3f483 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Wed, 12 Dec 2018 16:16:05 +0100 Subject: [PATCH 120/120] MAINT: address https://github.com/metric-learn/metric-learn/pull/117#pullrequestreview-183991116 --- metric_learn/_util.py | 22 +++++++++------------- test/test_utils.py | 28 ---------------------------- 2 files changed, 9 insertions(+), 41 deletions(-) diff --git a/metric_learn/_util.py b/metric_learn/_util.py index 55203fc8..27707be9 100644 --- a/metric_learn/_util.py +++ b/metric_learn/_util.py @@ -17,7 +17,7 @@ def vector_norm(X): def check_input(input_data, y=None, preprocessor=None, type_of_inputs='classic', tuple_size=None, accept_sparse=False, - dtype="numeric", order=None, + dtype='numeric', order=None, copy=False, force_all_finite=True, multi_output=False, ensure_min_samples=1, ensure_min_features=1, y_numeric=False, @@ -53,13 +53,11 @@ def check_input(input_data, y=None, preprocessor=None, tuple_size : int The number of elements in a tuple (e.g. 2 for pairs). - dtype : string, type, list of types or None (default="auto") + dtype : string, type, list of types or None (default='numeric') Data type of result. If None, the dtype of the input is preserved. - If "numeric", dtype is preserved unless array.dtype is object. + If 'numeric', dtype is preserved unless array.dtype is object. 
If dtype is a list of types, conversion on the first type is only - performed if the dtype of the input is not in the list. If - "auto", will we be set to "numeric" if `preprocessor=True`, - else to None. + performed if the dtype of the input is not in the list. order : 'F', 'C' or None (default=`None`) Whether an array will be forced to be fortran or c-style. @@ -173,9 +171,9 @@ def check_input_tuples(input_data, context, preprocessor, args_for_sk_checks, args_for_sk_checks['ensure_min_features'], context)) # normally we don't need to check_tuple_size too because tuple_size - # should'nt be able to be modified by any preprocessor - if input_data.ndim != 3: # we have to ensure this because check_array - # above does not + # shouldn't be able to be modified by any preprocessor + if input_data.ndim != 3: + # we have to ensure this because check_array above does not if preprocessor_has_been_applied: make_error_input(211, input_data, context) else: @@ -202,8 +200,8 @@ def check_input_classic(input_data, context, preprocessor, args_for_sk_checks): input_data = check_array(input_data, allow_nd=True, ensure_2d=False, **args_for_sk_checks) - if input_data.ndim != 2: # we have to ensure this because check_array - # above does not + if input_data.ndim != 2: + # we have to ensure this because check_array above does not if preprocessor_has_been_applied: make_error_input(111, input_data, context) else: @@ -241,7 +239,6 @@ def make_error_input(code, input_data, context): def preprocess_tuples(tuples, preprocessor): - print("Preprocessing tuples...") try: tuples = np.column_stack([preprocessor(tuples[:, i])[:, np.newaxis] for i in range(tuples.shape[1])]) @@ -253,7 +250,6 @@ def preprocess_tuples(tuples, preprocessor): def preprocess_points(points, preprocessor): """form points if there is a preprocessor else keep them as such (assumes that check_points has already been called)""" - print("Preprocessing points...") try: points = preprocessor(points) except Exception as e: diff 
--git a/test/test_utils.py b/test/test_utils.py index 149e22b7..de59e9ff 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -778,34 +778,6 @@ def preprocessor(sequence): assert str(raised_error.value) == expected_msg -def test_progress_message_preprocessor_points(capsys): - """Tests that when using a preprocessor on points, a message is printed - """ - points = np.array([1, 2, 4]) - - def fun(row): - return [[1, 1], [3, 3], [4, 4]] - - preprocess_points(points, preprocessor=fun) - out, _ = capsys.readouterr() - assert out == "Preprocessing points...\n" - - -def test_progress_message_preprocessor_tuples(capsys): - """Tests that when using a preprocessor on tuples, a message is printed - """ - tuples = np.array([[1, 2], - [2, 3], - [4, 5]]) - - def fun(row): - return np.array([[1, 1], [3, 3], [4, 4]]) - - preprocess_tuples(tuples, preprocessor=fun) - out, _ = capsys.readouterr() - assert out == "Preprocessing tuples...\n" - - def test_preprocessor_error_message(): """Tests whether the preprocessor returns a preprocessor error when there is a problem using the preprocessor