diff --git a/doc/conf.py b/doc/conf.py index ed476edd..f0faa2f8 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -7,7 +7,8 @@ 'sphinx.ext.viewcode', 'sphinx.ext.mathjax', 'numpydoc', - 'sphinx_gallery.gen_gallery' + 'sphinx_gallery.gen_gallery', + 'sphinx.ext.doctest' ] templates_path = ['_templates'] @@ -35,3 +36,6 @@ # Option to only need single backticks to refer to symbols default_role = 'any' +# Option to hide doctests comments in the documentation (like # doctest: +# +NORMALIZE_WHITESPACE for instance) +trim_doctest_flags = True diff --git a/doc/introduction.rst b/doc/introduction.rst index f0195c83..dad530b3 100644 --- a/doc/introduction.rst +++ b/doc/introduction.rst @@ -38,6 +38,8 @@ generally formulated as an optimization problem where one seeks to find the parameters of a distance function that optimize some objective function measuring the agreement with the training data. +.. _mahalanobis_distances: + Mahalanobis Distances ===================== @@ -124,7 +126,9 @@ to the following resources: .. Currently, each metric learning algorithm supports the following methods: .. - ``fit(...)``, which learns the model. -.. - ``metric()``, which returns a Mahalanobis matrix +.. - ``get_mahalanobis_matrix()``, which returns a Mahalanobis matrix +.. - ``get_metric()``, which returns a function that takes as input two 1D + arrays and outputs the learned metric score on these two points .. :math:`M = L^{\top}L` such that distance between vectors ``x`` and .. ``y`` can be computed as :math:`\sqrt{\left(x-y\right)M\left(x-y\right)}`. .. 
- ``transformer_from_metric(metric)``, which returns a transformation matrix diff --git a/metric_learn/_util.py b/metric_learn/_util.py index 3bc303f9..bd57fd5f 100644 --- a/metric_learn/_util.py +++ b/metric_learn/_util.py @@ -349,3 +349,14 @@ def transformer_from_metric(metric): else: w, V = np.linalg.eigh(metric) return V.T * np.sqrt(np.maximum(0, w[:, None])) + + +def validate_vector(u, dtype=None): + # replica of scipy.spatial.distance._validate_vector, for making scipy + # compatible functions on vectors (such as distances computations) + u = np.asarray(u, dtype=dtype, order='c').squeeze() + # Ensure values such as u=1 and u=[1] still return 1-D arrays. + u = np.atleast_1d(u) + if u.ndim > 1: + raise ValueError("Input vector should be 1-D.") + return u diff --git a/metric_learn/base_metric.py b/metric_learn/base_metric.py index bfec1264..58b8cc5d 100644 --- a/metric_learn/base_metric.py +++ b/metric_learn/base_metric.py @@ -1,10 +1,13 @@ +from numpy.linalg import cholesky +from scipy.spatial.distance import euclidean from sklearn.base import BaseEstimator from sklearn.utils.validation import _is_arraylike from sklearn.metrics import roc_auc_score import numpy as np from abc import ABCMeta, abstractmethod import six -from ._util import ArrayIndexer, check_input +from ._util import ArrayIndexer, check_input, validate_vector +import warnings class BaseMetricLearner(six.with_metaclass(ABCMeta, BaseEstimator)): @@ -34,6 +37,14 @@ def score_pairs(self, pairs): ------- scores: `numpy.ndarray` of shape=(n_pairs,) The score of every pair. + + See Also + -------- + get_metric : a method that returns a function to compute the metric between + two points. The difference with `score_pairs` is that it works on two 1D + arrays and cannot use a preprocessor. Besides, the returned function is + independent of the metric learner and hence is not modified if the metric + learner is. 
""" def check_preprocessor(self): @@ -85,6 +96,47 @@ def _prepare_inputs(self, X, y=None, type_of_inputs='classic', tuple_size=getattr(self, '_tuple_size', None), **kwargs) + @abstractmethod + def get_metric(self): + """Returns a function that takes as input two 1D arrays and outputs the + learned metric score on these two points. + + This function will be independent from the metric learner that learned it + (it will not be modified if the initial metric learner is modified), + and it can be directly plugged into the `metric` argument of + scikit-learn's estimators. + + Returns + ------- + metric_fun : function + The function described above. + + + Examples + -------- + .. doctest:: + + >>> from metric_learn import NCA + >>> from sklearn.datasets import make_classification + >>> from sklearn.neighbors import KNeighborsClassifier + >>> nca = NCA() + >>> X, y = make_classification() + >>> nca.fit(X, y) + >>> knn = KNeighborsClassifier(metric=nca.get_metric()) + >>> knn.fit(X, y) # doctest: +NORMALIZE_WHITESPACE + KNeighborsClassifier(algorithm='auto', leaf_size=30, + metric=<function MahalanobisMixin.get_metric.<locals>.metric_fun + at 0x...>, + metric_params=None, n_jobs=None, n_neighbors=5, p=2, + weights='uniform') + + See Also + -------- + score_pairs : a method that returns the metric score between several pairs + of points. Unlike `get_metric`, this is a method of the metric learner + and therefore can change if the metric learner changes. Besides, it can + use the metric learner's preprocessor, and works on concatenated arrays. + """ class MetricTransformer(six.with_metaclass(ABCMeta)): @@ -146,6 +198,17 @@ def score_pairs(self, pairs): ------- scores: `numpy.ndarray` of shape=(n_pairs,) The learned Mahalanobis distance for every pair. + + See Also + -------- + get_metric : a method that returns a function to compute the metric between + two points. The difference with `score_pairs` is that it works on two 1D + arrays and cannot use a preprocessor. 
Besides, the returned function is + independent of the metric learner and hence is not modified if the metric + learner is. + + :ref:`mahalanobis_distances` : The section of the project documentation + that describes Mahalanobis Distances. """ pairs = check_input(pairs, type_of_inputs='tuples', preprocessor=self.preprocessor_, @@ -177,7 +240,57 @@ def transform(self, X): accept_sparse=True) return X_checked.dot(self.transformer_.T) + def get_metric(self): + transformer_T = self.transformer_.T.copy() + + def metric_fun(u, v, squared=False): + """This function computes the metric between u and v, according to the + previously learned metric. + + Parameters + ---------- + u : array-like, shape=(n_features,) + The first point involved in the distance computation. + + v : array-like, shape=(n_features,) + The second point involved in the distance computation. + + squared : `bool` + If True, the function will return the squared metric between u and + v, which is faster to compute. + + Returns + ------- + distance: float + The distance between u and v according to the new metric. + """ + u = validate_vector(u) + v = validate_vector(v) + transformed_diff = (u - v).dot(transformer_T) + dist = np.dot(transformed_diff, transformed_diff.T) + if not squared: + dist = np.sqrt(dist) + return dist + + return metric_fun + + get_metric.__doc__ = BaseMetricLearner.get_metric.__doc__ + def metric(self): + # TODO: remove this method in version 0.6.0 + warnings.warn(("`metric` is deprecated since version 0.5.0 and will be " + "removed in 0.6.0. Use `get_mahalanobis_matrix` instead."), + DeprecationWarning) + return self.get_mahalanobis_matrix() + + def get_mahalanobis_matrix(self): + """Returns a copy of the Mahalanobis matrix learned by the metric learner. + + Returns + ------- + M : `numpy.ndarray`, shape=(n_features, n_features) + The copy of the learned Mahalanobis matrix. 
+ """ return self.transformer_.T.dot(self.transformer_) diff --git a/metric_learn/rca.py b/metric_learn/rca.py index 3380f4c9..c9fedd59 100644 --- a/metric_learn/rca.py +++ b/metric_learn/rca.py @@ -112,7 +112,7 @@ def fit(self, X, chunks): chunks = np.asanyarray(chunks, dtype=int) chunk_mask, chunked_data = _chunk_mean_centering(X_t, chunks) - inner_cov = np.cov(chunked_data, rowvar=0, bias=1) + inner_cov = np.atleast_2d(np.cov(chunked_data, rowvar=0, bias=1)) dim = self._check_dimension(np.linalg.matrix_rank(inner_cov), X_t) # Fisher Linear Discriminant projection @@ -122,13 +122,13 @@ def fit(self, X, chunks): vals, vecs = np.linalg.eig(tmp) inds = np.argsort(vals)[:dim] A = vecs[:, inds] - inner_cov = A.T.dot(inner_cov).dot(A) + inner_cov = np.atleast_2d(A.T.dot(inner_cov).dot(A)) self.transformer_ = _inv_sqrtm(inner_cov).dot(A.T) else: self.transformer_ = _inv_sqrtm(inner_cov).T if M_pca is not None: - self.transformer_ = self.transformer_.dot(M_pca) + self.transformer_ = np.atleast_2d(self.transformer_.dot(M_pca)) return self diff --git a/metric_learn/sdml.py b/metric_learn/sdml.py index 40fd5727..be45d3a3 100644 --- a/metric_learn/sdml.py +++ b/metric_learn/sdml.py @@ -58,7 +58,7 @@ def _fit(self, pairs, y): # set up prior M if self.use_cov: X = np.vstack({tuple(row) for row in pairs.reshape(-1, pairs.shape[2])}) - self.M_ = pinvh(np.cov(X, rowvar = False)) + self.M_ = pinvh(np.atleast_2d(np.cov(X, rowvar = False))) else: self.M_ = np.identity(pairs.shape[2]) diff = pairs[:, 0] - pairs[:, 1] diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py index eebce1f9..e4ce8cef 100644 --- a/test/metric_learn_test.py +++ b/test/metric_learn_test.py @@ -273,7 +273,7 @@ def test_iris(self): self.assertLess(csep, 0.15) # Sanity checks for learned matrices. 
- self.assertEqual(lfda.metric().shape, (4, 4)) + self.assertEqual(lfda.get_mahalanobis_matrix().shape, (4, 4)) self.assertEqual(lfda.transformer_.shape, (2, 4)) @@ -348,14 +348,16 @@ def test_iris(self): [+0.000868, +0.001468, -0.002021, -0.002879], [-0.001195, -0.002021, +0.002782, +0.003964], [-0.001703, -0.002879, +0.003964, +0.005648]] - assert_array_almost_equal(expected, mmc.metric(), decimal=6) + assert_array_almost_equal(expected, mmc.get_mahalanobis_matrix(), + decimal=6) # Diagonal metric mmc = MMC(diagonal=True) mmc.fit(*wrap_pairs(self.iris_points, [a,b,c,d])) expected = [0, 0, 1.210220, 1.228596] - assert_array_almost_equal(np.diag(expected), mmc.metric(), decimal=6) - + assert_array_almost_equal(np.diag(expected), mmc.get_mahalanobis_matrix(), + decimal=6) + # Supervised Full mmc = MMC_Supervised() mmc.fit(self.iris_points, self.iris_labels) diff --git a/test/test_base_metric.py b/test/test_base_metric.py index fdea2949..09718c29 100644 --- a/test/test_base_metric.py +++ b/test/test_base_metric.py @@ -1,5 +1,10 @@ +import pytest import unittest import metric_learn +import numpy as np +from sklearn import clone +from sklearn.utils.testing import set_random_state +from test.test_utils import ids_metric_learners, metric_learners class TestStringRepr(unittest.TestCase): @@ -81,5 +86,82 @@ def test_mmc(self): num_labeled='deprecated', preprocessor=None, verbose=False) """.strip('\n')) + +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) +def test_get_metric_is_independent_from_metric_learner(estimator, + build_dataset): + """Tests that the get_metric method returns a function that is independent + from the original metric learner""" + input_data, labels, _, X = build_dataset() + model = clone(estimator) + set_random_state(model) + + # we fit the metric learner on it and then we compute the metric on some + # points + model.fit(input_data, labels) + metric = model.get_metric() + score = metric(X[0], X[1]) + + 
# then we refit the estimator on another dataset + model.fit(np.sin(input_data), labels) + + # we recompute the distance between the two points: it should be the same + score_bis = metric(X[0], X[1]) + assert score_bis == score + + +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) +def test_get_metric_raises_error(estimator, build_dataset): + """Tests that the metric returned by get_metric raises errors similar to + the distance functions in scipy.spatial.distance""" + input_data, labels, _, X = build_dataset() + model = clone(estimator) + set_random_state(model) + model.fit(input_data, labels) + metric = model.get_metric() + + list_test_get_metric_raises = [(X[0].tolist() + [5.2], X[1]), # vectors with + # different dimensions + (X[0:4], X[1:5]), # 2D vectors + (X[0].tolist() + [5.2], X[1] + [7.2])] + # vectors of same dimension but incompatible with what the metric learner + # was trained on + + for u, v in list_test_get_metric_raises: + with pytest.raises(ValueError): + metric(u, v) + + +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) +def test_get_metric_works_does_not_raise(estimator, build_dataset): + """Tests that the metric returned by get_metric does not raise errors (or + warnings) similarly to the distance functions in scipy.spatial.distance""" + input_data, labels, _, X = build_dataset() + model = clone(estimator) + set_random_state(model) + model.fit(input_data, labels) + metric = model.get_metric() + + list_test_get_metric_doesnt_raise = [(X[0], X[1]), + (X[0].tolist(), X[1].tolist()), + (X[0][None], X[1][None])] + + for u, v in list_test_get_metric_doesnt_raise: + with pytest.warns(None) as record: + metric(u, v) + assert len(record) == 0 + + # Test that the scalar case works + model.transformer_ = np.array([3.1]) + metric = model.get_metric() + for u, v in [(5, 6.7), ([5], [6.7]), ([[5]], [[6.7]])]: + with pytest.warns(None) as record: + metric(u, v) + 
assert len(record) == 0 + + + if __name__ == '__main__': unittest.main() diff --git a/test/test_mahalanobis_mixin.py b/test/test_mahalanobis_mixin.py index 0d834f10..1e555e73 100644 --- a/test/test_mahalanobis_mixin.py +++ b/test/test_mahalanobis_mixin.py @@ -2,9 +2,10 @@ import pytest import numpy as np -from numpy.testing import assert_array_almost_equal -from scipy.spatial.distance import pdist, squareform +from numpy.testing import assert_array_almost_equal, assert_allclose +from scipy.spatial.distance import pdist, squareform, mahalanobis from sklearn import clone +from sklearn.cluster import DBSCAN from sklearn.utils import check_random_state from sklearn.utils.testing import set_random_state @@ -167,3 +168,118 @@ def test_embed_is_linear(estimator, build_dataset): model.transform(X[10:20])) assert_array_almost_equal(model.transform(5 * X[:10]), 5 * model.transform(X[:10])) + + +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) +def test_get_metric_equivalent_to_explicit_mahalanobis(estimator, + build_dataset): + """Tests that using the get_metric method of mahalanobis metric learners is + equivalent to explicitly calling scipy's mahalanobis metric + """ + rng = np.random.RandomState(42) + input_data, labels, _, X = build_dataset() + model = clone(estimator) + set_random_state(model) + model.fit(input_data, labels) + metric = model.get_metric() + n_features = X.shape[1] + a, b = (rng.randn(n_features), rng.randn(n_features)) + expected_dist = mahalanobis(a[None], b[None], + VI=model.get_mahalanobis_matrix()) + assert_allclose(metric(a, b), expected_dist, rtol=1e-15) + + +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) +def test_get_metric_is_pseudo_metric(estimator, build_dataset): + """Tests that the get_metric method of mahalanobis metric learners returns a + pseudo-metric (metric but without one side of the equivalence of + the identity of indiscernibles 
property) + """ + input_data, labels, _, X = build_dataset() + model = clone(estimator) + set_random_state(model) + model.fit(input_data, labels) + metric = model.get_metric() + + n_features = X.shape[1] + for seed in range(10): + rng = np.random.RandomState(seed) + a, b, c = (rng.randn(n_features) for _ in range(3)) + assert metric(a, b) >= 0 # positivity + assert metric(a, b) == metric(b, a) # symmetry + # one side of identity of indiscernibles: x == y => d(x, y) == 0. The other + # side of the equivalence is not always true for Mahalanobis distances. + assert metric(a, a) == 0 + # triangle inequality + assert (metric(a, c) < metric(a, b) + metric(b, c) or + np.isclose(metric(a, c), metric(a, b) + metric(b, c), rtol=1e-20)) + + +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) +def test_metric_raises_deprecation_warning(estimator, build_dataset): + """assert that a deprecation warning is raised if someone wants to call + the `metric` function""" + # TODO: remove this method in version 0.6.0 + input_data, labels, _, X = build_dataset() + model = clone(estimator) + set_random_state(model) + model.fit(input_data, labels) + + with pytest.warns(DeprecationWarning) as raised_warning: + model.metric() + assert (str(raised_warning[0].message) == + ("`metric` is deprecated since version 0.5.0 and will be removed " + "in 0.6.0. 
Use `get_mahalanobis_matrix` instead.")) + + +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) +def test_get_metric_compatible_with_scikit_learn(estimator, build_dataset): + """Check that the metric returned by get_metric is compatible with + scikit-learn's algorithms using a custom metric, DBSCAN for instance""" + input_data, labels, _, X = build_dataset() + model = clone(estimator) + set_random_state(model) + model.fit(input_data, labels) + clustering = DBSCAN(metric=model.get_metric()) + clustering.fit(X) + + +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) +def test_get_squared_metric(estimator, build_dataset): + """Test that the squared metric returned is indeed the square of the + metric""" + input_data, labels, _, X = build_dataset() + model = clone(estimator) + set_random_state(model) + model.fit(input_data, labels) + metric = model.get_metric() + + n_features = X.shape[1] + for seed in range(10): + rng = np.random.RandomState(seed) + a, b = (rng.randn(n_features) for _ in range(2)) + assert_allclose(metric(a, b, squared=True), + metric(a, b, squared=False)**2, + rtol=1e-15) + + +@pytest.mark.parametrize('estimator, build_dataset', metric_learners, + ids=ids_metric_learners) +def test_transformer_is_2D(estimator, build_dataset): + """Tests that the transformer of metric learners is 2D""" + input_data, labels, _, X = build_dataset() + model = clone(estimator) + set_random_state(model) + # test that it works for X.shape[1] features + model.fit(input_data, labels) + assert model.transformer_.shape == (X.shape[1], X.shape[1]) + + # test that it works for 1 feature + trunc_data = input_data[..., :1] + model.fit(trunc_data, labels) + assert model.transformer_.shape == (1, 1) # the transformer must be 2D diff --git a/test/test_transformer_metric_conversion.py b/test/test_transformer_metric_conversion.py index ab38d65e..59986011 100644 --- 
a/test/test_transformer_metric_conversion.py +++ b/test/test_transformer_metric_conversion.py @@ -20,60 +20,60 @@ def test_cov(self): cov = Covariance() cov.fit(self.X) L = cov.transformer_ - assert_array_almost_equal(L.T.dot(L), cov.metric()) + assert_array_almost_equal(L.T.dot(L), cov.get_mahalanobis_matrix()) def test_lsml_supervised(self): seed = np.random.RandomState(1234) lsml = LSML_Supervised(num_constraints=200) lsml.fit(self.X, self.y, random_state=seed) L = lsml.transformer_ - assert_array_almost_equal(L.T.dot(L), lsml.metric()) + assert_array_almost_equal(L.T.dot(L), lsml.get_mahalanobis_matrix()) def test_itml_supervised(self): seed = np.random.RandomState(1234) itml = ITML_Supervised(num_constraints=200) itml.fit(self.X, self.y, random_state=seed) L = itml.transformer_ - assert_array_almost_equal(L.T.dot(L), itml.metric()) + assert_array_almost_equal(L.T.dot(L), itml.get_mahalanobis_matrix()) def test_lmnn(self): lmnn = LMNN(k=5, learn_rate=1e-6, verbose=False) lmnn.fit(self.X, self.y) L = lmnn.transformer_ - assert_array_almost_equal(L.T.dot(L), lmnn.metric()) + assert_array_almost_equal(L.T.dot(L), lmnn.get_mahalanobis_matrix()) def test_sdml_supervised(self): seed = np.random.RandomState(1234) sdml = SDML_Supervised(num_constraints=1500) sdml.fit(self.X, self.y, random_state=seed) L = sdml.transformer_ - assert_array_almost_equal(L.T.dot(L), sdml.metric()) + assert_array_almost_equal(L.T.dot(L), sdml.get_mahalanobis_matrix()) def test_nca(self): n = self.X.shape[0] nca = NCA(max_iter=(100000//n)) nca.fit(self.X, self.y) L = nca.transformer_ - assert_array_almost_equal(L.T.dot(L), nca.metric()) + assert_array_almost_equal(L.T.dot(L), nca.get_mahalanobis_matrix()) def test_lfda(self): lfda = LFDA(k=2, num_dims=2) lfda.fit(self.X, self.y) L = lfda.transformer_ - assert_array_almost_equal(L.T.dot(L), lfda.metric()) + assert_array_almost_equal(L.T.dot(L), lfda.get_mahalanobis_matrix()) def test_rca_supervised(self): seed = np.random.RandomState(1234) 
rca = RCA_Supervised(num_dims=2, num_chunks=30, chunk_size=2) rca.fit(self.X, self.y, random_state=seed) L = rca.transformer_ - assert_array_almost_equal(L.T.dot(L), rca.metric()) + assert_array_almost_equal(L.T.dot(L), rca.get_mahalanobis_matrix()) def test_mlkr(self): mlkr = MLKR(num_dims=2) mlkr.fit(self.X, self.y) L = mlkr.transformer_ - assert_array_almost_equal(L.T.dot(L), mlkr.metric()) + assert_array_almost_equal(L.T.dot(L), mlkr.get_mahalanobis_matrix()) if __name__ == '__main__': diff --git a/test/test_utils.py b/test/test_utils.py index 39c718ac..5e640dbc 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1,6 +1,7 @@ import pytest from collections import namedtuple import numpy as np +from numpy.testing import assert_array_equal, assert_equal from sklearn.model_selection import train_test_split from sklearn.exceptions import DataConversionWarning from sklearn.utils import check_random_state, shuffle @@ -8,7 +9,7 @@ from sklearn.base import clone from metric_learn._util import (check_input, make_context, preprocess_tuples, make_name, preprocess_points, - check_collapsed_pairs) + check_collapsed_pairs, validate_vector) from metric_learn import (ITML, LSML, MMC, RCA, SDML, Covariance, LFDA, LMNN, MLKR, NCA, ITML_Supervised, LSML_Supervised, MMC_Supervised, RCA_Supervised, SDML_Supervised, @@ -1010,3 +1011,32 @@ def test_check_collapsed_pairs_raises_error(): assert str(e.value) == ("2 collapsed pairs found (where the left element is " "the same as the right element), out of 3 pairs in" " total.") + +def test__validate_vector(): + """Replica of scipy.spatial.tests.test_distance.test__validate_vector""" + x = [1, 2, 3] + y = validate_vector(x) + assert_array_equal(y, x) + + y = validate_vector(x, dtype=np.float64) + assert_array_equal(y, x) + assert_equal(y.dtype, np.float64) + + x = [1] + y = validate_vector(x) + assert_equal(y.ndim, 1) + assert_equal(y, x) + + x = 1 + y = validate_vector(x) + assert_equal(y.ndim, 1) + assert_equal(y, [x]) + + x = 
np.arange(5).reshape(1, -1, 1) + y = validate_vector(x) + assert_equal(y.ndim, 1) + assert_array_equal(y, x[0, :, 0]) + + x = [[1, 2], [3, 4]] + with pytest.raises(ValueError): + validate_vector(x)