From e21e856b0045d2d751d738e1de4907935cb87a3d Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 23 Apr 2019 08:43:08 +0200 Subject: [PATCH 01/11] Remove initialization of the data for RCA --- metric_learn/rca.py | 35 ++++++++++++++--------------------- test/metric_learn_test.py | 19 ++++++++++++++++++- test/test_base_metric.py | 3 ++- test/test_sklearn_compat.py | 5 ++--- 4 files changed, 36 insertions(+), 26 deletions(-) diff --git a/metric_learn/rca.py b/metric_learn/rca.py index c9fedd59..4587293b 100644 --- a/metric_learn/rca.py +++ b/metric_learn/rca.py @@ -44,7 +44,7 @@ class RCA(MahalanobisMixin, TransformerMixin): The learned linear transformation ``L``. """ - def __init__(self, num_dims=None, pca_comps=None, preprocessor=None): + def __init__(self, num_dims=None, pca_comps='deprecated', preprocessor=None): """Initialize the learner. Parameters @@ -52,12 +52,10 @@ def __init__(self, num_dims=None, pca_comps=None, preprocessor=None): num_dims : int, optional embedding dimension (default: original dimension of data) - pca_comps : int, float, None or string - Number of components to keep during PCA preprocessing. - If None (default), does not perform PCA. - If ``0 < pca_comps < 1``, it is used as - the minimum explained variance ratio. - See sklearn.decomposition.PCA for more details. + pca_comps : Not used + .. deprecated:: 0.5.0 + `pca_comps` was deprecated in version 0.5.0 and will + be removed in 0.6.0. preprocessor : array-like, shape=(n_samples, n_features) or callable The preprocessor to call to get tuples from indices. If array-like, @@ -98,26 +96,24 @@ def fit(self, X, chunks): When ``chunks[i] == -1``, point i doesn't belong to any chunklet. When ``chunks[i] == j``, point i belongs to chunklet j. """ + if self.pca_comps != 'deprecated': + warnings.warn('"pca_comps" parameter is not used.' + ' It has been deprecated in version 0.5.0 and will be' + 'removed in 0.6.0', DeprecationWarning) + X = self._prepare_inputs(X, ensure_min_samples=2) # PCA projection to remove noise and redundant information. - if self.pca_comps is not None: - pca = decomposition.PCA(n_components=self.pca_comps) - X_t = pca.fit_transform(X) - M_pca = pca.components_ - else: - X_t = X - X.mean(axis=0) - M_pca = None chunks = np.asanyarray(chunks, dtype=int) - chunk_mask, chunked_data = _chunk_mean_centering(X_t, chunks) + chunk_mask, chunked_data = _chunk_mean_centering(X, chunks) inner_cov = np.atleast_2d(np.cov(chunked_data, rowvar=0, bias=1)) - dim = self._check_dimension(np.linalg.matrix_rank(inner_cov), X_t) + dim = self._check_dimension(np.linalg.matrix_rank(inner_cov), X) # Fisher Linear Discriminant projection - if dim < X_t.shape[1]: - total_cov = np.cov(X_t[chunk_mask], rowvar=0) + if dim < X.shape[1]: + total_cov = np.cov(X[chunk_mask], rowvar=0) tmp = np.linalg.lstsq(total_cov, inner_cov)[0] vals, vecs = np.linalg.eig(tmp) inds = np.argsort(vals)[:dim] @@ -127,9 +123,6 @@ def fit(self, X, chunks): else: self.transformer_ = _inv_sqrtm(inner_cov).T - if M_pca is not None: - self.transformer_ = np.atleast_2d(self.transformer_.dot(M_pca)) - return self diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py index a785d60d..29718d1b 100644 --- a/test/metric_learn_test.py +++ b/test/metric_learn_test.py @@ -18,7 +18,7 @@ HAS_SKGGM = True from metric_learn import (LMNN, NCA, LFDA, Covariance, MLKR, MMC, LSML_Supervised, ITML_Supervised, SDML_Supervised, - RCA_Supervised, MMC_Supervised, SDML) + RCA_Supervised, MMC_Supervised, SDML, RCA) # Import this specially for testing. 
from metric_learn.constraints import wrap_pairs from metric_learn.lmnn import python_LMNN @@ -530,6 +530,23 @@ def test_feature_null_variance(self): csep = class_separation(rca.transform(X), self.iris_labels) self.assertLess(csep, 0.30) + def test_deprecation_pca_comps(self): + # test that a deprecation message is thrown if pca_comps is set at + # initialization + # TODO: remove in v.0.6 + X, y = make_classification(random_state=42, n_samples=100) + rca_supervised = RCA_Supervised(pca_comps=X.shape[1], num_chunks=20) + msg = ('"pca_comps" parameter is not used.' + ' It has been deprecated in version 0.5.0 and will be' + 'removed in 0.6.0') + assert_warns_message(DeprecationWarning, msg, rca_supervised.fit, X, y) + + rca = RCA(pca_comps=X.shape[1]) + msg = ('"pca_comps" parameter is not used.' + ' It has been deprecated in version 0.5.0 and will be' + 'removed in 0.6.0') + assert_warns_message(DeprecationWarning, msg, rca.fit, X, y) + class TestMLKR(MetricTestCase): def test_iris(self): diff --git a/test/test_base_metric.py b/test/test_base_metric.py index 6c9a6dc5..f7c1dbe5 100644 --- a/test/test_base_metric.py +++ b/test/test_base_metric.py @@ -64,7 +64,8 @@ def test_sdml(self): def test_rca(self): self.assertEqual(str(metric_learn.RCA()), - "RCA(num_dims=None, pca_comps=None, preprocessor=None)") + "RCA(num_dims=None, pca_comps='deprecated', " + "preprocessor=None)") self.assertEqual(str(metric_learn.RCA_Supervised()), "RCA_Supervised(chunk_size=2, num_chunks=100, " "num_dims=None, pca_comps=None,\n " diff --git a/test/test_sklearn_compat.py b/test/test_sklearn_compat.py index 5d6c5d77..f9e564a6 100644 --- a/test/test_sklearn_compat.py +++ b/test/test_sklearn_compat.py @@ -89,9 +89,8 @@ def stable_init(self, sparsity_param=0.01, num_labeled='deprecated', dSDML.__init__ = stable_init check_estimator(dSDML) - # This fails because the default num_chunks isn't data-dependent. - # def test_rca(self): - # check_estimator(RCA_Supervised) + def test_rca(self): + check_estimator(RCA_Supervised) RNG = check_random_state(0) From 3882b512b2baebe82804b959ddc05c12d09bdb41 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 23 Apr 2019 09:03:08 +0200 Subject: [PATCH 02/11] Add deprecated flag for supervised version too --- metric_learn/rca.py | 2 +- test/test_base_metric.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/metric_learn/rca.py b/metric_learn/rca.py index 4587293b..2c091e93 100644 --- a/metric_learn/rca.py +++ b/metric_learn/rca.py @@ -141,7 +141,7 @@ class RCA_Supervised(RCA): The learned linear transformation ``L``. """ - def __init__(self, num_dims=None, pca_comps=None, num_chunks=100, + def __init__(self, num_dims=None, pca_comps='deprecated', num_chunks=100, chunk_size=2, preprocessor=None): """Initialize the supervised version of `RCA`. 
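Note for reviewers: with `pca_comps` deprecated in both RCA and RCA_Supervised, the PCA step moves out of the estimator and into user code, as the new warning text suggests. Below is a minimal sketch of the replacement workflow; the dataset and the parameter values (iris, 2 components, 30 chunks) are illustrative assumptions, not taken from this patch.

    # Sketch only: performs the PCA preprocessing that ``pca_comps`` used to
    # trigger inside ``fit`` as an explicit pipeline step instead.
    from sklearn.datasets import load_iris
    from sklearn.decomposition import PCA
    from sklearn.pipeline import make_pipeline
    from metric_learn import RCA_Supervised

    X, y = load_iris(return_X_y=True)

    # Chaining the steps keeps fit and transform coherent, which is the point
    # of removing the internal preprocessing.
    model = make_pipeline(PCA(n_components=2),
                          RCA_Supervised(num_chunks=30, chunk_size=2))
    model.fit(X, y)
    X_embedded = model.transform(X)
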
diff --git a/test/test_base_metric.py b/test/test_base_metric.py index f7c1dbe5..ffd531ff 100644 --- a/test/test_base_metric.py +++ b/test/test_base_metric.py @@ -68,7 +68,7 @@ def test_rca(self): "preprocessor=None)") self.assertEqual(str(metric_learn.RCA_Supervised()), "RCA_Supervised(chunk_size=2, num_chunks=100, " - "num_dims=None, pca_comps=None,\n " + "num_dims=None,\n pca_comps='deprecated', " "preprocessor=None)") def test_mlkr(self): From b6c216e90e1b3e3eace2d3208dfb3a8115908d1d Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 23 Apr 2019 09:04:10 +0200 Subject: [PATCH 03/11] Remove comment saying we'll do PCA --- metric_learn/rca.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/metric_learn/rca.py b/metric_learn/rca.py index 2c091e93..ab7273e9 100644 --- a/metric_learn/rca.py +++ b/metric_learn/rca.py @@ -103,8 +103,6 @@ def fit(self, X, chunks): X = self._prepare_inputs(X, ensure_min_samples=2) - # PCA projection to remove noise and redundant information. - chunks = np.asanyarray(chunks, dtype=int) chunk_mask, chunked_data = _chunk_mean_centering(X, chunks) From 32b86c0a6e7b0183f04397eca41af4529c033e3d Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 23 Apr 2019 09:22:58 +0200 Subject: [PATCH 04/11] Add ChangedBehaviorWarning and do tests --- metric_learn/rca.py | 9 +++++++++ test/metric_learn_test.py | 35 +++++++++++++++++++++++++++++------ 2 files changed, 38 insertions(+), 6 deletions(-) diff --git a/metric_learn/rca.py b/metric_learn/rca.py index ab7273e9..dd95210e 100644 --- a/metric_learn/rca.py +++ b/metric_learn/rca.py @@ -17,6 +17,7 @@ from six.moves import xrange from sklearn import decomposition from sklearn.base import TransformerMixin +from sklearn.exceptions import ChangedBehaviorWarning from .base_metric import MahalanobisMixin from .constraints import Constraints @@ -103,6 +104,14 @@ def fit(self, X, chunks): X = self._prepare_inputs(X, ensure_min_samples=2) + warnings.warn("RCA will no longer be trained on a preprocessed version " + "of the input as before (which was a bug since it was not " + "coherent with transform time). If you want to do some " + "preprocessing, you should do it manually (you can also use " + "an sklearn.pipeline.Pipeline for instance). This warning " + "will disappear in version 0.6.0.", + ChangedBehaviorWarning) + chunks = np.asanyarray(chunks, dtype=int) chunk_mask, chunked_data = _chunk_mean_centering(X, chunks) diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py index 29718d1b..89075371 100644 --- a/test/metric_learn_test.py +++ b/test/metric_learn_test.py @@ -8,7 +8,7 @@ from sklearn.datasets import load_iris, make_classification, make_regression from numpy.testing import assert_array_almost_equal, assert_array_equal from sklearn.utils.testing import assert_warns_message -from sklearn.exceptions import ConvergenceWarning +from sklearn.exceptions import ConvergenceWarning, ChangedBehaviorWarning from sklearn.utils.validation import check_X_y try: from inverse_covariance import quic @@ -539,13 +539,36 @@ def test_deprecation_pca_comps(self): msg = ('"pca_comps" parameter is not used.' ' It has been deprecated in version 0.5.0 and will be' 'removed in 0.6.0') - assert_warns_message(DeprecationWarning, msg, rca_supervised.fit, X, y) + with pytest.warns(ChangedBehaviorWarning) as expected_msg: + rca_supervised.fit(X, y) + assert str(expected_msg[0].message) == msg rca = RCA(pca_comps=X.shape[1]) - msg = ('"pca_comps" parameter is not used.' 
- ' It has been deprecated in version 0.5.0 and will be' - 'removed in 0.6.0') - assert_warns_message(DeprecationWarning, msg, rca.fit, X, y) + with pytest.warns(ChangedBehaviorWarning) as expected_msg: + rca.fit(X, y) + assert str(expected_msg[0].message) == msg + + def test_changedbehaviorwarning_preprocessing(self): + # test that a ChangedBehaviorWarning is thrown when using RCA + # TODO: remove in v.0.6 + + msg = ("RCA will no longer be trained on a preprocessed version " + "of the input as before (which was a bug since it was not " + "coherent with transform time). If you want to do some " + "preprocessing, you should do it manually (you can also use " + "an sklearn.pipeline.Pipeline for instance). This warning " + "will disappear in version 0.6.0.") + + X, y = make_classification(random_state=42, n_samples=100) + rca_supervised = RCA_Supervised(num_chunks=20) + with pytest.warns(ChangedBehaviorWarning) as expected_msg: + rca_supervised.fit(X, y) + assert str(expected_msg[0].message) == msg + + rca = RCA() + with pytest.warns(ChangedBehaviorWarning) as expected_msg: + rca.fit(X, y) + assert str(expected_msg[0].message) == msg class TestMLKR(MetricTestCase): From d6cdb8f7d6f7f92aec998800148f703455968fbe Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 23 Apr 2019 11:05:41 +0200 Subject: [PATCH 05/11] improve change behavior warning --- metric_learn/rca.py | 3 +-- test/metric_learn_test.py | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/metric_learn/rca.py b/metric_learn/rca.py index dd95210e..30b4a8bc 100644 --- a/metric_learn/rca.py +++ b/metric_learn/rca.py @@ -105,8 +105,7 @@ def fit(self, X, chunks): X = self._prepare_inputs(X, ensure_min_samples=2) warnings.warn("RCA will no longer be trained on a preprocessed version " - "of the input as before (which was a bug since it was not " - "coherent with transform time). If you want to do some " + "of the input as before. If you want to do some " "preprocessing, you should do it manually (you can also use " "an sklearn.pipeline.Pipeline for instance). This warning " "will disappear in version 0.6.0.", diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py index 89075371..7e6113b8 100644 --- a/test/metric_learn_test.py +++ b/test/metric_learn_test.py @@ -553,8 +553,7 @@ def test_changedbehaviorwarning_preprocessing(self): # TODO: remove in v.0.6 msg = ("RCA will no longer be trained on a preprocessed version " - "of the input as before (which was a bug since it was not " - "coherent with transform time). If you want to do some " + "of the input as before. If you want to do some " "preprocessing, you should do it manually (you can also use " "an sklearn.pipeline.Pipeline for instance). This warning " "will disappear in version 0.6.0.") From 24973669856da94ee83763753978c3b694aea99a Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Tue, 23 Apr 2019 17:07:38 +0200 Subject: [PATCH 06/11] Update message in case covariance matrix is not invertible --- metric_learn/rca.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/metric_learn/rca.py b/metric_learn/rca.py index 30b4a8bc..5bcd4e3c 100644 --- a/metric_learn/rca.py +++ b/metric_learn/rca.py @@ -71,8 +71,9 @@ def _check_dimension(self, rank, X): if rank < d: warnings.warn('The inner covariance matrix is not invertible, ' 'so the transformation matrix may contain Nan values. 
' - 'You should adjust pca_comps to remove noise and ' - 'redundant information.') + 'You should reduce the dimensionality of your input,' + 'for instance using sklearn.decomposition.PCA as a ' + 'preprocessing step.') if self.num_dims is None: dim = d From ea68d0da4b931d674af066bdc3b3ac728dfe2637 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Thu, 2 May 2019 16:39:04 +0200 Subject: [PATCH 07/11] FIX: still ignore testing RCA while fixed in #198 --- test/test_sklearn_compat.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/test/test_sklearn_compat.py b/test/test_sklearn_compat.py index f9e564a6..5d6c5d77 100644 --- a/test/test_sklearn_compat.py +++ b/test/test_sklearn_compat.py @@ -89,8 +89,9 @@ def stable_init(self, sparsity_param=0.01, num_labeled='deprecated', dSDML.__init__ = stable_init check_estimator(dSDML) - def test_rca(self): - check_estimator(RCA_Supervised) + # This fails because the default num_chunks isn't data-dependent. + # def test_rca(self): + # check_estimator(RCA_Supervised) RNG = check_random_state(0) From 27985ecf50a716e9c715fcb08befc944365268f4 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Thu, 6 Jun 2019 15:10:33 +0200 Subject: [PATCH 08/11] Some reformatting --- metric_learn/rca.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/metric_learn/rca.py b/metric_learn/rca.py index d9e96f2b..f551ec97 100644 --- a/metric_learn/rca.py +++ b/metric_learn/rca.py @@ -74,7 +74,7 @@ def _check_dimension(self, rank, X): warnings.warn('The inner covariance matrix is not invertible, ' 'so the transformation matrix may contain Nan values. ' 'You should reduce the dimensionality of your input,' - 'for instance using sklearn.decomposition.PCA as a ' + 'for instance using `sklearn.decomposition.PCA` as a ' 'preprocessing step.') if self.num_dims is None: @@ -110,7 +110,7 @@ def fit(self, X, chunks): warnings.warn("RCA will no longer be trained on a preprocessed version " "of the input as before. If you want to do some " "preprocessing, you should do it manually (you can also use " - "an sklearn.pipeline.Pipeline for instance). This warning " + "an `sklearn.pipeline.Pipeline` for instance). This warning " "will disappear in version 0.6.0.", ChangedBehaviorWarning) From ddee294a6e481b853081206bf0b384efb28e953d Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Thu, 6 Jun 2019 15:28:13 +0200 Subject: [PATCH 09/11] Fix test string --- test/metric_learn_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py index 98de3fc2..9d348652 100644 --- a/test/metric_learn_test.py +++ b/test/metric_learn_test.py @@ -686,7 +686,7 @@ def test_changedbehaviorwarning_preprocessing(self): msg = ("RCA will no longer be trained on a preprocessed version " "of the input as before. If you want to do some " "preprocessing, you should do it manually (you can also use " - "an sklearn.pipeline.Pipeline for instance). This warning " + "an `sklearn.pipeline.Pipeline` for instance). 
This warning " "will disappear in version 0.6.0.") X, y = make_classification(random_state=42, n_samples=100) From 80210895734cb5fd61de4d52ffc825e6993ca1b4 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Thu, 6 Jun 2019 17:56:15 +0200 Subject: [PATCH 10/11] TST: add test for warning message when covariance is not definite --- test/metric_learn_test.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py index 9d348652..6154674c 100644 --- a/test/metric_learn_test.py +++ b/test/metric_learn_test.py @@ -700,6 +700,23 @@ def test_changedbehaviorwarning_preprocessing(self): rca.fit(X, y) assert str(expected_msg[0].message) == msg + def test_rank_deficient_returns_warning(self): + """Checks that if the covariance matrix is not invertible, we raise a + warning message advising to use PCA""" + X, y = load_iris(return_X_y=True) + # we make the fourth column a linear combination of the two first, + # so that the covariance matrix will not be invertible: + X[:, 3] = X[:, 0] + 3 * X[:, 1] + rca = RCA() + msg = ('The inner covariance matrix is not invertible, ' + 'so the transformation matrix may contain Nan values. ' + 'You should reduce the dimensionality of your input,' + 'for instance using `sklearn.decomposition.PCA` as a ' + 'preprocessing step.') + with pytest.warns(None) as raised_warnings: + rca.fit(X, y) + assert any(str(w.message) == msg for w in raised_warnings) + class TestMLKR(MetricTestCase): def test_iris(self): From 6d6a38aaadf9ea685ed4a674d718604ebef7bed4 Mon Sep 17 00:00:00 2001 From: William de Vazelhes Date: Wed, 12 Jun 2019 16:37:31 +0200 Subject: [PATCH 11/11] Address https://github.com/metric-learn/metric-learn/pull/194#discussion_r292387277 --- metric_learn/rca.py | 21 ++++++++++++--------- test/metric_learn_test.py | 15 ++++++++------- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/metric_learn/rca.py b/metric_learn/rca.py index 7c4f7800..1dbffdd6 100644 --- a/metric_learn/rca.py +++ b/metric_learn/rca.py @@ -107,18 +107,21 @@ def fit(self, X, chunks): DeprecationWarning) if self.pca_comps != 'deprecated': - warnings.warn('"pca_comps" parameter is not used.' - ' It has been deprecated in version 0.5.0 and will be' - 'removed in 0.6.0', DeprecationWarning) + warnings.warn( + '"pca_comps" parameter is not used. ' + 'It has been deprecated in version 0.5.0 and will be' + 'removed in 0.6.0. RCA will not do PCA preprocessing anymore. If ' + 'you still want to do it, you could use ' + '`sklearn.decomposition.PCA` and an `sklearn.pipeline.Pipeline`.', + DeprecationWarning) X, chunks = self._prepare_inputs(X, chunks, ensure_min_samples=2) - warnings.warn("RCA will no longer be trained on a preprocessed version " - "of the input as before. If you want to do some " - "preprocessing, you should do it manually (you can also use " - "an `sklearn.pipeline.Pipeline` for instance). This warning " - "will disappear in version 0.6.0.", - ChangedBehaviorWarning) + warnings.warn( + "RCA will no longer center the data before training. If you want " + "to do some preprocessing, you should do it manually (you can also " + "use an `sklearn.pipeline.Pipeline` for instance). 
This warning " + "will disappear in version 0.6.0.", ChangedBehaviorWarning) chunks = np.asanyarray(chunks, dtype=int) chunk_mask, chunked_data = _chunk_mean_centering(X, chunks) diff --git a/test/metric_learn_test.py b/test/metric_learn_test.py index ac7bfab4..7b82088e 100644 --- a/test/metric_learn_test.py +++ b/test/metric_learn_test.py @@ -829,9 +829,11 @@ def test_deprecation_pca_comps(self): # TODO: remove in v.0.6 X, y = make_classification(random_state=42, n_samples=100) rca_supervised = RCA_Supervised(pca_comps=X.shape[1], num_chunks=20) - msg = ('"pca_comps" parameter is not used.' - ' It has been deprecated in version 0.5.0 and will be' - 'removed in 0.6.0') + msg = ('"pca_comps" parameter is not used. ' + 'It has been deprecated in version 0.5.0 and will be' + 'removed in 0.6.0. RCA will not do PCA preprocessing anymore. If ' + 'you still want to do it, you could use ' + '`sklearn.decomposition.PCA` and an `sklearn.pipeline.Pipeline`.') with pytest.warns(ChangedBehaviorWarning) as expected_msg: rca_supervised.fit(X, y) assert str(expected_msg[0].message) == msg @@ -845,10 +847,9 @@ def test_changedbehaviorwarning_preprocessing(self): # test that a ChangedBehaviorWarning is thrown when using RCA # TODO: remove in v.0.6 - msg = ("RCA will no longer be trained on a preprocessed version " - "of the input as before. If you want to do some " - "preprocessing, you should do it manually (you can also use " - "an `sklearn.pipeline.Pipeline` for instance). This warning " + msg = ("RCA will no longer center the data before training. If you want " + "to do some preprocessing, you should do it manually (you can also " + "use an `sklearn.pipeline.Pipeline` for instance). This warning " "will disappear in version 0.6.0.") X, y = make_classification(random_state=42, n_samples=100)
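Note for reviewers: the rank-deficiency warning and the test_rank_deficient_returns_warning test added above can be exercised outside the test suite as sketched below. The singular column is built exactly as in the test; using RCA_Supervised rather than the plain RCA of the test, and the PCA remedy that follows the warning's own advice, are illustrative assumptions.

    # Sketch only: trigger the new "inner covariance matrix is not invertible"
    # warning, then apply the remedy the warning suggests.
    import warnings

    from sklearn.datasets import load_iris
    from sklearn.decomposition import PCA
    from sklearn.pipeline import make_pipeline
    from metric_learn import RCA_Supervised

    X, y = load_iris(return_X_y=True)
    X[:, 3] = X[:, 0] + 3 * X[:, 1]  # fourth feature is now a linear combination

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        RCA_Supervised(num_chunks=30, chunk_size=2).fit(X, y)
    assert any('not invertible' in str(w.message) for w in caught)

    # Remedy: drop the redundant direction with PCA before fitting RCA.
    model = make_pipeline(PCA(n_components=3),
                          RCA_Supervised(num_chunks=30, chunk_size=2))
    model.fit(X, y)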