
Commit 8518517

wdevazelhes authored and bellet committed
[MRG] Remove preprocessing the data for RCA (#194)
* Remove initialization of the data for RCA
* Add deprecated flag for supervised version too
* Remove comment saying we'll do PCA
* Add ChangedBehaviorWarning and do tests
* Improve change behavior warning
* Update message in case covariance matrix is not invertible
* FIX: still ignore testing RCA while fixed in #198
* Some reformatting
* Fix test string
* TST: add test for warning message when covariance is not definite
* Address #194 (comment)
1 parent a22c2e6 · commit 8518517

3 files changed: +92 −29 lines

metric_learn/rca.py

Lines changed: 30 additions & 25 deletions
@@ -17,6 +17,7 @@
 from six.moves import xrange
 from sklearn import decomposition
 from sklearn.base import TransformerMixin
+from sklearn.exceptions import ChangedBehaviorWarning
 
 from ._util import _check_n_components
 from .base_metric import MahalanobisMixin
@@ -48,7 +49,7 @@ class RCA(MahalanobisMixin, TransformerMixin):
   """
 
   def __init__(self, n_components=None, num_dims='deprecated',
-               pca_comps=None, preprocessor=None):
+               pca_comps='deprecated', preprocessor=None):
     """Initialize the learner.
 
     Parameters
@@ -62,12 +63,10 @@ def __init__(self, n_components=None, num_dims='deprecated',
       `num_dims` was deprecated in version 0.5.0 and will
       be removed in 0.6.0. Use `n_components` instead.
 
-    pca_comps : int, float, None or string
-      Number of components to keep during PCA preprocessing.
-      If None (default), does not perform PCA.
-      If ``0 < pca_comps < 1``, it is used as
-      the minimum explained variance ratio.
-      See sklearn.decomposition.PCA for more details.
+    pca_comps : Not used
+      .. deprecated:: 0.5.0
+        `pca_comps` was deprecated in version 0.5.0 and will
+        be removed in 0.6.0.
 
     preprocessor : array-like, shape=(n_samples, n_features) or callable
       The preprocessor to call to get tuples from indices. If array-like,
@@ -83,8 +82,9 @@ def _check_dimension(self, rank, X):
     if rank < d:
       warnings.warn('The inner covariance matrix is not invertible, '
                     'so the transformation matrix may contain Nan values. '
-                    'You should adjust pca_comps to remove noise and '
-                    'redundant information.')
+                    'You should reduce the dimensionality of your input,'
+                    'for instance using `sklearn.decomposition.PCA` as a '
+                    'preprocessing step.')
 
     dim = _check_n_components(d, self.n_components)
     return dim
@@ -105,25 +105,33 @@ def fit(self, X, chunks):
                     ' It has been deprecated in version 0.5.0 and will be'
                     ' removed in 0.6.0. Use "n_components" instead',
                     DeprecationWarning)
+
+    if self.pca_comps != 'deprecated':
+      warnings.warn(
+          '"pca_comps" parameter is not used. '
+          'It has been deprecated in version 0.5.0 and will be'
+          'removed in 0.6.0. RCA will not do PCA preprocessing anymore. If '
+          'you still want to do it, you could use '
+          '`sklearn.decomposition.PCA` and an `sklearn.pipeline.Pipeline`.',
+          DeprecationWarning)
+
     X, chunks = self._prepare_inputs(X, chunks, ensure_min_samples=2)
 
-    # PCA projection to remove noise and redundant information.
-    if self.pca_comps is not None:
-      pca = decomposition.PCA(n_components=self.pca_comps)
-      X_t = pca.fit_transform(X)
-      M_pca = pca.components_
-    else:
-      X_t = X - X.mean(axis=0)
-      M_pca = None
+    warnings.warn(
+        "RCA will no longer center the data before training. If you want "
+        "to do some preprocessing, you should do it manually (you can also "
+        "use an `sklearn.pipeline.Pipeline` for instance). This warning "
+        "will disappear in version 0.6.0.", ChangedBehaviorWarning)
 
-    chunk_mask, chunked_data = _chunk_mean_centering(X_t, chunks)
+    chunks = np.asanyarray(chunks, dtype=int)
+    chunk_mask, chunked_data = _chunk_mean_centering(X, chunks)
 
     inner_cov = np.atleast_2d(np.cov(chunked_data, rowvar=0, bias=1))
-    dim = self._check_dimension(np.linalg.matrix_rank(inner_cov), X_t)
+    dim = self._check_dimension(np.linalg.matrix_rank(inner_cov), X)
 
     # Fisher Linear Discriminant projection
-    if dim < X_t.shape[1]:
-      total_cov = np.cov(X_t[chunk_mask], rowvar=0)
+    if dim < X.shape[1]:
+      total_cov = np.cov(X[chunk_mask], rowvar=0)
       tmp = np.linalg.lstsq(total_cov, inner_cov)[0]
       vals, vecs = np.linalg.eig(tmp)
       inds = np.argsort(vals)[:dim]
@@ -133,9 +141,6 @@ def fit(self, X, chunks):
     else:
       self.transformer_ = _inv_sqrtm(inner_cov).T
 
-    if M_pca is not None:
-      self.transformer_ = np.atleast_2d(self.transformer_.dot(M_pca))
-
     return self
 
 
@@ -155,7 +160,7 @@ class RCA_Supervised(RCA):
   """
 
   def __init__(self, num_dims='deprecated', n_components=None,
-               pca_comps=None, num_chunks=100, chunk_size=2,
+               pca_comps='deprecated', num_chunks=100, chunk_size=2,
               preprocessor=None):
    """Initialize the supervised version of `RCA`.
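
Note: both new messages point users to scikit-learn for the preprocessing that RCA no longer performs internally. Below is a minimal sketch (not part of this commit) of how the removed `pca_comps` behaviour can be reproduced with `sklearn.decomposition.PCA` inside an `sklearn.pipeline.Pipeline`; the dataset and the `n_components`/`num_chunks` values are illustrative only.

# Sketch only: reproducing the old PCA preprocessing outside of RCA.
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from metric_learn import RCA_Supervised

X, y = make_classification(random_state=42, n_samples=100)

# PCA centers the data and drops noisy/redundant directions, which is what
# the removed `pca_comps` option used to do inside RCA.fit.
rca_pipe = Pipeline([('pca', PCA(n_components=5)),
                     ('rca', RCA_Supervised(num_chunks=20, chunk_size=2))])
rca_pipe.fit(X, y)
X_embedded = rca_pipe.transform(X)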

test/metric_learn_test.py

Lines changed: 60 additions & 2 deletions
@@ -18,9 +18,10 @@
   HAS_SKGGM = False
 else:
   HAS_SKGGM = True
-from metric_learn import (LMNN, NCA, LFDA, Covariance, MLKR, MMC, RCA,
+from metric_learn import (LMNN, NCA, LFDA, Covariance, MLKR, MMC,
                           LSML_Supervised, ITML_Supervised, SDML_Supervised,
-                          RCA_Supervised, MMC_Supervised, SDML, ITML, LSML)
+                          RCA_Supervised, MMC_Supervised, SDML, RCA, ITML,
+                          LSML)
 # Import this specially for testing.
 from metric_learn.constraints import wrap_pairs
 from metric_learn.lmnn import _sum_outer_products
@@ -837,6 +838,63 @@ def test_feature_null_variance(self):
     csep = class_separation(rca.transform(X), self.iris_labels)
     self.assertLess(csep, 0.30)
 
+  def test_deprecation_pca_comps(self):
+    # test that a deprecation message is thrown if pca_comps is set at
+    # initialization
+    # TODO: remove in v.0.6
+    X, y = make_classification(random_state=42, n_samples=100)
+    rca_supervised = RCA_Supervised(pca_comps=X.shape[1], num_chunks=20)
+    msg = ('"pca_comps" parameter is not used. '
+           'It has been deprecated in version 0.5.0 and will be'
+           'removed in 0.6.0. RCA will not do PCA preprocessing anymore. If '
+           'you still want to do it, you could use '
+           '`sklearn.decomposition.PCA` and an `sklearn.pipeline.Pipeline`.')
+    with pytest.warns(ChangedBehaviorWarning) as expected_msg:
+      rca_supervised.fit(X, y)
+    assert str(expected_msg[0].message) == msg
+
+    rca = RCA(pca_comps=X.shape[1])
+    with pytest.warns(ChangedBehaviorWarning) as expected_msg:
+      rca.fit(X, y)
+    assert str(expected_msg[0].message) == msg
+
+  def test_changedbehaviorwarning_preprocessing(self):
+    # test that a ChangedBehaviorWarning is thrown when using RCA
+    # TODO: remove in v.0.6
+
+    msg = ("RCA will no longer center the data before training. If you want "
+           "to do some preprocessing, you should do it manually (you can also "
+           "use an `sklearn.pipeline.Pipeline` for instance). This warning "
+           "will disappear in version 0.6.0.")
+
+    X, y = make_classification(random_state=42, n_samples=100)
+    rca_supervised = RCA_Supervised(num_chunks=20)
+    with pytest.warns(ChangedBehaviorWarning) as expected_msg:
+      rca_supervised.fit(X, y)
+    assert str(expected_msg[0].message) == msg
+
+    rca = RCA()
+    with pytest.warns(ChangedBehaviorWarning) as expected_msg:
+      rca.fit(X, y)
+    assert str(expected_msg[0].message) == msg
+
+  def test_rank_deficient_returns_warning(self):
+    """Checks that if the covariance matrix is not invertible, we raise a
+    warning message advising to use PCA"""
+    X, y = load_iris(return_X_y=True)
+    # we make the fourth column a linear combination of the two first,
+    # so that the covariance matrix will not be invertible:
+    X[:, 3] = X[:, 0] + 3 * X[:, 1]
+    rca = RCA()
+    msg = ('The inner covariance matrix is not invertible, '
+           'so the transformation matrix may contain Nan values. '
+           'You should reduce the dimensionality of your input,'
+           'for instance using `sklearn.decomposition.PCA` as a '
+           'preprocessing step.')
+    with pytest.warns(None) as raised_warnings:
+      rca.fit(X, y)
+    assert any(str(w.message) == msg for w in raised_warnings)
+
 
 @pytest.mark.parametrize('num_dims', [None, 2])
 def test_deprecation_num_dims_rca(num_dims):
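
Note: as `test_rank_deficient_returns_warning` shows, the new warning fires when the covariance of the data loses rank. A rough sketch (not part of this commit) of how a user could detect such redundancy up front and apply the PCA workaround the warning suggests; variable names and parameter values are illustrative only.

# Sketch only: checking for redundant features before fitting RCA.
import numpy as np
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from metric_learn import RCA_Supervised

X, y = load_iris(return_X_y=True)
X[:, 3] = X[:, 0] + 3 * X[:, 1]  # same redundancy the test introduces

# Coarse check on the total covariance; RCA itself checks the rank of the
# chunk-mean-centered ("inner") covariance during fit.
rank = np.linalg.matrix_rank(np.cov(X, rowvar=False))
if rank < X.shape[1]:
  # Keep only as many components as the covariance has rank, so the inner
  # covariance matrix used by RCA is more likely to stay invertible.
  X = PCA(n_components=rank).fit_transform(X)

RCA_Supervised(num_chunks=30, chunk_size=2).fit(X, y)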

test/test_base_metric.py

Lines changed: 2 additions & 2 deletions
@@ -89,13 +89,13 @@ def test_rca(self):
     self.assertEqual(remove_spaces(str(metric_learn.RCA())),
                      remove_spaces("RCA(n_components=None, "
                                    "num_dims='deprecated', "
-                                   "pca_comps=None, "
+                                   "pca_comps='deprecated', "
                                    "preprocessor=None)"))
     self.assertEqual(remove_spaces(str(metric_learn.RCA_Supervised())),
                      remove_spaces(
                          "RCA_Supervised(chunk_size=2, "
                          "n_components=None, num_chunks=100, "
-                         "num_dims='deprecated', pca_comps=None, "
+                         "num_dims='deprecated', pca_comps='deprecated', "
                          "preprocessor=None)"))
 
   def test_mlkr(self):
