[MRG] Remove preprocessing the data for RCA #194


Merged
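In short, after this PR RCA no longer runs PCA or mean-centering inside fit; any such preprocessing is now the user's responsibility. A rough before/after sketch of calling code, assuming only the names that appear in the diff below (the toy X and chunks are invented for illustration):

import numpy as np
from sklearn.decomposition import PCA
from metric_learn import RCA

X = np.random.RandomState(42).randn(100, 5)  # toy data
chunks = np.repeat(np.arange(10), 10)        # 10 chunklets of 10 points each

# Before this PR, RCA could run PCA internally via pca_comps:
#   rca = RCA(num_dims=2, pca_comps=3).fit(X, chunks)
# After this PR, pca_comps is deprecated; preprocess explicitly if needed:
rca = RCA(num_dims=2).fit(PCA(n_components=3).fit_transform(X), chunks)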
35 changes: 14 additions & 21 deletions metric_learn/rca.py
@@ -44,20 +44,18 @@ class RCA(MahalanobisMixin, TransformerMixin):
       The learned linear transformation ``L``.
   """
 
-  def __init__(self, num_dims=None, pca_comps=None, preprocessor=None):
+  def __init__(self, num_dims=None, pca_comps='deprecated', preprocessor=None):
     """Initialize the learner.
 
     Parameters
     ----------
     num_dims : int, optional
       embedding dimension (default: original dimension of data)
 
-    pca_comps : int, float, None or string
-      Number of components to keep during PCA preprocessing.
-      If None (default), does not perform PCA.
-      If ``0 < pca_comps < 1``, it is used as
-      the minimum explained variance ratio.
-      See sklearn.decomposition.PCA for more details.
+    pca_comps : Not used
+      .. deprecated:: 0.5.0
+        `pca_comps` was deprecated in version 0.5.0 and will
+        be removed in 0.6.0.
 
     preprocessor : array-like, shape=(n_samples, n_features) or callable
       The preprocessor to call to get tuples from indices. If array-like,
@@ -98,26 +96,24 @@ def fit(self, X, chunks):
       When ``chunks[i] == -1``, point i doesn't belong to any chunklet.
       When ``chunks[i] == j``, point i belongs to chunklet j.
     """
+    if self.pca_comps != 'deprecated':
+      warnings.warn('"pca_comps" parameter is not used.'
+                    ' It has been deprecated in version 0.5.0 and will be '
+                    'removed in 0.6.0', DeprecationWarning)
+
     X = self._prepare_inputs(X, ensure_min_samples=2)
 
-    # PCA projection to remove noise and redundant information.
-    if self.pca_comps is not None:
-      pca = decomposition.PCA(n_components=self.pca_comps)
-      X_t = pca.fit_transform(X)
-      M_pca = pca.components_
Member Author:
Note that this code was also providing a PCA initialization at the same time, so for now we remove it. I think I'll open the PR about initialization before merging this PR into master, and then we can merge it into this one to keep the same possibility of initializing with PCA.
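The composition being removed sits at the end of this diff: `transformer_.dot(M_pca)`. A minimal sketch of reproducing it by hand after this change, assuming `transformer_` and scikit-learn's `components_` behave as in the surrounding code, and ignoring PCA's mean offset just as the removed line did:

import numpy as np
from sklearn.decomposition import PCA
from metric_learn import RCA

X = np.random.RandomState(0).randn(100, 5)   # toy data
chunks = np.repeat(np.arange(10), 10)        # 10 chunklets of 10 points each

pca = PCA(n_components=3).fit(X)
rca = RCA(num_dims=2).fit(pca.transform(X), chunks)

# Compose the two linear maps so the result applies to raw features;
# this is what the removed `transformer_.dot(M_pca)` line computed.
L_composed = rca.transformer_.dot(pca.components_)  # shape (2, n_features)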

-    else:
-      X_t = X - X.mean(axis=0)
Member:
why is this centering step gone?

Member Author:
I guess because we should remove any preprocessing step, but I agree I didn't mention it at all. Maybe we should keep the ChangedBehaviorWarning message below, but replace "no longer trained on a preprocessed version" with "no longer trained on centered data by default", and encourage using a StandardScaler if needed?

Member:
Fair enough (I double-checked, and this centering is not part of standard RCA).
Maybe keep the ChangedBehaviorWarning but change it to "no longer center the data before training RCA" (no need to mention a scaler, I think).
And in the deprecation warning, add that PCA preprocessing should now be done by the user.

Finally, have you checked the influence of removing the centering step on the examples?
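A short sketch of the user-side centering discussed here, assuming one wants to reproduce the old default with scikit-learn's StandardScaler (with_std=False centers without rescaling):

import numpy as np
from sklearn.preprocessing import StandardScaler
from metric_learn import RCA

X = np.random.RandomState(0).randn(100, 5)
chunks = np.repeat(np.arange(10), 10)

# Equivalent to the removed `X_t = X - X.mean(axis=0)` step.
X_centered = StandardScaler(with_std=False).fit_transform(X)
rca = RCA(num_dims=2).fit(X_centered, chunks)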

-      M_pca = None
-
     chunks = np.asanyarray(chunks, dtype=int)
-    chunk_mask, chunked_data = _chunk_mean_centering(X_t, chunks)
+    chunk_mask, chunked_data = _chunk_mean_centering(X, chunks)

     inner_cov = np.atleast_2d(np.cov(chunked_data, rowvar=0, bias=1))
-    dim = self._check_dimension(np.linalg.matrix_rank(inner_cov), X_t)
+    dim = self._check_dimension(np.linalg.matrix_rank(inner_cov), X)

     # Fisher Linear Discriminant projection
-    if dim < X_t.shape[1]:
-      total_cov = np.cov(X_t[chunk_mask], rowvar=0)
+    if dim < X.shape[1]:
+      total_cov = np.cov(X[chunk_mask], rowvar=0)
       tmp = np.linalg.lstsq(total_cov, inner_cov)[0]
       vals, vecs = np.linalg.eig(tmp)
       inds = np.argsort(vals)[:dim]
@@ -127,9 +123,6 @@ def fit(self, X, chunks):
     else:
       self.transformer_ = _inv_sqrtm(inner_cov).T
 
-    if M_pca is not None:
-      self.transformer_ = np.atleast_2d(self.transformer_.dot(M_pca))
-
     return self
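For reference, `_inv_sqrtm` above takes an inverse matrix square root. A minimal numpy sketch of such a helper, as an assumption about its behaviour rather than this repo's implementation:

import numpy as np

def inv_sqrtm_sketch(M):
  # Inverse square root of a symmetric positive-definite matrix
  # (such as a covariance matrix), via its eigendecomposition.
  vals, vecs = np.linalg.eigh(M)
  return (vecs / np.sqrt(vals)).dot(vecs.T)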


19 changes: 18 additions & 1 deletion test/metric_learn_test.py
@@ -18,7 +18,7 @@
   HAS_SKGGM = True
 from metric_learn import (LMNN, NCA, LFDA, Covariance, MLKR, MMC,
                           LSML_Supervised, ITML_Supervised, SDML_Supervised,
-                          RCA_Supervised, MMC_Supervised, SDML)
+                          RCA_Supervised, MMC_Supervised, SDML, RCA)
 # Import this specially for testing.
 from metric_learn.constraints import wrap_pairs
 from metric_learn.lmnn import python_LMNN
@@ -530,6 +530,23 @@ def test_feature_null_variance(self):
     csep = class_separation(rca.transform(X), self.iris_labels)
     self.assertLess(csep, 0.30)
 
+  def test_deprecation_pca_comps(self):
+    # test that a deprecation message is thrown if pca_comps is set at
+    # initialization
+    # TODO: remove in v.0.6
+    X, y = make_classification(random_state=42, n_samples=100)
+    rca_supervised = RCA_Supervised(pca_comps=X.shape[1], num_chunks=20)
+    msg = ('"pca_comps" parameter is not used.'
+           ' It has been deprecated in version 0.5.0 and will be '
+           'removed in 0.6.0')
+    assert_warns_message(DeprecationWarning, msg, rca_supervised.fit, X, y)
+
+    rca = RCA(pca_comps=X.shape[1])
+    msg = ('"pca_comps" parameter is not used.'
+           ' It has been deprecated in version 0.5.0 and will be '
+           'removed in 0.6.0')
+    assert_warns_message(DeprecationWarning, msg, rca.fit, X, y)


 class TestMLKR(MetricTestCase):
   def test_iris(self):
3 changes: 2 additions & 1 deletion test/test_base_metric.py
@@ -64,7 +64,8 @@ def test_sdml(self):

   def test_rca(self):
     self.assertEqual(str(metric_learn.RCA()),
-                     "RCA(num_dims=None, pca_comps=None, preprocessor=None)")
+                     "RCA(num_dims=None, pca_comps='deprecated', "
+                     "preprocessor=None)")
     self.assertEqual(str(metric_learn.RCA_Supervised()),
                      "RCA_Supervised(chunk_size=2, num_chunks=100, "
                      "num_dims=None, pca_comps=None,\n "
5 changes: 2 additions & 3 deletions test/test_sklearn_compat.py
@@ -89,9 +89,8 @@ def stable_init(self, sparsity_param=0.01, num_labeled='deprecated',
     dSDML.__init__ = stable_init
     check_estimator(dSDML)
 
-  # This fails because the default num_chunks isn't data-dependent.
-  # def test_rca(self):
-  #   check_estimator(RCA_Supervised)
+  def test_rca(self):
+    check_estimator(RCA_Supervised)


RNG = check_random_state(0)