Skip to content

[MRG] EHN add voting paramter for ClusterCentroids #318

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 30 commits into from
Aug 24, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
a68e8eb
EHN POC sparse handling for RandomUnderSampler
glemaitre Aug 12, 2017
0062d6d
EHN support sparse ENN
glemaitre Aug 12, 2017
6197d80
iter
glemaitre Aug 12, 2017
f669843
EHN sparse indexing IHT
glemaitre Aug 12, 2017
4adc6db
EHN sparse support nearmiss
glemaitre Aug 12, 2017
9c93dab
Merge branch 'master' into is/158
glemaitre Aug 13, 2017
bba7835
EHN support sparse matrices for NCR
glemaitre Aug 13, 2017
9cd917b
EHN support sparse Tomek and OSS
glemaitre Aug 13, 2017
c3ba307
EHN support sparsity for CNN
glemaitre Aug 13, 2017
d195868
EHN support sparse for SMOTE
glemaitre Aug 13, 2017
bcf44ab
EHN support sparse adasyn
glemaitre Aug 13, 2017
c405aa9
EHN support sparsity for sombine methods
glemaitre Aug 13, 2017
79637d7
EHN support sparsity BC
glemaitre Aug 13, 2017
c199af9
DOC update docstring
glemaitre Aug 14, 2017
425928f
DOC fix example topic classification
glemaitre Aug 14, 2017
4ba8c4e
FIX fix test and class clustercentroids
glemaitre Aug 14, 2017
8298fdc
TST add common test
glemaitre Aug 14, 2017
e4c6ebb
TST add ensemble
glemaitre Aug 14, 2017
1226a91
TST use allclose
glemaitre Aug 14, 2017
68b16b5
TST install conda with ubuntu container
glemaitre Aug 14, 2017
35c638b
TST increase tolerance
glemaitre Aug 14, 2017
004f920
TST increase tolerance
glemaitre Aug 14, 2017
d3ceb5a
TST test all versions NearMiss and SMOTE
glemaitre Aug 14, 2017
d9c4e55
TST set the algorithm of KMeans
glemaitre Aug 14, 2017
b469747
DOC add entry in user guide
glemaitre Aug 14, 2017
c05d0ba
DOC add entry sparse for CC
glemaitre Aug 14, 2017
1625879
DOC whatsnew entry
glemaitre Aug 14, 2017
72a605d
EHN add voting paramter for ClusterCentroids
glemaitre Aug 14, 2017
e1ffb13
TST fix common test fixing voting
glemaitre Aug 14, 2017
6c34e56
Merge remote-tracking branch 'origin/master' into is/317
glemaitre Aug 24, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion doc/introduction.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ Introduction
API's of imbalanced-learn samplers
----------------------------------

The available samplers follows the scikit-learn API using the base estimator and adding a sampling functionality throw the ``sample`` method::
The available samplers follow the scikit-learn API, using the base estimator
and adding a sampling functionality through the ``sample`` method::

:Estimator:

Expand Down
4 changes: 4 additions & 0 deletions doc/whats_new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,10 @@ Bug fixes
New features
~~~~~~~~~~~~

- :class:`under_sampling.ClusterCentroids` accepts a parameter ``voting``
  which allows using the nearest neighbors of the centroids instead of the
  centroids themselves. It is more efficient for sparse input. By
  `Guillaume Lemaitre`_.

- Turn off steps in :class:`pipeline.Pipeline` using the `None`
object. By `Christos Aridas`_.

Expand Down
40 changes: 31 additions & 9 deletions examples/under-sampling/plot_cluster_centroids.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
n_informative=3, n_redundant=1, flip_y=0,
n_features=20, n_clusters_per_class=1,
n_samples=200, random_state=10)
n_samples=50, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
Expand All @@ -34,25 +34,46 @@
# Apply Cluster Centroids
cc = ClusterCentroids()
X_resampled, y_resampled = cc.fit_sample(X, y)
X_res_vis = pca.transform(X_resampled)
X_res_vis_soft = pca.transform(X_resampled)

# Use hard voting instead of soft voting
cc = ClusterCentroids(voting='hard')
X_resampled, y_resampled = cc.fit_sample(X, y)
X_res_vis_hard = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)
f, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5))

c0 = ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0",
alpha=0.5)
c1 = ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1",
alpha=0.5)
ax1.set_title('Original set')

ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
ax2.scatter(X_res_vis_soft[y_resampled == 0, 0],
X_res_vis_soft[y_resampled == 0, 1],
label="Class #0", alpha=.5)
ax2.scatter(X_res_vis_soft[y_resampled == 1, 0],
X_res_vis_soft[y_resampled == 1, 1],
label="Class #1", alpha=.5)
c2 = ax2.scatter(X_vis[y == 1, 0],
X_vis[y == 1, 1], label="Original #1",
alpha=0.2)
ax2.set_title('Cluster centroids with soft voting')

ax3.scatter(X_res_vis_hard[y_resampled == 0, 0],
X_res_vis_hard[y_resampled == 0, 1],
label="Class #0", alpha=.5)
ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
ax3.scatter(X_res_vis_hard[y_resampled == 1, 0],
X_res_vis_hard[y_resampled == 1, 1],
label="Class #1", alpha=.5)
ax2.set_title('Cluster centroids')
ax3.scatter(X_vis[y == 1, 0],
X_vis[y == 1, 1],
alpha=0.2)
ax3.set_title('Cluster centroids with hard voting')

# make nice plotting
for ax in (ax1, ax2):
for ax in (ax1, ax2, ax3):
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.get_xaxis().tick_bottom()
Expand All @@ -62,7 +83,8 @@
ax.set_xlim([-6, 8])
ax.set_ylim([-6, 6])

plt.figlegend((c0, c1), ('Class #0', 'Class #1'), loc='lower center',
ncol=2, labelspacing=0.)
# Shared figure legend: three labels require three handles, so include c2
# (the faded "Original #1" scatter) — otherwise matplotlib silently drops
# the third label.
plt.figlegend((c0, c1, c2), ('Class #0', 'Class #1', 'Original Class #1'),
              loc='lower center',
              ncol=3, labelspacing=0.)
plt.tight_layout(pad=3)
plt.show()
68 changes: 55 additions & 13 deletions imblearn/under_sampling/prototype_generation/cluster_centroids.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,13 @@
from scipy import sparse

from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
from sklearn.utils import safe_indexing

from ..base import BaseUnderSampler

VOTING_KIND = ('auto', 'hard', 'soft')


class ClusterCentroids(BaseUnderSampler):
"""Perform under-sampling by generating centroids based on
Expand Down Expand Up @@ -58,6 +61,18 @@ class ClusterCentroids(BaseUnderSampler):
estimator : object, optional(default=KMeans())
Pass a :class:`sklearn.cluster.KMeans` estimator.

voting : str, optional (default='auto')
Voting strategy to generate the new samples:

- If ``'hard'``, the nearest-neighbors of the centroids found using the
clustering algorithm will be used.
- If ``'soft'``, the centroids found by the clustering algorithm will
be used.
- If ``'auto'``, ``'hard'`` voting is used when the input is sparse;
  otherwise, ``'soft'`` voting is used.

.. versionadded:: 0.3.0

n_jobs : int, optional (default=1)
The number of threads to open if possible.

Expand Down Expand Up @@ -91,10 +106,12 @@ def __init__(self,
ratio='auto',
random_state=None,
estimator=None,
voting='auto',
n_jobs=1):
super(ClusterCentroids, self).__init__(
ratio=ratio, random_state=random_state)
self.estimator = estimator
self.voting = voting
self.n_jobs = n_jobs

def _validate_estimator(self):
Expand All @@ -108,6 +125,22 @@ def _validate_estimator(self):
raise ValueError('`estimator` has to be a KMeans clustering.'
' Got {} instead.'.format(type(self.estimator)))

def _generate_sample(self, X, y, centroids, target_class):
# Build the resampled block for one ``target_class`` from its cluster
# ``centroids``, according to the resolved ``voting_`` strategy
# (set in ``_sample`` before this is called).
if self.voting_ == 'hard':
# Hard voting: replace each centroid by the nearest original sample,
# so every output row is an actual member of X.
nearest_neighbors = NearestNeighbors(n_neighbors=1)
nearest_neighbors.fit(X, y)
indices = nearest_neighbors.kneighbors(centroids,
return_distance=False)
X_new = safe_indexing(X, np.squeeze(indices))
else:
# Soft voting: use the centroids themselves; mirror the sparseness
# of the input so the output container type matches X.
if sparse.issparse(X):
X_new = sparse.csr_matrix(centroids)
else:
X_new = centroids
# One label per generated centroid/neighbor row.
y_new = np.array([target_class] * centroids.shape[0])

return X_new, y_new

def _sample(self, X, y):
"""Resample the dataset.

Expand All @@ -131,28 +164,37 @@ def _sample(self, X, y):
"""
self._validate_estimator()

idx_under = np.empty((0, ), dtype=int)
centroids, y_resampled = [], []
if self.voting == 'auto':
if sparse.issparse(X):
self.voting_ = 'hard'
else:
self.voting_ = 'soft'
else:
if self.voting in VOTING_KIND:
self.voting_ = self.voting
else:
raise ValueError("'voting' needs to be one of {}. Got {}"
" instead.".format(VOTING_KIND, self.voting))

X_resampled, y_resampled = [], []
for target_class in np.unique(y):
if target_class in self.ratio_.keys():
n_samples = self.ratio_[target_class]
self.estimator_.set_params(**{'n_clusters': n_samples})
self.estimator_.fit(X[y == target_class])
centroids.append(self.estimator_.cluster_centers_)
y_resampled += [target_class] * n_samples

X_new, y_new = self._generate_sample(
X, y, self.estimator_.cluster_centers_, target_class)
X_resampled.append(X_new)
y_resampled.append(y_new)
else:
target_class_indices = np.flatnonzero(y == target_class)
idx_under = np.concatenate(
(idx_under, target_class_indices), axis=0)

X_resampled = np.concatenate((centroids))
X_resampled.append(safe_indexing(X, target_class_indices))
y_resampled.append(safe_indexing(y, target_class_indices))

if sparse.issparse(X):
X_resampled = sparse.vstack([sparse.csr_matrix(X_resampled),
safe_indexing(X, idx_under)])
X_resampled = sparse.vstack(X_resampled)
else:
X_resampled = np.vstack((X_resampled, safe_indexing(X, idx_under)))
y_resampled = np.hstack((y_resampled, safe_indexing(y, idx_under)))
X_resampled = np.vstack(X_resampled)
y_resampled = np.hstack(y_resampled)

return X_resampled, np.array(y_resampled)
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@
from collections import Counter

import numpy as np
from scipy import sparse
from pytest import raises

from sklearn.utils.testing import assert_allclose
from sklearn.utils.testing import assert_array_equal

from sklearn.cluster import KMeans

from imblearn.under_sampling import ClusterCentroids
Expand All @@ -23,17 +23,26 @@
R_TOL = 1e-4


def test_fit_sample_check_voting():
# Check that ``voting='auto'`` resolves to 'soft' for dense input and to
# 'hard' for sparse input after fitting.
cc = ClusterCentroids(random_state=RND_SEED)
cc.fit_sample(X, Y)
assert cc.voting_ == 'soft'
cc = ClusterCentroids(random_state=RND_SEED)
cc.fit_sample(sparse.csr_matrix(X), Y)
assert cc.voting_ == 'hard'


def test_fit_sample_auto():
ratio = 'auto'
cc = ClusterCentroids(ratio=ratio, random_state=RND_SEED)
X_resampled, y_resampled = cc.fit_sample(X, Y)
X_gt = np.array([[0.06738818, -0.529627],
[0.17901516, 0.69860992],
[0.094035, -2.55298982],
[0.92923648, 0.76103773],
X_gt = np.array([[0.92923648, 0.76103773],
[0.47104475, 0.44386323],
[0.13347175, 0.12167502]])
y_gt = np.array([1, 1, 1, 0, 0, 0])
[0.13347175, 0.12167502],
[0.06738818, -0.529627],
[0.17901516, 0.69860992],
[0.094035, -2.55298982]])
y_gt = np.array([0, 0, 0, 1, 1, 1])
assert_allclose(X_resampled, X_gt, rtol=R_TOL)
assert_array_equal(y_resampled, y_gt)

Expand All @@ -42,16 +51,16 @@ def test_fit_sample_half():
ratio = .5
cc = ClusterCentroids(ratio=ratio, random_state=RND_SEED)
X_resampled, y_resampled = cc.fit_sample(X, Y)
X_gt = np.array([[0.09125309, -0.85409574],
X_gt = np.array([[0.92923648, 0.76103773],
[0.47104475, 0.44386323],
[0.13347175, 0.12167502],
[0.09125309, -0.85409574],
[0.19220316, 0.32337101],
[0.094035, -2.55298982],
[0.20792588, 1.49407907],
[0.04352327, -0.20515826],
[0.12372842, 0.6536186],
[0.92923648, 0.76103773],
[0.47104475, 0.44386323],
[0.13347175, 0.12167502]])
y_gt = np.array([1, 1, 1, 1, 1, 1, 0, 0, 0])
[0.12372842, 0.6536186]])
y_gt = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1])
assert_allclose(X_resampled, X_gt, rtol=R_TOL)
assert_array_equal(y_resampled, y_gt)

Expand All @@ -75,21 +84,48 @@ def test_fit_sample_object():
ratio=ratio, random_state=RND_SEED, estimator=cluster)

X_resampled, y_resampled = cc.fit_sample(X, Y)
X_gt = np.array([[0.06738818, -0.529627],
X_gt = np.array([[0.92923648, 0.76103773],
[0.47104475, 0.44386323],
[0.13347175, 0.12167502],
[0.06738818, -0.529627],
[0.17901516, 0.69860992],
[0.094035, -2.55298982],
[0.92923648, 0.76103773],
[0.094035, -2.55298982]])
y_gt = np.array([0, 0, 0, 1, 1, 1])
assert_allclose(X_resampled, X_gt, rtol=R_TOL)
assert_array_equal(y_resampled, y_gt)


def test_fit_hard_voting():
ratio = 'auto'
voting = 'hard'
cluster = KMeans(random_state=RND_SEED)
cc = ClusterCentroids(
ratio=ratio, random_state=RND_SEED, estimator=cluster,
voting=voting)

X_resampled, y_resampled = cc.fit_sample(X, Y)
X_gt = np.array([[0.92923648, 0.76103773],
[0.47104475, 0.44386323],
[0.13347175, 0.12167502]])
y_gt = np.array([1, 1, 1, 0, 0, 0])
[0.13347175, 0.12167502],
[0.09125309, -0.85409574],
[0.12372842, 0.6536186],
[0.094035, -2.55298982]])
y_gt = np.array([0, 0, 0, 1, 1, 1])
assert_allclose(X_resampled, X_gt, rtol=R_TOL)
assert_array_equal(y_resampled, y_gt)
for x in X_resampled:
assert np.any(np.all(x == X, axis=1))


def test_fit_sample_wrong_object():
def test_fit_sample_error():
ratio = 'auto'
cluster = 'rnd'
cc = ClusterCentroids(
ratio=ratio, random_state=RND_SEED, estimator=cluster)
with raises(ValueError, match="has to be a KMeans clustering"):
cc.fit_sample(X, Y)

voting = 'unknown'
cc = ClusterCentroids(ratio=ratio, voting=voting, random_state=RND_SEED)
with raises(ValueError, match="needs to be one of"):
cc.fit_sample(X, Y)
1 change: 1 addition & 0 deletions imblearn/utils/estimator_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,7 @@ def check_samplers_sparse(name, Sampler):
elif isinstance(Sampler(), ClusterCentroids):
# set KMeans to full since it support sparse and dense
samplers = [Sampler(random_state=0,
voting='soft',
estimator=KMeans(random_state=1,
algorithm='full'))]
else:
Expand Down