From a68e8eb00e52a5698b2c61804fec015d143a3c54 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 12 Aug 2017 13:01:11 +0200 Subject: [PATCH 01/28] EHN POC sparse handling for RandomUnderSampler --- imblearn/base.py | 21 +++---- imblearn/over_sampling/base.py | 58 +++++++++++++++++++ .../random_under_sampler.py | 25 +++----- 3 files changed, 77 insertions(+), 27 deletions(-) diff --git a/imblearn/base.py b/imblearn/base.py index af3d0536d..08b1b6adf 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -38,24 +38,25 @@ def sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {array-like, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. - y_resampled : ndarray, shape (n_samples_new) + y_resampled : array-like, shape (n_samples_new) The corresponding label of `X_resampled` """ # Check the consistency of X and y - X, y = check_X_y(X, y) + X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) check_is_fitted(self, 'ratio_') self._check_X_y(X, y) @@ -70,7 +71,7 @@ def fit_sample(self, X, y): X : ndarray, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : ndarray, shape (n_samples,) Corresponding label for each sample in X. Returns @@ -78,7 +79,7 @@ def fit_sample(self, X, y): X_resampled : ndarray, shape (n_samples_new, n_features) The array containing the resampled data. - y_resampled : ndarray, shape (n_samples_new) + y_resampled : ndarray, shape (n_samples_new,) The corresponding label of `X_resampled` """ @@ -138,10 +139,10 @@ def fit(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns @@ -150,7 +151,7 @@ def fit(self, X, y): Return self. """ - X, y = check_X_y(X, y) + X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) y = check_target_type(y) self.X_hash_, self.y_hash_ = hash_X_y(X, y) # self.sampling_type is already checked in check_ratio diff --git a/imblearn/over_sampling/base.py b/imblearn/over_sampling/base.py index 9c1f6d51b..175ee4e47 100644 --- a/imblearn/over_sampling/base.py +++ b/imblearn/over_sampling/base.py @@ -5,6 +5,8 @@ # Christos Aridas # License: MIT +from sklearn.utils import check_X_y + from ..base import BaseSampler @@ -16,3 +18,59 @@ class BaseOverSampler(BaseSampler): """ _sampling_type = 'over-sampling' + + def fit(self, X, y): + """Find the classes statistics before to perform sampling. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Matrix containing the data which have to be sampled. + + y : array-like, shape (n_samples,) + Corresponding label for each sample in X. + + Returns + ------- + self : object, + Return self. + + Notes + ----- + Over-samplers do not accept sparse matrices. + + """ + # over-sampling method does not handle sparse matrix + X, y = check_X_y(X, y) + + return super(BaseOverSampler, self).fit(X, y) + + def sample(self, X, y): + """Resample the dataset. 
+ + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Matrix containing the data which have to be sampled. + + y : array-like, shape (n_samples,) + Corresponding label for each sample in X. + + Returns + ------- + X_resampled : array-like, shape (n_samples_new, n_features) + The array containing the resampled data. + + y_resampled : array-like, shape (n_samples_new,) + The corresponding label of `X_resampled` + + Notes + ----- + Over-samplers do not accept sparse matrices. + + """ + + # Check the consistency of X and y + X, y = check_X_y(X, y) + + return super(BaseOverSampler, self).sample(X, y) diff --git a/imblearn/under_sampling/prototype_selection/random_under_sampler.py b/imblearn/under_sampling/prototype_selection/random_under_sampler.py index 9fa242363..5adfb8055 100644 --- a/imblearn/under_sampling/prototype_selection/random_under_sampler.py +++ b/imblearn/under_sampling/prototype_selection/random_under_sampler.py @@ -7,7 +7,7 @@ from __future__ import division import numpy as np -from sklearn.utils import check_random_state +from sklearn.utils import check_random_state, safe_indexing from ..base import BaseUnderSampler @@ -110,10 +110,7 @@ def _sample(self, X, y): """ random_state = check_random_state(self.random_state) - X_resampled = np.empty((0, X.shape[1]), dtype=X.dtype) - y_resampled = np.empty((0, ), dtype=y.dtype) - if self.return_indices: - idx_under = np.empty((0, ), dtype=int) + idx_under = np.empty((0, ), dtype=int) for target_class in np.unique(y): if target_class in self.ratio_.keys(): @@ -125,18 +122,12 @@ def _sample(self, X, y): else: index_target_class = slice(None) - X_resampled = np.concatenate( - (X_resampled, X[y == target_class][index_target_class]), - axis=0) - y_resampled = np.concatenate( - (y_resampled, y[y == target_class][index_target_class]), - axis=0) - if self.return_indices: - idx_under = np.concatenate( - (idx_under, np.flatnonzero(y == target_class)[ - index_target_class]), axis=0) + idx_under = np.concatenate( + (idx_under, np.flatnonzero(y == target_class)[ + index_target_class]), axis=0) if self.return_indices: - return X_resampled, y_resampled, idx_under + return (safe_indexing(X, idx_under), safe_indexing(y, idx_under), + idx_under) else: - return X_resampled, y_resampled + return safe_indexing(X, idx_under), safe_indexing(y, idx_under) From 0062d6d6c90119628b0abab51e9afa36a79db46f Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 12 Aug 2017 13:49:11 +0200 Subject: [PATCH 02/28] EHN support sparse ENN --- .../edited_nearest_neighbours.py | 32 ++++++++----------- 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/imblearn/under_sampling/prototype_selection/edited_nearest_neighbours.py b/imblearn/under_sampling/prototype_selection/edited_nearest_neighbours.py index 5d5011542..afc6cdbc9 100644 --- a/imblearn/under_sampling/prototype_selection/edited_nearest_neighbours.py +++ b/imblearn/under_sampling/prototype_selection/edited_nearest_neighbours.py @@ -14,6 +14,8 @@ import numpy as np from scipy.stats import mode +from sklearn.utils import safe_indexing + from ..base import BaseCleaningSampler from ...utils import check_neighbors_object from ...utils.deprecation import deprecate_parameter @@ -167,20 +169,20 @@ def _sample(self, X, y): """ self._validate_estimator() - X_resampled = np.empty((0, X.shape[1]), dtype=X.dtype) - y_resampled = np.empty((0, ), dtype=y.dtype) - if self.return_indices: - idx_under = np.empty((0, ), dtype=int) + idx_under = np.empty((0, ), dtype=int) self.nn_.fit(X) for 
target_class in np.unique(y): if target_class in self.ratio_.keys(): - X_class = X[y == target_class] - y_class = y[y == target_class] + target_class_indices = np.flatnonzero(y == target_class) + X_class = safe_indexing(X, target_class_indices) + y_class = safe_indexing(y, target_class_indices) + print(target_class_indices) nnhood_idx = self.nn_.kneighbors( X_class, return_distance=False)[:, 1:] nnhood_label = y[nnhood_idx] + print(nnhood_idx) if self.kind_sel == 'mode': nnhood_label, _ = mode(nnhood_label, axis=1) nnhood_bool = np.ravel(nnhood_label) == y_class @@ -191,21 +193,15 @@ def _sample(self, X, y): else: index_target_class = slice(None) - X_resampled = np.concatenate( - (X_resampled, X[y == target_class][index_target_class]), - axis=0) - y_resampled = np.concatenate( - (y_resampled, y[y == target_class][index_target_class]), - axis=0) - if self.return_indices: - idx_under = np.concatenate( - (idx_under, np.flatnonzero(y == target_class)[ - index_target_class]), axis=0) + idx_under = np.concatenate( + (idx_under, np.flatnonzero(y == target_class)[ + index_target_class]), axis=0) if self.return_indices: - return X_resampled, y_resampled, idx_under + return (safe_indexing(X, idx_under), safe_indexing(y, idx_under), + idx_under) else: - return X_resampled, y_resampled + return safe_indexing(X, idx_under), safe_indexing(y, idx_under) class RepeatedEditedNearestNeighbours(BaseCleaningSampler): From 6197d80539fa557e21edeb7fa3b80d7426892ae6 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 12 Aug 2017 15:53:00 +0200 Subject: [PATCH 03/28] iter --- .../prototype_selection/edited_nearest_neighbours.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/imblearn/under_sampling/prototype_selection/edited_nearest_neighbours.py b/imblearn/under_sampling/prototype_selection/edited_nearest_neighbours.py index afc6cdbc9..aa7fda65c 100644 --- a/imblearn/under_sampling/prototype_selection/edited_nearest_neighbours.py +++ b/imblearn/under_sampling/prototype_selection/edited_nearest_neighbours.py @@ -178,11 +178,9 @@ def _sample(self, X, y): target_class_indices = np.flatnonzero(y == target_class) X_class = safe_indexing(X, target_class_indices) y_class = safe_indexing(y, target_class_indices) - print(target_class_indices) nnhood_idx = self.nn_.kneighbors( X_class, return_distance=False)[:, 1:] nnhood_label = y[nnhood_idx] - print(nnhood_idx) if self.kind_sel == 'mode': nnhood_label, _ = mode(nnhood_label, axis=1) nnhood_bool = np.ravel(nnhood_label) == y_class From f6698433250323bbe7a401f3e6e341092267c5d7 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 12 Aug 2017 16:35:27 +0200 Subject: [PATCH 04/28] EHN sparse indexing IHT --- .../instance_hardness_threshold.py | 30 ++++++++----------- 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/imblearn/under_sampling/prototype_selection/instance_hardness_threshold.py b/imblearn/under_sampling/prototype_selection/instance_hardness_threshold.py index 7de2f9cdb..551751599 100644 --- a/imblearn/under_sampling/prototype_selection/instance_hardness_threshold.py +++ b/imblearn/under_sampling/prototype_selection/instance_hardness_threshold.py @@ -16,6 +16,7 @@ from sklearn.base import ClassifierMixin from sklearn.ensemble import RandomForestClassifier from sklearn.externals.six import string_types +from sklearn.utils import safe_indexing from ..base import BaseCleaningSampler @@ -219,8 +220,10 @@ def _sample(self, X, y): probabilities = np.zeros(y.shape[0], dtype=float) for train_index, test_index in skf: - X_train, X_test = 
X[train_index], X[test_index] - y_train, y_test = y[train_index], y[test_index] + X_train = safe_indexing(X, train_index) + X_test = safe_indexing(X, test_index) + y_train = safe_indexing(y, train_index) + y_test = safe_indexing(y, test_index) self.estimator_.fit(X_train, y_train) @@ -231,10 +234,7 @@ def _sample(self, X, y): for l, c in enumerate(y_test) ] - X_resampled = np.empty((0, X.shape[1]), dtype=X.dtype) - y_resampled = np.empty((0, ), dtype=y.dtype) - if self.return_indices: - idx_under = np.empty((0, ), dtype=int) + idx_under = np.empty((0, ), dtype=int) for target_class in np.unique(y): if target_class in self.ratio_.keys(): @@ -247,18 +247,12 @@ def _sample(self, X, y): else: index_target_class = slice(None) - X_resampled = np.concatenate( - (X_resampled, X[y == target_class][index_target_class]), - axis=0) - y_resampled = np.concatenate( - (y_resampled, y[y == target_class][index_target_class]), - axis=0) - if self.return_indices: - idx_under = np.concatenate( - (idx_under, np.flatnonzero(y == target_class)[ - index_target_class]), axis=0) + idx_under = np.concatenate( + (idx_under, np.flatnonzero(y == target_class)[ + index_target_class]), axis=0) if self.return_indices: - return X_resampled, y_resampled, idx_under + return (safe_indexing(X, idx_under), safe_indexing(y, idx_under), + idx_under) else: - return X_resampled, y_resampled + return safe_indexing(X, idx_under), safe_indexing(y, idx_under) From 4adc6dbe3c3e7dfb6b65a3855496398978c85b37 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 12 Aug 2017 16:50:33 +0200 Subject: [PATCH 05/28] EHN sparse support nearmiss --- .../prototype_selection/nearmiss.py | 43 +++++++++---------- 1 file changed, 20 insertions(+), 23 deletions(-) diff --git a/imblearn/under_sampling/prototype_selection/nearmiss.py b/imblearn/under_sampling/prototype_selection/nearmiss.py index c7622ee67..2c5771397 100644 --- a/imblearn/under_sampling/prototype_selection/nearmiss.py +++ b/imblearn/under_sampling/prototype_selection/nearmiss.py @@ -11,6 +11,8 @@ import numpy as np +from sklearn.utils import safe_indexing + from ..base import BaseUnderSampler from ...utils import check_neighbors_object from ...utils.deprecation import deprecate_parameter @@ -181,7 +183,9 @@ def _selection_dist_based(self, # Compute the distance considering the farthest neighbour dist_avg_vec = np.sum(dist_vec[:, -self.nn_.n_neighbors:], axis=1) - if dist_vec.shape[0] != X[y == key].shape[0]: + target_class_indices = np.flatnonzero(y == key) + if (dist_vec.shape[0] != safe_indexing(X, + target_class_indices).shape[0]): raise RuntimeError('The samples to be selected do not correspond' ' to the distance matrix given. 
Ensure that' ' both `X[y == key]` and `dist_vec` are' @@ -257,21 +261,20 @@ def _sample(self, X, y): """ self._validate_estimator() - X_resampled = np.empty((0, X.shape[1]), dtype=X.dtype) - y_resampled = np.empty((0, ), dtype=y.dtype) - if self.return_indices: - idx_under = np.empty((0, ), dtype=int) + idx_under = np.empty((0, ), dtype=int) target_stats = Counter(y) class_minority = min(target_stats, key=target_stats.get) + minority_class_indices = np.flatnonzero(y == class_minority) - self.nn_.fit(X[y == class_minority]) + self.nn_.fit(safe_indexing(X, minority_class_indices)) for target_class in np.unique(y): if target_class in self.ratio_.keys(): n_samples = self.ratio_[target_class] - X_class = X[y == target_class] - y_class = y[y == target_class] + target_class_indices = np.flatnonzero(y == target_class) + X_class = safe_indexing(X, target_class_indices) + y_class = safe_indexing(y, target_class_indices) if self.version == 1: dist_vec, idx_vec = self.nn_.kneighbors( @@ -288,10 +291,10 @@ def _sample(self, X, y): elif self.version == 3: self.nn_ver3_.fit(X_class) dist_vec, idx_vec = self.nn_ver3_.kneighbors( - X[y == class_minority]) + safe_indexing(X, minority_class_indices)) idx_vec_farthest = np.unique(idx_vec.reshape(-1)) - X_class_selected = X_class[idx_vec_farthest, :] - y_class_selected = y_class[idx_vec_farthest] + X_class_selected = safe_indexing(X_class, idx_vec_farthest) + y_class_selected = safe_indexing(y_class, idx_vec_farthest) dist_vec, idx_vec = self.nn_.kneighbors( X_class_selected, n_neighbors=self.nn_.n_neighbors) @@ -304,18 +307,12 @@ def _sample(self, X, y): else: index_target_class = slice(None) - X_resampled = np.concatenate( - (X_resampled, X[y == target_class][index_target_class]), - axis=0) - y_resampled = np.concatenate( - (y_resampled, y[y == target_class][index_target_class]), - axis=0) - if self.return_indices: - idx_under = np.concatenate( - (idx_under, np.flatnonzero(y == target_class)[ - index_target_class]), axis=0) + idx_under = np.concatenate( + (idx_under, np.flatnonzero(y == target_class)[ + index_target_class]), axis=0) if self.return_indices: - return X_resampled, y_resampled, idx_under + return (safe_indexing(X, idx_under), safe_indexing(y, idx_under), + idx_under) else: - return X_resampled, y_resampled + return safe_indexing(X, idx_under), safe_indexing(y, idx_under) From bba7835ce56b3b8118825d6c9dff48373b1d6501 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 13 Aug 2017 19:35:37 +0200 Subject: [PATCH 06/28] EHN support sparse matrices for NCR --- .../neighbourhood_cleaning_rule.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/imblearn/under_sampling/prototype_selection/neighbourhood_cleaning_rule.py b/imblearn/under_sampling/prototype_selection/neighbourhood_cleaning_rule.py index 45d19e34b..29ac7b6cf 100644 --- a/imblearn/under_sampling/prototype_selection/neighbourhood_cleaning_rule.py +++ b/imblearn/under_sampling/prototype_selection/neighbourhood_cleaning_rule.py @@ -11,6 +11,8 @@ import numpy as np from scipy.stats import mode +from sklearn.utils import safe_indexing + from ..base import BaseCleaningSampler from .edited_nearest_neighbours import EditedNearestNeighbours from ...utils import check_neighbors_object @@ -187,8 +189,9 @@ def _sample(self, X, y): (n_samples > X.shape[0] * self.threshold_cleaning))] self.nn_.fit(X) - X_class = X[y == class_minority] - y_class = y[y == class_minority] + class_minority_indices = y == class_minority + X_class = safe_indexing(X, 
class_minority_indices) + y_class = safe_indexing(y, class_minority_indices) nnhood_idx = self.nn_.kneighbors( X_class, return_distance=False)[:, 1:] nnhood_label = y[nnhood_idx] @@ -210,6 +213,15 @@ def _sample(self, X, y): selected_samples[union_a1_a2] = False index_target_class = np.flatnonzero(selected_samples) + if self.return_indices: + return (safe_indexing(X, index_target_class), + safe_indexing(y, index_target_class), + index_target_class) + else: + return (safe_indexing(X, index_target_class), + safe_indexing(y, index_target_class)) + + if self.return_indices: return (X[index_target_class], y[index_target_class], index_target_class) From 9cd917b7b606d980d3cabae92fd97806c7cc8bde Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 13 Aug 2017 20:14:33 +0200 Subject: [PATCH 07/28] EHN support sparse Tomek and OSS --- .../neighbourhood_cleaning_rule.py | 2 +- .../one_sided_selection.py | 73 +++++++------------ .../prototype_selection/tomek_links.py | 10 ++- 3 files changed, 36 insertions(+), 49 deletions(-) diff --git a/imblearn/under_sampling/prototype_selection/neighbourhood_cleaning_rule.py b/imblearn/under_sampling/prototype_selection/neighbourhood_cleaning_rule.py index 29ac7b6cf..e228d6794 100644 --- a/imblearn/under_sampling/prototype_selection/neighbourhood_cleaning_rule.py +++ b/imblearn/under_sampling/prototype_selection/neighbourhood_cleaning_rule.py @@ -189,7 +189,7 @@ def _sample(self, X, y): (n_samples > X.shape[0] * self.threshold_cleaning))] self.nn_.fit(X) - class_minority_indices = y == class_minority + class_minority_indices = np.flatnonzero(y == class_minority) X_class = safe_indexing(X, class_minority_indices) y_class = safe_indexing(y, class_minority_indices) nnhood_idx = self.nn_.kneighbors( diff --git a/imblearn/under_sampling/prototype_selection/one_sided_selection.py b/imblearn/under_sampling/prototype_selection/one_sided_selection.py index 1545300a4..698d634aa 100644 --- a/imblearn/under_sampling/prototype_selection/one_sided_selection.py +++ b/imblearn/under_sampling/prototype_selection/one_sided_selection.py @@ -10,7 +10,7 @@ import numpy as np from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors -from sklearn.utils import check_random_state +from sklearn.utils import check_random_state, safe_indexing from ..base import BaseCleaningSampler from .tomek_links import TomekLinks @@ -174,10 +174,7 @@ def _sample(self, X, y): target_stats = Counter(y) class_minority = min(target_stats, key=target_stats.get) - X_resampled = np.empty((0, X.shape[1]), dtype=X.dtype) - y_resampled = np.empty((0, ), dtype=y.dtype) - if self.return_indices: - idx_under = np.empty((0, ), dtype=int) + idx_under = np.empty((0, ), dtype=int) for target_class in np.unique(y): if target_class in self.ratio_.keys(): @@ -186,56 +183,42 @@ def _sample(self, X, y): idx_maj_sample = idx_maj[random_state.randint( low=0, high=target_stats[target_class], size=self.n_seeds_S)] - maj_sample = X[idx_maj_sample] + + minority_class_indices = np.flatnonzero(y == class_minority) + C_indices = np.append(minority_class_indices, idx_maj_sample) # create the set composed of all minority samples and one # sample from the current class. 
- C_x = np.append(X[y == class_minority], maj_sample, axis=0) - C_y = np.append(y[y == class_minority], [target_class] * - self.n_seeds_S) + C_x = safe_indexing(X, C_indices) + C_y = safe_indexing(y, C_indices) # create the set S with removing the seed from S # since that it will be added anyway idx_maj_extracted = np.delete(idx_maj, idx_maj_sample, axis=0) - S_x = X[idx_maj_extracted] - S_y = y[idx_maj_extracted] + S_x = safe_indexing(X, idx_maj_extracted) + S_y = safe_indexing(y, idx_maj_extracted) self.estimator_.fit(C_x, C_y) pred_S_y = self.estimator_.predict(S_x) - sel_x = S_x[np.flatnonzero(pred_S_y != S_y), :] - sel_y = S_y[np.flatnonzero(pred_S_y != S_y)] - if self.return_indices: - idx_tmp = idx_maj_extracted[ - np.flatnonzero(pred_S_y != S_y)] - idx_under = np.concatenate( - (idx_under, idx_maj_sample, idx_tmp), axis=0) - X_resampled = np.concatenate( - (X_resampled, maj_sample, sel_x), axis=0) - y_resampled = np.concatenate( - (y_resampled, [target_class] * self.n_seeds_S, sel_y), - axis=0) + S_misclassified_indices = np.flatnonzero(pred_S_y != S_y) + idx_tmp = idx_maj_extracted[S_misclassified_indices] + idx_under = np.concatenate( + (idx_under, idx_maj_sample, idx_tmp), axis=0) else: - X_resampled = np.concatenate( - (X_resampled, X[y == target_class]), axis=0) - y_resampled = np.concatenate( - (y_resampled, y[y == target_class]), axis=0) - if self.return_indices: - idx_under = np.concatenate( - (idx_under, np.flatnonzero(y == target_class)), axis=0) - - # find the nearest neighbour of every point - nn = NearestNeighbors(n_neighbors=2, n_jobs=self.n_jobs) - nn.fit(X_resampled) - nns = nn.kneighbors(X_resampled, return_distance=False)[:, 1] - - links = TomekLinks.is_tomek(y_resampled, nns, - [c for c in np.unique(y) - if (c != class_minority and - c in self.ratio_.keys())]) + idx_under = np.concatenate( + (idx_under, np.flatnonzero(y == target_class)), axis=0) + + X_resampled = safe_indexing(X, idx_under) + y_resampled = safe_indexing(y, idx_under) + + # apply Tomek cleaning + tl = TomekLinks(ratio='not minority', return_indices=True, + random_state=self.random_state) + X_cleaned, y_cleaned, idx_cleaned = tl.fit_sample(X_resampled, + y_resampled) + + idx_under = safe_indexing(idx_under, idx_cleaned) if self.return_indices: - return (X_resampled[np.logical_not(links)], - y_resampled[np.logical_not(links)], - idx_under[np.logical_not(links)]) + return (X_cleaned, y_cleaned, idx_under) else: - return (X_resampled[np.logical_not(links)], - y_resampled[np.logical_not(links)]) + return X_cleaned, y_cleaned diff --git a/imblearn/under_sampling/prototype_selection/tomek_links.py b/imblearn/under_sampling/prototype_selection/tomek_links.py index dba47ccb9..8d8a50067 100644 --- a/imblearn/under_sampling/prototype_selection/tomek_links.py +++ b/imblearn/under_sampling/prototype_selection/tomek_links.py @@ -9,6 +9,7 @@ import numpy as np from sklearn.neighbors import NearestNeighbors +from sklearn.utils import safe_indexing from ..base import BaseCleaningSampler @@ -169,9 +170,12 @@ def _sample(self, X, y): nns = nn.kneighbors(X, return_distance=False)[:, 1] links = self.is_tomek(y, nns, self.ratio_) + idx_under = np.flatnonzero(np.logical_not(links)) if self.return_indices: - return (X[np.logical_not(links)], y[np.logical_not(links)], - np.flatnonzero(np.logical_not(links))) + return (safe_indexing(X, idx_under), + safe_indexing(y, idx_under), + idx_under) else: - return X[np.logical_not(links)], y[np.logical_not(links)] + return (safe_indexing(X, idx_under), + safe_indexing(y, 
idx_under)) From c3ba30752338d0a348cb2b5ac2d4beac1318d8e5 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 13 Aug 2017 21:31:23 +0200 Subject: [PATCH 08/28] EHN support sparsity for CNN --- .../condensed_nearest_neighbour.py | 79 ++++++++----------- .../neighbourhood_cleaning_rule.py | 10 +-- .../one_sided_selection.py | 2 +- 3 files changed, 33 insertions(+), 58 deletions(-) diff --git a/imblearn/under_sampling/prototype_selection/condensed_nearest_neighbour.py b/imblearn/under_sampling/prototype_selection/condensed_nearest_neighbour.py index f7115176f..c3f743647 100644 --- a/imblearn/under_sampling/prototype_selection/condensed_nearest_neighbour.py +++ b/imblearn/under_sampling/prototype_selection/condensed_nearest_neighbour.py @@ -10,8 +10,11 @@ from collections import Counter import numpy as np + +from scipy.sparse import issparse + from sklearn.neighbors import KNeighborsClassifier -from sklearn.utils import check_random_state +from sklearn.utils import check_random_state, safe_indexing from ..base import BaseCleaningSampler from ...utils.deprecation import deprecate_parameter @@ -179,29 +182,27 @@ def _sample(self, X, y): random_state = check_random_state(self.random_state) target_stats = Counter(y) class_minority = min(target_stats, key=target_stats.get) - - X_resampled = np.empty((0, X.shape[1]), dtype=X.dtype) - y_resampled = np.empty((0, ), dtype=y.dtype) - if self.return_indices: - idx_under = np.empty((0, ), dtype=int) + idx_under = np.empty((0, ), dtype=int) for target_class in np.unique(y): if target_class in self.ratio_.keys(): # Randomly get one sample from the majority class # Generate the index to select - idx_maj_sample = random_state.randint( - low=0, high=target_stats[target_class], - size=self.n_seeds_S) - maj_sample = X[y == target_class][idx_maj_sample] + idx_maj = np.flatnonzero(y == target_class) + idx_maj_sample = idx_maj[random_state.randint( + low=0, high=target_stats[target_class], + size=self.n_seeds_S)] # Create the set C - One majority samples and all minority - C_x = np.append(X[y == class_minority], maj_sample, axis=0) - C_y = np.append(y[y == class_minority], - np.array([target_class] * self.n_seeds_S)) + C_indices = np.append(np.flatnonzero(y == class_minority), + idx_maj_sample) + C_x = safe_indexing(X, C_indices) + C_y = safe_indexing(y, C_indices) # Create the set S - all majority samples - S_x = X[y == target_class] - S_y = y[y == target_class] + S_indices = np.flatnonzero(y == target_class) + S_x = safe_indexing(X, S_indices) + S_y = safe_indexing(y, S_indices) # fit knn on C self.estimator_.fit(C_x, C_y) @@ -215,21 +216,21 @@ def _sample(self, X, y): continue # Classify on S - pred_y = self.estimator_.predict(x_sam.reshape(1, -1)) + if not issparse(x_sam): + x_sam = x_sam.reshape(1, -1) + pred_y = self.estimator_.predict(x_sam) # If the prediction do not agree with the true label # append it in C_x if y_sam != pred_y: # Keep the index for later - idx_maj_sample = np.append(idx_maj_sample, idx_sam) + idx_maj_sample = np.append(idx_maj_sample, + idx_maj[idx_sam]) # Update C - C_x = np.append(X[y == class_minority], - X[y == target_class][idx_maj_sample], - axis=0) - C_y = np.append(y[y == class_minority], - np.array([target_class] * - idx_maj_sample.size)) + C_indices = np.append(C_indices, idx_maj[idx_sam]) + C_x = safe_indexing(X, C_indices) + C_y = safe_indexing(y, C_indices) # fit a knn on C self.estimator_.fit(C_x, C_y) @@ -242,32 +243,14 @@ def _sample(self, X, y): np.append(idx_maj_sample, np.flatnonzero(pred_S_y == S_y))) - # 
Find the misclassified S_y - sel_x = S_x[idx_maj_sample, :] - sel_y = S_y[idx_maj_sample] - - # The indexes found are relative to the current class, we need - # to find the absolute value Build the array with the absolute - # position - abs_pos = np.flatnonzero(y == target_class) - idx_maj_sample = abs_pos[idx_maj_sample] - - # If we need to offer support for the indices selected - if self.return_indices: - idx_under = np.concatenate((idx_under, idx_maj_sample), - axis=0) - X_resampled = np.concatenate((X_resampled, sel_x), axis=0) - y_resampled = np.concatenate((y_resampled, sel_y), axis=0) + idx_under = np.concatenate((idx_under, idx_maj_sample), + axis=0) else: - X_resampled = np.concatenate( - (X_resampled, X[y == target_class]), axis=0) - y_resampled = np.concatenate( - (y_resampled, y[y == target_class]), axis=0) - if self.return_indices: - idx_under = np.concatenate( - (idx_under, np.flatnonzero(y == target_class)), axis=0) + idx_under = np.concatenate( + (idx_under, np.flatnonzero(y == target_class)), axis=0) if self.return_indices: - return X_resampled, y_resampled, idx_under + return (safe_indexing(X, idx_under), safe_indexing(y, idx_under), + idx_under) else: - return X_resampled, y_resampled + return safe_indexing(X, idx_under), safe_indexing(y, idx_under) diff --git a/imblearn/under_sampling/prototype_selection/neighbourhood_cleaning_rule.py b/imblearn/under_sampling/prototype_selection/neighbourhood_cleaning_rule.py index e228d6794..7bd31bb71 100644 --- a/imblearn/under_sampling/prototype_selection/neighbourhood_cleaning_rule.py +++ b/imblearn/under_sampling/prototype_selection/neighbourhood_cleaning_rule.py @@ -15,7 +15,7 @@ from ..base import BaseCleaningSampler from .edited_nearest_neighbours import EditedNearestNeighbours -from ...utils import check_neighbors_object +from ...utils import check_neighbors_object, check_ratio SEL_KIND = ('all', 'mode') @@ -168,7 +168,6 @@ def _sample(self, X, y): """ self._validate_estimator() - enn = EditedNearestNeighbours(ratio=self.ratio, return_indices=True, random_state=self.random_state, size_ngh=self.size_ngh, @@ -220,10 +219,3 @@ def _sample(self, X, y): else: return (safe_indexing(X, index_target_class), safe_indexing(y, index_target_class)) - - - if self.return_indices: - return (X[index_target_class], y[index_target_class], - index_target_class) - else: - return X[index_target_class], y[index_target_class] diff --git a/imblearn/under_sampling/prototype_selection/one_sided_selection.py b/imblearn/under_sampling/prototype_selection/one_sided_selection.py index 698d634aa..ebba708ab 100644 --- a/imblearn/under_sampling/prototype_selection/one_sided_selection.py +++ b/imblearn/under_sampling/prototype_selection/one_sided_selection.py @@ -212,7 +212,7 @@ def _sample(self, X, y): y_resampled = safe_indexing(y, idx_under) # apply Tomek cleaning - tl = TomekLinks(ratio='not minority', return_indices=True, + tl = TomekLinks(ratio=self.ratio_, return_indices=True, random_state=self.random_state) X_cleaned, y_cleaned, idx_cleaned = tl.fit_sample(X_resampled, y_resampled) From d195868e3139b89a0327318eb44403cb03bd1ea9 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 13 Aug 2017 23:42:09 +0200 Subject: [PATCH 09/28] EHN support sparse for SMOTE --- imblearn/over_sampling/base.py | 80 +++++------ imblearn/over_sampling/random_over_sampler.py | 20 ++- imblearn/over_sampling/smote.py | 128 ++++++++++++------ 3 files changed, 138 insertions(+), 90 deletions(-) diff --git a/imblearn/over_sampling/base.py b/imblearn/over_sampling/base.py 
index 175ee4e47..50efc6e74 100644 --- a/imblearn/over_sampling/base.py +++ b/imblearn/over_sampling/base.py @@ -19,58 +19,58 @@ class BaseOverSampler(BaseSampler): _sampling_type = 'over-sampling' - def fit(self, X, y): - """Find the classes statistics before to perform sampling. + # def fit(self, X, y): + # """Find the classes statistics before to perform sampling. - Parameters - ---------- - X : array-like, shape (n_samples, n_features) - Matrix containing the data which have to be sampled. + # Parameters + # ---------- + # X : array-like, shape (n_samples, n_features) + # Matrix containing the data which have to be sampled. - y : array-like, shape (n_samples,) - Corresponding label for each sample in X. + # y : array-like, shape (n_samples,) + # Corresponding label for each sample in X. - Returns - ------- - self : object, - Return self. + # Returns + # ------- + # self : object, + # Return self. - Notes - ----- - Over-samplers do not accept sparse matrices. + # Notes + # ----- + # Over-samplers do not accept sparse matrices. - """ - # over-sampling method does not handle sparse matrix - X, y = check_X_y(X, y) + # """ + # # over-sampling method does not handle sparse matrix + # X, y = check_X_y(X, y) - return super(BaseOverSampler, self).fit(X, y) + # return super(BaseOverSampler, self).fit(X, y) - def sample(self, X, y): - """Resample the dataset. + # def sample(self, X, y): + # """Resample the dataset. - Parameters - ---------- - X : array-like, shape (n_samples, n_features) - Matrix containing the data which have to be sampled. + # Parameters + # ---------- + # X : array-like, shape (n_samples, n_features) + # Matrix containing the data which have to be sampled. - y : array-like, shape (n_samples,) - Corresponding label for each sample in X. + # y : array-like, shape (n_samples,) + # Corresponding label for each sample in X. - Returns - ------- - X_resampled : array-like, shape (n_samples_new, n_features) - The array containing the resampled data. + # Returns + # ------- + # X_resampled : array-like, shape (n_samples_new, n_features) + # The array containing the resampled data. - y_resampled : array-like, shape (n_samples_new,) - The corresponding label of `X_resampled` + # y_resampled : array-like, shape (n_samples_new,) + # The corresponding label of `X_resampled` - Notes - ----- - Over-samplers do not accept sparse matrices. + # Notes + # ----- + # Over-samplers do not accept sparse matrices. 
- """ + # """ - # Check the consistency of X and y - X, y = check_X_y(X, y) + # # Check the consistency of X and y + # X, y = check_X_y(X, y) - return super(BaseOverSampler, self).sample(X, y) + # return super(BaseOverSampler, self).sample(X, y) diff --git a/imblearn/over_sampling/random_over_sampler.py b/imblearn/over_sampling/random_over_sampler.py index 9b164eee7..d70b45d32 100644 --- a/imblearn/over_sampling/random_over_sampler.py +++ b/imblearn/over_sampling/random_over_sampler.py @@ -8,7 +8,7 @@ from collections import Counter import numpy as np -from sklearn.utils import check_random_state +from sklearn.utils import check_random_state, safe_indexing from .base import BaseOverSampler @@ -102,19 +102,15 @@ def _sample(self, X, y): random_state = check_random_state(self.random_state) target_stats = Counter(y) - X_resampled = X.copy() - y_resampled = y.copy() + sample_indices = range(X.shape[0]) for class_sample, num_samples in self.ratio_.items(): - index_samples = random_state.randint( + target_class_indices = np.flatnonzero(y == class_sample) + indices = random_state.randint( low=0, high=target_stats[class_sample], size=num_samples) - X_resampled = np.concatenate((X_resampled, - X[y == class_sample][index_samples]), - axis=0) + sample_indices = np.append(sample_indices, + target_class_indices[indices]) - y_resampled = np.concatenate((y_resampled, - y[y == class_sample][index_samples]), - axis=0) - - return X_resampled, y_resampled + return (safe_indexing(X, sample_indices), + safe_indexing(y, sample_indices)) diff --git a/imblearn/over_sampling/smote.py b/imblearn/over_sampling/smote.py index 7902d178d..c1a8d7477 100644 --- a/imblearn/over_sampling/smote.py +++ b/imblearn/over_sampling/smote.py @@ -8,8 +8,11 @@ from __future__ import division import numpy as np + +from scipy import sparse + from sklearn.svm import SVC -from sklearn.utils import check_random_state +from sklearn.utils import check_random_state, safe_indexing from .base import BaseOverSampler from ..exceptions import raise_isinstance_error @@ -253,18 +256,34 @@ def _make_samples(self, """ random_state = check_random_state(self.random_state) - X_new = np.zeros((n_samples, X.shape[1])) - samples = random_state.randint( + samples_indices = random_state.randint( low=0, high=len(nn_num.flatten()), size=n_samples) steps = step_size * random_state.uniform(size=n_samples) - rows = np.floor_divide(samples, nn_num.shape[1]) - cols = np.mod(samples, nn_num.shape[1]) - for i, (sample, row, col, step) in enumerate(zip(samples, rows, - cols, steps)): - X_new[i] = X[row] - step * (X[row] - nn_data[nn_num[row, col]]) - y_new = np.array([y_type] * len(X_new)) + rows = np.floor_divide(samples_indices, nn_num.shape[1]) + cols = np.mod(samples_indices, nn_num.shape[1]) + + if sparse.issparse(X): + row_indices, col_indices, samples = [], [], [] + for i, (row, col, step) in enumerate(zip(rows, cols, steps)): + if X[row].nnz: + sample = X[row] - step * (X[row] - + nn_data[nn_num[row, col]]) + row_indices += [i] * len(sample.indices) + col_indices += sample.indices.tolist() + samples += sample.data.tolist() + else: + X_new = np.zeros((n_samples, X.shape[1])) + for i, (row, col, step) in enumerate(zip(rows, cols, steps)): + X_new[i] = X[row] - step * (X[row] - nn_data[nn_num[row, col]]) - return X_new, y_new + y_new = np.array([y_type] * len(samples_indices)) + + if sparse.issparse(X): + return (sparse.csr_matrix((samples, (row_indices, col_indices)), + [len(samples_indices), X.shape[1]]), + y_new) + else: + return X_new, y_new def 
_validate_estimator(self): """Create the necessary objects for SMOTE.""" @@ -326,21 +345,26 @@ def _sample_regular(self, X, y): intelligence research, 321-357, 2002. """ + X_resampled = X.copy() y_resampled = y.copy() for class_sample, n_samples in self.ratio_.items(): if n_samples == 0: continue - X_class = X[y == class_sample] + target_class_indices = np.flatnonzero(y == class_sample) + X_class = safe_indexing(X, target_class_indices) self.nn_k_.fit(X_class) nns = self.nn_k_.kneighbors(X_class, return_distance=False)[:, 1:] X_new, y_new = self._make_samples(X_class, class_sample, X_class, nns, n_samples, 1.0) - X_resampled = np.concatenate((X_resampled, X_new), axis=0) - y_resampled = np.concatenate((y_resampled, y_new), axis=0) + if sparse.issparse(X_new): + X_resampled = sparse.vstack([X_resampled, X_new]) + else: + X_resampled = np.vstack((X_resampled, X_new)) + y_resampled = np.hstack((y_resampled, y_new)) return X_resampled, y_resampled @@ -381,7 +405,8 @@ def _sample_borderline(self, X, y): for class_sample, n_samples in self.ratio_.items(): if n_samples == 0: continue - X_class = X[y == class_sample] + target_class_indices = np.flatnonzero(y == class_sample) + X_class = safe_indexing(X, target_class_indices) self.nn_m_.fit(X) danger_index = self._in_danger_noise(X_class, class_sample, y, @@ -391,16 +416,21 @@ def _sample_borderline(self, X, y): self.nn_k_.fit(X_class) nns = self.nn_k_.kneighbors( - X_class[danger_index], return_distance=False)[:, 1:] + safe_indexing(X_class, danger_index), + return_distance=False)[:, 1:] # divergence between borderline-1 and borderline-2 if self.kind == 'borderline1': # Create synthetic samples for borderline points. - X_new, y_new = self._make_samples(X_class[danger_index], + X_new, y_new = self._make_samples(safe_indexing(X_class, + danger_index), class_sample, X_class, nns, n_samples) - X_resampled = np.concatenate((X_resampled, X_new), axis=0) - y_resampled = np.concatenate((y_resampled, y_new), axis=0) + if sparse.issparse(X_new): + X_resampled = sparse.vstack([X_resampled, X_new]) + else: + X_resampled = np.vstack((X_resampled, X_new)) + y_resampled = np.hstack((y_resampled, y_new)) else: random_state = check_random_state(self.random_state) @@ -408,22 +438,26 @@ def _sample_borderline(self, X, y): # only minority X_new_1, y_new_1 = self._make_samples( - X_class[danger_index], class_sample, X_class, nns, + safe_indexing(X_class, danger_index), class_sample, + X_class, nns, int(fractions * (n_samples + 1)), step_size=1.) # we use a one-vs-rest policy to handle the multiclass in which # new samples will be created considering not only the majority # class but all over classes. 
X_new_2, y_new_2 = self._make_samples( - X_class[danger_index], class_sample, X[y != class_sample], + safe_indexing(X_class, danger_index), class_sample, + safe_indexing(X, np.flatnonzero(y != class_sample)), nns, int((1 - fractions) * n_samples), step_size=0.5) - # Concatenate the newly generated samples to the original - # data set - X_resampled = np.concatenate((X_resampled, X_new_1, X_new_2), - axis=0) - y_resampled = np.concatenate((y_resampled, y_new_1, y_new_2), - axis=0) + if sparse.issparse(X_resampled): + X_resampled = sparse.vstack([X_resampled, + X_new_1, X_new_2]) + else: + X_resampled = np.vstack((X_resampled, + X_new_1, X_new_2)) + y_resampled = np.hstack((y_resampled, + y_new_1, y_new_2)) return X_resampled, y_resampled @@ -463,17 +497,20 @@ def _sample_svm(self, X, y): for class_sample, n_samples in self.ratio_.items(): if n_samples == 0: continue - X_class = X[y == class_sample] + target_class_indices = np.flatnonzero(y == class_sample) + X_class = safe_indexing(X, target_class_indices) self.svm_estimator_.fit(X, y) support_index = self.svm_estimator_.support_[ y[self.svm_estimator_.support_] == class_sample] - support_vector = X[support_index] + support_vector = safe_indexing(X, support_index) self.nn_m_.fit(X) noise_bool = self._in_danger_noise(support_vector, class_sample, y, kind='noise') - support_vector = support_vector[np.logical_not(noise_bool)] + support_vector = safe_indexing( + support_vector, + np.flatnonzero(np.logical_not(noise_bool))) danger_bool = self._in_danger_noise(support_vector, class_sample, y, kind='danger') safety_bool = np.logical_not(danger_bool) @@ -481,33 +518,48 @@ def _sample_svm(self, X, y): self.nn_k_.fit(X_class) fractions = random_state.beta(10, 10) if np.count_nonzero(danger_bool) > 0: - nns = self.nn_k_.kneighbors(support_vector[danger_bool], + nns = self.nn_k_.kneighbors(safe_indexing( + support_vector, + np.flatnonzero(danger_bool)), return_distance=False)[:, 1:] X_new_1, y_new_1 = self._make_samples( - support_vector[danger_bool], class_sample, X_class, + safe_indexing(support_vector, np.flatnonzero(danger_bool)), + class_sample, X_class, nns, int(fractions * (n_samples + 1)), step_size=1.) 
if np.count_nonzero(safety_bool) > 0: - nns = self.nn_k_.kneighbors(support_vector[safety_bool], - return_distance=False)[:, 1:] + nns = self.nn_k_.kneighbors( + safe_indexing(support_vector, np.flatnonzero(safety_bool)), + return_distance=False)[:, 1:] X_new_2, y_new_2 = self._make_samples( - support_vector[safety_bool], class_sample, X_class, + safe_indexing(support_vector, np.flatnonzero(safety_bool)), + class_sample, X_class, nns, int((1 - fractions) * n_samples), step_size=-self.out_step) if (np.count_nonzero(danger_bool) > 0 and np.count_nonzero(safety_bool) > 0): - X_resampled = np.concatenate((X_resampled, X_new_1, X_new_2), - axis=0) + if sparse.issparse(X_resampled): + X_resampled = sparse.vstack([X_resampled, + X_new_1, X_new_2]) + else: + X_resampled = np.vstack((X_resampled, + X_new_1, X_new_2)) y_resampled = np.concatenate((y_resampled, y_new_1, y_new_2), axis=0) elif np.count_nonzero(danger_bool) == 0: - X_resampled = np.concatenate((X_resampled, X_new_2), axis=0) + if sparse.issparse(X_resampled): + X_resampled = sparse.vstack([X_resampled, X_new_2]) + else: + X_resampled = np.vstack((X_resampled, X_new_2)) y_resampled = np.concatenate((y_resampled, y_new_2), axis=0) elif np.count_nonzero(safety_bool) == 0: - X_resampled = np.concatenate((X_resampled, X_new_1), axis=0) + if sparse.issparse(X_resampled): + X_resampled = sparse.vstack([X_resampled, X_new_1]) + else: + X_resampled = np.vstack((X_resampled, X_new_1)) y_resampled = np.concatenate((y_resampled, y_new_1), axis=0) return X_resampled, y_resampled From bcf44ab3e3d22a3660ba59e50ec2e7933b8036bb Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 14 Aug 2017 00:12:47 +0200 Subject: [PATCH 10/28] EHN support sparse adasyn --- imblearn/over_sampling/adasyn.py | 69 +++++++++++++++------ imblearn/over_sampling/tests/test_adasyn.py | 29 +-------- 2 files changed, 54 insertions(+), 44 deletions(-) diff --git a/imblearn/over_sampling/adasyn.py b/imblearn/over_sampling/adasyn.py index 3f16d0d53..e5c439b61 100644 --- a/imblearn/over_sampling/adasyn.py +++ b/imblearn/over_sampling/adasyn.py @@ -7,7 +7,9 @@ from __future__ import division import numpy as np -from sklearn.utils import check_random_state +from scipy import sparse + +from sklearn.utils import check_random_state, safe_indexing from .base import BaseOverSampler from ..utils import check_neighbors_object @@ -154,7 +156,8 @@ def _sample(self, X, y): for class_sample, n_samples in self.ratio_.items(): if n_samples == 0: continue - X_class = X[y == class_sample] + target_class_indices = np.flatnonzero(y == class_sample) + X_class = safe_indexing(X, target_class_indices) self.nn_.fit(X) _, nn_index = self.nn_.kneighbors(X_class) @@ -171,27 +174,57 @@ def _sample(self, X, y): ' Use SMOTE instead.') ratio_nn /= np.sum(ratio_nn) n_samples_generate = np.rint(ratio_nn * n_samples).astype(int) + if not np.sum(n_samples_generate): + raise ValueError("No samples will be generated with the" + " provided ratio settings.") # the nearest neighbors need to be fitted only on the current class # to find the class NN to generate new samples self.nn_.fit(X_class) _, nn_index = self.nn_.kneighbors(X_class) - x_class_gen = [] - for x_i, x_i_nn, num_sample_i in zip(X_class, nn_index, - n_samples_generate): - if num_sample_i == 0: - continue - nn_zs = random_state.randint( - 1, high=self.nn_.n_neighbors, size=num_sample_i) - steps = random_state.uniform(size=len(nn_zs)) - x_class_gen.append([x_i + step * (X[x_i_nn[nn_z], :] - x_i) - for step, nn_z in zip(steps, nn_zs)]) - - if 
len(x_class_gen) > 0: - X_resampled = np.vstack((X_resampled, - np.concatenate(x_class_gen))) - y_resampled = np.hstack((y_resampled, [class_sample] * - np.sum(n_samples_generate))) + if sparse.issparse(X): + row_indices, col_indices, samples = [], [], [] + n_samples_generated = 0 + for x_i, x_i_nn, num_sample_i in zip(X_class, nn_index, + n_samples_generate): + if num_sample_i == 0: + continue + nn_zs = random_state.randint( + 1, high=self.nn_.n_neighbors, size=num_sample_i) + steps = random_state.uniform(size=len(nn_zs)) + if x_i.nnz: + for step, nn_z in zip(steps, nn_zs): + sample = x_i + step * (X[x_i_nn[nn_z], :] - x_i) + row_indices += ([n_samples_generated] * + len(sample.indices)) + col_indices += sample.indices.tolist() + samples += sample.data.tolist() + n_samples_generated += 1 + X_new = (sparse.csr_matrix((samples, + (row_indices, col_indices)), + [np.sum(n_samples_generate), + X.shape[1]])) + y_new = np.array([class_sample] * np.sum(n_samples_generate)) + else: + x_class_gen = [] + for x_i, x_i_nn, num_sample_i in zip(X_class, nn_index, + n_samples_generate): + if num_sample_i == 0: + continue + nn_zs = random_state.randint( + 1, high=self.nn_.n_neighbors, size=num_sample_i) + steps = random_state.uniform(size=len(nn_zs)) + x_class_gen.append([x_i + step * (X[x_i_nn[nn_z], :] - x_i) + for step, nn_z in zip(steps, nn_zs)]) + + X_new = np.concatenate(x_class_gen) + y_new = np.array([class_sample] * np.sum(n_samples_generate)) + + if sparse.issparse(X_new): + X_resampled = sparse.vstack([X_resampled, X_new]) + else: + X_resampled = np.vstack((X_resampled, X_new)) + y_resampled = np.hstack((y_resampled, y_new)) return X_resampled, y_resampled diff --git a/imblearn/over_sampling/tests/test_adasyn.py b/imblearn/over_sampling/tests/test_adasyn.py index eb68dd06c..663c60af2 100644 --- a/imblearn/over_sampling/tests/test_adasyn.py +++ b/imblearn/over_sampling/tests/test_adasyn.py @@ -73,34 +73,11 @@ def test_ada_fit_sample(): assert_array_equal(y_resampled, y_gt) -def test_ada_fit_sample_half(): +def test_ada_fit_ratio_error(): ratio = 0.8 ada = ADASYN(ratio=ratio, random_state=RND_SEED) - X_resampled, y_resampled = ada.fit_sample(X, Y) - X_gt = np.array([[0.11622591, -0.0317206], - [0.77481731, 0.60935141], - [1.25192108, -0.22367336], - [0.53366841, -0.30312976], - [1.52091956, -0.49283504], - [-0.28162401, -2.10400981], - [0.83680821, 1.72827342], - [0.3084254, 0.33299982], - [0.70472253, -0.73309052], - [0.28893132, -0.38761769], - [1.15514042, 0.0129463], - [0.88407872, 0.35454207], - [1.31301027, -0.92648734], - [-1.11515198, -0.93689695], - [-0.18410027, -0.45194484], - [0.9281014, 0.53085498], - [-0.14374509, 0.27370049], - [-0.41635887, -0.38299653], - [0.08711622, 0.93259929], - [1.70580611, -0.11219234]]) - y_gt = np.array( - [0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0]) - assert_allclose(X_resampled, X_gt, rtol=R_TOL) - assert_array_equal(y_resampled, y_gt) + assert_raises_regex(ValueError, "No samples will be generated.", + ada.fit_sample, X, Y) def test_ada_fit_sample_nn_obj(): From c405aa997c1bc299b7454e6a22cc9d760b216e28 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 14 Aug 2017 00:19:09 +0200 Subject: [PATCH 11/28] EHN support sparsity for sombine methods --- imblearn/combine/smote_enn.py | 2 +- imblearn/combine/smote_tomek.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/imblearn/combine/smote_enn.py b/imblearn/combine/smote_enn.py index 32ce1e49d..7a779e127 100644 --- a/imblearn/combine/smote_enn.py +++ 
b/imblearn/combine/smote_enn.py @@ -293,7 +293,7 @@ def fit(self, X, y): Return self. """ - X, y = check_X_y(X, y) + X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) y = check_target_type(y) self.ratio_ = self.ratio self.X_hash_, self.y_hash_ = hash_X_y(X, y) diff --git a/imblearn/combine/smote_tomek.py b/imblearn/combine/smote_tomek.py index 08c9f20fd..af55f8b4f 100644 --- a/imblearn/combine/smote_tomek.py +++ b/imblearn/combine/smote_tomek.py @@ -244,7 +244,7 @@ def fit(self, X, y): Return self. """ - X, y = check_X_y(X, y) + X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) y = check_target_type(y) self.ratio_ = self.ratio self.X_hash_, self.y_hash_ = hash_X_y(X, y) From 79637d78ec439ea69161320a80c96ad72bb04b0d Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 14 Aug 2017 01:48:22 +0200 Subject: [PATCH 12/28] EHN support sparsity BC --- imblearn/ensemble/balance_cascade.py | 60 ++++++++++------------------ 1 file changed, 21 insertions(+), 39 deletions(-) diff --git a/imblearn/ensemble/balance_cascade.py b/imblearn/ensemble/balance_cascade.py index f88c873ed..8adda9004 100644 --- a/imblearn/ensemble/balance_cascade.py +++ b/imblearn/ensemble/balance_cascade.py @@ -12,7 +12,7 @@ from sklearn.base import ClassifierMixin from sklearn.neighbors import KNeighborsClassifier -from sklearn.utils import check_random_state +from sklearn.utils import check_random_state, safe_indexing from sklearn.externals.six import string_types from sklearn.model_selection import cross_val_predict @@ -249,22 +249,16 @@ def _sample(self, X, y): samples_mask = np.ones(y.shape, dtype=bool) # where the different set will be stored - X_resampled = [] - y_resampled = [] idx_under = [] n_subsets = 0 b_subset_search = True while b_subset_search: - target_stats = Counter(y[samples_mask]) - # build the data set to be classified - X_subset = np.empty((0, X.shape[1]), dtype=X.dtype) - y_subset = np.empty((0, ), dtype=y.dtype) + target_stats = Counter(safe_indexing( + y, np.flatnonzero(samples_mask))) # store the index of the data to under-sample index_under_sample = np.empty((0, ), dtype=y.dtype) # value which will be picked at each round - X_constant = np.empty((0, X.shape[1]), dtype=X.dtype) - y_constant = np.empty((0, ), dtype=y.dtype) index_constant = np.empty((0, ), dtype=y.dtype) for target_class in target_stats.keys(): if target_class in self.ratio_.keys(): @@ -274,29 +268,15 @@ def _sample(self, X, y): index_class = np.flatnonzero(y == target_class) index_class_interest = index_class[samples_mask[ y == target_class]] - X_class = X[index_class_interest] - y_class = y[index_class_interest] + y_class = safe_indexing(y, index_class_interest) # select randomly the desired features index_target_class = random_state.choice( range(y_class.size), size=n_samples, replace=False) - X_subset = np.concatenate((X_subset, - X_class[index_target_class]), - axis=0) - y_subset = np.concatenate((y_subset, - y_class[index_target_class]), - axis=0) - # index of the data index_under_sample = np.concatenate( (index_under_sample, index_class_interest[index_target_class]), axis=0) else: - X_constant = np.concatenate((X_constant, - X[y == target_class]), - axis=0) - y_constant = np.concatenate((y_constant, - y[y == target_class]), - axis=0) index_constant = np.concatenate( (index_constant, np.flatnonzero(y == target_class)), @@ -304,23 +284,18 @@ def _sample(self, X, y): # store the set created n_subsets += 1 - X_resampled.append(np.concatenate((X_subset, X_constant), - axis=0)) - y_resampled.append(np.concatenate((y_subset, 
y_constant), - axis=0)) - idx_under.append(np.concatenate((index_under_sample, - index_constant), - axis=0)) + subset_indices = np.concatenate((index_under_sample, + index_constant), axis=0) + idx_under.append(subset_indices) # fit and predict using cross validation - pred = cross_val_predict(self.estimator_, - np.concatenate((X_subset, X_constant), - axis=0), - np.concatenate((y_subset, y_constant), - axis=0)) + X_subset = safe_indexing(X, subset_indices) + y_subset = safe_indexing(y, subset_indices) + pred = cross_val_predict(self.estimator_, X_subset, y_subset) # extract the prediction about the targeted classes only - pred_target = pred[:y_subset.size] - index_classified = index_under_sample[pred_target == y_subset] + pred_target = pred[:index_under_sample.size] + index_classified = index_under_sample[ + pred_target == y_subset[:index_under_sample.size]] samples_mask[index_classified] = False # check the stopping criterion @@ -328,11 +303,18 @@ def _sample(self, X, y): if n_subsets == self.n_max_subset: b_subset_search = False # check that there is enough samples for another round - target_stats = Counter(y[samples_mask]) + target_stats = Counter(safe_indexing( + y, np.flatnonzero(samples_mask))) for target_class in self.ratio_.keys(): + print(target_stats[target_class], self.ratio_[target_class]) if target_stats[target_class] < self.ratio_[target_class]: b_subset_search = False + X_resampled, y_resampled = [], [] + for indices in idx_under: + X_resampled.append(safe_indexing(X, indices)) + y_resampled.append(safe_indexing(y, indices)) + if self.return_indices: return (np.array(X_resampled), np.array(y_resampled), np.array(idx_under)) From c199af9b8575203a93a62d007e73e514b7745a69 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 14 Aug 2017 02:47:00 +0200 Subject: [PATCH 13/28] DOC update docstring --- imblearn/base.py | 25 +++++---- imblearn/combine/smote_enn.py | 11 ++-- imblearn/combine/smote_tomek.py | 13 +++-- imblearn/ensemble/balance_cascade.py | 12 ++-- imblearn/ensemble/easy_ensemble.py | 7 ++- imblearn/over_sampling/adasyn.py | 10 ++-- imblearn/over_sampling/base.py | 56 ------------------- imblearn/over_sampling/random_over_sampler.py | 9 +-- imblearn/over_sampling/smote.py | 54 +++++++++--------- .../prototype_generation/cluster_centroids.py | 41 ++++++++------ .../condensed_nearest_neighbour.py | 9 +-- .../edited_nearest_neighbours.py | 27 +++++---- .../instance_hardness_threshold.py | 9 +-- .../prototype_selection/nearmiss.py | 21 +++---- .../neighbourhood_cleaning_rule.py | 9 +-- .../random_under_sampler.py | 11 ++-- .../prototype_selection/tomek_links.py | 9 +-- 17 files changed, 150 insertions(+), 183 deletions(-) diff --git a/imblearn/base.py b/imblearn/base.py index 08b1b6adf..05d79f35a 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -38,7 +38,7 @@ def sample(self, X, y): Parameters ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. y : array-like, shape (n_samples,) @@ -46,11 +46,11 @@ def sample(self, X, y): Returns ------- - X_resampled : {array-like, sparse matrix}, shape \ + X_resampled : {ndarray, sparse matrix}, shape \ (n_samples_new, n_features) The array containing the resampled data. 
- y_resampled : array-like, shape (n_samples_new) + y_resampled : ndarray, shape (n_samples_new) The corresponding label of `X_resampled` """ @@ -68,18 +68,19 @@ def fit_sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples,) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {array-like, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. - y_resampled : ndarray, shape (n_samples_new,) + y_resampled : array-like, shape (n_samples_new,) The corresponding label of `X_resampled` """ @@ -92,19 +93,21 @@ def _sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. - y_resampled : ndarray, shape (n_samples_new) + y_resampled : ndarray, shape (n_samples_new,) The corresponding label of `X_resampled` + """ pass diff --git a/imblearn/combine/smote_enn.py b/imblearn/combine/smote_enn.py index 7a779e127..e1e094c32 100644 --- a/imblearn/combine/smote_enn.py +++ b/imblearn/combine/smote_enn.py @@ -281,10 +281,10 @@ def fit(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns @@ -305,15 +305,16 @@ def _sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. y_resampled : ndarray, shape (n_samples_new) diff --git a/imblearn/combine/smote_tomek.py b/imblearn/combine/smote_tomek.py index af55f8b4f..82821df0c 100644 --- a/imblearn/combine/smote_tomek.py +++ b/imblearn/combine/smote_tomek.py @@ -232,10 +232,10 @@ def fit(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns @@ -256,18 +256,19 @@ def _sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. 
Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. - y_resampled : ndarray, shape (n_samples_new) + y_resampled : ndarray, shape (n_samples_new,) The corresponding label of `X_resampled` """ diff --git a/imblearn/ensemble/balance_cascade.py b/imblearn/ensemble/balance_cascade.py index 8adda9004..274b86759 100644 --- a/imblearn/ensemble/balance_cascade.py +++ b/imblearn/ensemble/balance_cascade.py @@ -149,10 +149,10 @@ def fit(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns @@ -222,15 +222,16 @@ def _sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_subset, n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_subset, n_samples_new, n_features) The array containing the resampled data. y_resampled : ndarray, shape (n_subset, n_samples_new) @@ -306,7 +307,6 @@ def _sample(self, X, y): target_stats = Counter(safe_indexing( y, np.flatnonzero(samples_mask))) for target_class in self.ratio_.keys(): - print(target_stats[target_class], self.ratio_[target_class]) if target_stats[target_class] < self.ratio_[target_class]: b_subset_search = False diff --git a/imblearn/ensemble/easy_ensemble.py b/imblearn/ensemble/easy_ensemble.py index 9a3fff860..5fc018167 100644 --- a/imblearn/ensemble/easy_ensemble.py +++ b/imblearn/ensemble/easy_ensemble.py @@ -112,15 +112,16 @@ def _sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_subset, n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_subset, n_samples_new, n_features) The array containing the resampled data. y_resampled : ndarray, shape (n_subset, n_samples_new) diff --git a/imblearn/over_sampling/adasyn.py b/imblearn/over_sampling/adasyn.py index e5c439b61..e15bfa62b 100644 --- a/imblearn/over_sampling/adasyn.py +++ b/imblearn/over_sampling/adasyn.py @@ -132,20 +132,22 @@ def _sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. 
- y_resampled : ndarray, shape (n_samples_new) + y_resampled : ndarray, shape (n_samples_new,) The corresponding label of `X_resampled` + """ self._validate_estimator() random_state = check_random_state(self.random_state) diff --git a/imblearn/over_sampling/base.py b/imblearn/over_sampling/base.py index 50efc6e74..883fd9be2 100644 --- a/imblearn/over_sampling/base.py +++ b/imblearn/over_sampling/base.py @@ -18,59 +18,3 @@ class BaseOverSampler(BaseSampler): """ _sampling_type = 'over-sampling' - - # def fit(self, X, y): - # """Find the classes statistics before to perform sampling. - - # Parameters - # ---------- - # X : array-like, shape (n_samples, n_features) - # Matrix containing the data which have to be sampled. - - # y : array-like, shape (n_samples,) - # Corresponding label for each sample in X. - - # Returns - # ------- - # self : object, - # Return self. - - # Notes - # ----- - # Over-samplers do not accept sparse matrices. - - # """ - # # over-sampling method does not handle sparse matrix - # X, y = check_X_y(X, y) - - # return super(BaseOverSampler, self).fit(X, y) - - # def sample(self, X, y): - # """Resample the dataset. - - # Parameters - # ---------- - # X : array-like, shape (n_samples, n_features) - # Matrix containing the data which have to be sampled. - - # y : array-like, shape (n_samples,) - # Corresponding label for each sample in X. - - # Returns - # ------- - # X_resampled : array-like, shape (n_samples_new, n_features) - # The array containing the resampled data. - - # y_resampled : array-like, shape (n_samples_new,) - # The corresponding label of `X_resampled` - - # Notes - # ----- - # Over-samplers do not accept sparse matrices. - - # """ - - # # Check the consistency of X and y - # X, y = check_X_y(X, y) - - # return super(BaseOverSampler, self).sample(X, y) diff --git a/imblearn/over_sampling/random_over_sampler.py b/imblearn/over_sampling/random_over_sampler.py index d70b45d32..271f1f6e8 100644 --- a/imblearn/over_sampling/random_over_sampler.py +++ b/imblearn/over_sampling/random_over_sampler.py @@ -84,18 +84,19 @@ def _sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. - y_resampled : ndarray, shape (n_samples_new) + y_resampled : ndarray, shape (n_samples_new,) The corresponding label of `X_resampled` """ diff --git a/imblearn/over_sampling/smote.py b/imblearn/over_sampling/smote.py index c1a8d7477..fabe63b42 100644 --- a/imblearn/over_sampling/smote.py +++ b/imblearn/over_sampling/smote.py @@ -178,13 +178,13 @@ def _in_danger_noise(self, samples, target_class, y, kind='danger'): Parameters ---------- - samples : ndarray, shape (n_samples, n_features) + samples : {array-like, sparse matrix}, shape (n_samples, n_features) The samples to check if either they are in danger or not. target_class : int or str, The target corresponding class being over-sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) The true label in order to check the neighbour labels. 
kind : str, optional (default='danger') @@ -195,7 +195,7 @@ def _in_danger_noise(self, samples, target_class, y, kind='danger'): Returns ------- - output : ndarray, shape (n_samples, ) + output : ndarray, shape (n_samples,) A boolean array where True refer to samples in danger or noise. """ @@ -226,7 +226,7 @@ def _make_samples(self, Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Points from which the points will be created. y_type : str or int @@ -248,10 +248,10 @@ def _make_samples(self, Returns ------- - X_new : ndarray, shape (n_samples_new, n_features) + X_new : {ndarray, sparse matrix}, shape (n_samples_new, n_features) Synthetically generated samples. - y_new : ndarray, shape (n_samples_new, ) + y_new : ndarray, shape (n_samples_new,) Target values for synthetic samples. """ @@ -324,19 +324,20 @@ def _sample_regular(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. - y_resampled : ndarray, shape (n_samples_new) - The corresponding label of `X_resampled`. + y_resampled : ndarray, shape (n_samples_new,) + The corresponding label of `X_resampled` References ---------- @@ -378,19 +379,20 @@ def _sample_borderline(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. - y_resampled : ndarray, shape (n_samples_new) - The corresponding label of `X_resampled`. + y_resampled : ndarray, shape (n_samples_new,) + The corresponding label of `X_resampled` References ---------- @@ -469,19 +471,20 @@ def _sample_svm(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. - y_resampled : ndarray, shape (n_samples_new) - The corresponding label of `X_resampled`. + y_resampled : ndarray, shape (n_samples_new,) + The corresponding label of `X_resampled` References ---------- @@ -569,18 +572,19 @@ def _sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. 
Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. - y_resampled : ndarray, shape (n_samples_new) + y_resampled : ndarray, shape (n_samples_new,) The corresponding label of `X_resampled` """ diff --git a/imblearn/under_sampling/prototype_generation/cluster_centroids.py b/imblearn/under_sampling/prototype_generation/cluster_centroids.py index 0eef20cde..50eb14181 100644 --- a/imblearn/under_sampling/prototype_generation/cluster_centroids.py +++ b/imblearn/under_sampling/prototype_generation/cluster_centroids.py @@ -9,7 +9,10 @@ from __future__ import division, print_function import numpy as np +from scipy import sparse + from sklearn.cluster import KMeans +from sklearn.utils import safe_indexing from ..base import BaseUnderSampler @@ -109,42 +112,46 @@ def _sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. - y_resampled : ndarray, shape (n_samples_new) + y_resampled : ndarray, shape (n_samples_new,) The corresponding label of `X_resampled` """ self._validate_estimator() - X_resampled = np.empty((0, X.shape[1]), dtype=X.dtype) - y_resampled = np.empty((0, ), dtype=y.dtype) - + idx_under = np.empty((0, ), dtype=int) + centroids, y_resampled = [], [] for target_class in np.unique(y): if target_class in self.ratio_.keys(): n_samples = self.ratio_[target_class] self.estimator_.set_params(**{'n_clusters': n_samples}) self.estimator_.fit(X[y == target_class]) - centroids = self.estimator_.cluster_centers_ + centroids.append(self.estimator_.cluster_centers_) + y_resampled += [target_class] * n_samples - X_resampled = np.concatenate((X_resampled, centroids), axis=0) - y_resampled = np.concatenate( - (y_resampled, np.array([target_class] * n_samples)), - axis=0) else: + target_class_indices = np.flatnonzero(y == target_class) + idx_under = np.concatenate( + (idx_under, target_class_indices), axis=0) + y_resampled += [target_class] * target_class_indices.size - X_resampled = np.concatenate( - (X_resampled, X[y == target_class]), axis=0) - y_resampled = np.concatenate( - (y_resampled, y[y == target_class]), axis=0) + X_resampled = np.concatenate((centroids)) + + if sparse.issparse(X): + X_resampled = sparse.vstack([sparse.csr_matrix(X_resampled), + safe_indexing(X, idx_under)]) + else: + X_resampled = np.vstack((X_resampled, safe_indexing(X, idx_under))) - return X_resampled, y_resampled + return X_resampled, np.array(y_resampled) diff --git a/imblearn/under_sampling/prototype_selection/condensed_nearest_neighbour.py b/imblearn/under_sampling/prototype_selection/condensed_nearest_neighbour.py index c3f743647..1d03eba9a 100644 --- a/imblearn/under_sampling/prototype_selection/condensed_nearest_neighbour.py +++ b/imblearn/under_sampling/prototype_selection/condensed_nearest_neighbour.py @@ -158,18 +158,19 @@ def _sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be 
sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. - y_resampled : ndarray, shape (n_samples_new) + y_resampled : ndarray, shape (n_samples_new,) The corresponding label of `X_resampled` idx_under : ndarray, shape (n_samples, ) diff --git a/imblearn/under_sampling/prototype_selection/edited_nearest_neighbours.py b/imblearn/under_sampling/prototype_selection/edited_nearest_neighbours.py index a20a18b70..87c4cb250 100644 --- a/imblearn/under_sampling/prototype_selection/edited_nearest_neighbours.py +++ b/imblearn/under_sampling/prototype_selection/edited_nearest_neighbours.py @@ -161,18 +161,19 @@ def _sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. - y_resampled : ndarray, shape (n_samples_new) + y_resampled : ndarray, shape (n_samples_new,) The corresponding label of `X_resampled` idx_under : ndarray, shape (n_samples, ) @@ -362,18 +363,19 @@ def _sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. - y_resampled : ndarray, shape (n_samples_new) + y_resampled : ndarray, shape (n_samples_new,) The corresponding label of `X_resampled` idx_under : ndarray, shape (n_samples, ) @@ -585,18 +587,19 @@ def _sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. 
- y_resampled : ndarray, shape (n_samples_new) + y_resampled : ndarray, shape (n_samples_new,) The corresponding label of `X_resampled` idx_under : ndarray, shape (n_samples, ) diff --git a/imblearn/under_sampling/prototype_selection/instance_hardness_threshold.py b/imblearn/under_sampling/prototype_selection/instance_hardness_threshold.py index a7f8199f5..637323164 100644 --- a/imblearn/under_sampling/prototype_selection/instance_hardness_threshold.py +++ b/imblearn/under_sampling/prototype_selection/instance_hardness_threshold.py @@ -204,18 +204,19 @@ def _sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. - y_resampled : ndarray, shape (n_samples_new) + y_resampled : ndarray, shape (n_samples_new,) The corresponding label of `X_resampled` idx_under : ndarray, shape (n_samples, ) diff --git a/imblearn/under_sampling/prototype_selection/nearmiss.py b/imblearn/under_sampling/prototype_selection/nearmiss.py index 3475d3987..4a5475317 100644 --- a/imblearn/under_sampling/prototype_selection/nearmiss.py +++ b/imblearn/under_sampling/prototype_selection/nearmiss.py @@ -156,10 +156,10 @@ def _selection_dist_based(self, Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Original samples. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Associated label to X. dist_vec : ndarray, shape (n_samples, ) @@ -176,13 +176,7 @@ def _selection_dist_based(self, Returns ------- - X_sel : ndarray, shape (num_samples, n_features) - Selected samples. - - y_sel : ndarray, shape (num_samples, ) - The associated label. - - idx_sel : ndarray, shape (num_samples, ) + idx_sel : ndarray, shape (num_samples,) The list of the indices of the selected samples. """ @@ -247,18 +241,19 @@ def _sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. - y_resampled : ndarray, shape (n_samples_new) + y_resampled : ndarray, shape (n_samples_new,) The corresponding label of `X_resampled` idx_under : ndarray, shape (n_samples, ) diff --git a/imblearn/under_sampling/prototype_selection/neighbourhood_cleaning_rule.py b/imblearn/under_sampling/prototype_selection/neighbourhood_cleaning_rule.py index 7bd31bb71..e9f16e6a8 100644 --- a/imblearn/under_sampling/prototype_selection/neighbourhood_cleaning_rule.py +++ b/imblearn/under_sampling/prototype_selection/neighbourhood_cleaning_rule.py @@ -148,18 +148,19 @@ def _sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. 
- y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. - y_resampled : ndarray, shape (n_samples_new) + y_resampled : ndarray, shape (n_samples_new,) The corresponding label of `X_resampled` idx_under : ndarray, shape (n_samples, ) diff --git a/imblearn/under_sampling/prototype_selection/random_under_sampler.py b/imblearn/under_sampling/prototype_selection/random_under_sampler.py index faeb0c9f2..e7a209fdd 100644 --- a/imblearn/under_sampling/prototype_selection/random_under_sampler.py +++ b/imblearn/under_sampling/prototype_selection/random_under_sampler.py @@ -94,18 +94,19 @@ def _sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) - Matrix containing the data to be sampled. + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. - y_resampled : ndarray, shape (n_samples_new) + y_resampled : ndarray, shape (n_samples_new,) The corresponding label of `X_resampled` idx_under : ndarray, shape (n_samples, ) diff --git a/imblearn/under_sampling/prototype_selection/tomek_links.py b/imblearn/under_sampling/prototype_selection/tomek_links.py index 8d8a50067..91b99f03b 100644 --- a/imblearn/under_sampling/prototype_selection/tomek_links.py +++ b/imblearn/under_sampling/prototype_selection/tomek_links.py @@ -144,18 +144,19 @@ def _sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. 
- y_resampled : ndarray, shape (n_samples_new) + y_resampled : ndarray, shape (n_samples_new,) The corresponding label of `X_resampled` idx_under : ndarray, shape (n_samples, ) From 425928f3c0052b8a25f0d2049396a4959aac1728 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 14 Aug 2017 02:57:46 +0200 Subject: [PATCH 14/28] DOC fix example topic classification --- examples/applications/plot_topic_classication.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/examples/applications/plot_topic_classication.py b/examples/applications/plot_topic_classication.py index 90e48f0c3..e0af19ccf 100644 --- a/examples/applications/plot_topic_classication.py +++ b/examples/applications/plot_topic_classication.py @@ -16,7 +16,6 @@ from collections import Counter from sklearn.datasets import fetch_20newsgroups -from sklearn.preprocessing import FunctionTransformer from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.naive_bayes import MultinomialNB from sklearn.pipeline import make_pipeline @@ -82,22 +81,10 @@ # use a ``RandomUnderSampler`` to equalize the number of samples in all the # classes before the training. # -# Currently, imbalanced-learn does not handle sparse matrices --- we are -# currently working on bringing this feature --- and an additional transformer -# to convert the sparse to dense matrices is required in the pipeline. -# # It is also important to note that we are using the ``make_pipeline`` function # implemented in imbalanced-learn to properly handle the samplers. - -def densify(X): - """Function to densify an array.""" - return X.toarray() - - pipe = make_pipeline_imb(TfidfVectorizer(), - FunctionTransformer(func=densify, - accept_sparse=True), RandomUnderSampler(), MultinomialNB()) From 4ba8c4e93b476ac41164d2b0164bc0523cefe295 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 14 Aug 2017 12:31:17 +0200 Subject: [PATCH 15/28] FIX fix test and class clustercentroids --- .../prototype_generation/cluster_centroids.py | 2 +- .../tests/test_cluster_centroids.py | 38 ++++++++++++------- 2 files changed, 25 insertions(+), 15 deletions(-) diff --git a/imblearn/under_sampling/prototype_generation/cluster_centroids.py b/imblearn/under_sampling/prototype_generation/cluster_centroids.py index 50eb14181..5404ea892 100644 --- a/imblearn/under_sampling/prototype_generation/cluster_centroids.py +++ b/imblearn/under_sampling/prototype_generation/cluster_centroids.py @@ -144,7 +144,6 @@ def _sample(self, X, y): target_class_indices = np.flatnonzero(y == target_class) idx_under = np.concatenate( (idx_under, target_class_indices), axis=0) - y_resampled += [target_class] * target_class_indices.size X_resampled = np.concatenate((centroids)) @@ -153,5 +152,6 @@ def _sample(self, X, y): safe_indexing(X, idx_under)]) else: X_resampled = np.vstack((X_resampled, safe_indexing(X, idx_under))) + y_resampled = np.hstack((y_resampled, safe_indexing(y, idx_under))) return X_resampled, np.array(y_resampled) diff --git a/imblearn/under_sampling/prototype_generation/tests/test_cluster_centroids.py b/imblearn/under_sampling/prototype_generation/tests/test_cluster_centroids.py index f3d73e67a..d94a8070b 100644 --- a/imblearn/under_sampling/prototype_generation/tests/test_cluster_centroids.py +++ b/imblearn/under_sampling/prototype_generation/tests/test_cluster_centroids.py @@ -24,10 +24,13 @@ def test_fit_sample_auto(): ratio = 'auto' cc = ClusterCentroids(ratio=ratio, random_state=RND_SEED) X_resampled, y_resampled = cc.fit_sample(X, Y) - X_gt = 
np.array([[0.92923648, 0.76103773], [0.47104475, 0.44386323], - [0.13347175, 0.12167502], [0.06738818, -0.529627], - [0.17901516, 0.69860992], [0.094035, -2.55298982]]) - y_gt = np.array([0, 0, 0, 1, 1, 1]) + X_gt = np.array([[0.06738818, -0.529627], + [0.17901516, 0.69860992], + [0.094035, -2.55298982], + [0.92923648, 0.76103773], + [0.47104475, 0.44386323], + [0.13347175, 0.12167502]]) + y_gt = np.array([1, 1, 1, 0, 0, 0]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) @@ -36,12 +39,16 @@ def test_fit_sample_half(): ratio = .5 cc = ClusterCentroids(ratio=ratio, random_state=RND_SEED) X_resampled, y_resampled = cc.fit_sample(X, Y) - X_gt = np.array([[0.92923648, 0.76103773], [0.47104475, 0.44386323], - [0.13347175, 0.12167502], [0.09125309, -0.85409574], - [0.19220316, 0.32337101], [0.094035, -2.55298982], - [0.20792588, 1.49407907], [0.04352327, -0.20515826], - [0.12372842, 0.6536186]]) - y_gt = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1]) + X_gt = np.array([[0.09125309, -0.85409574], + [0.19220316, 0.32337101], + [0.094035, -2.55298982], + [0.20792588, 1.49407907], + [0.04352327, -0.20515826], + [0.12372842, 0.6536186], + [0.92923648, 0.76103773], + [0.47104475, 0.44386323], + [0.13347175, 0.12167502]]) + y_gt = np.array([1, 1, 1, 1, 1, 1, 0, 0, 0]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) @@ -65,10 +72,13 @@ def test_fit_sample_object(): ratio=ratio, random_state=RND_SEED, estimator=cluster) X_resampled, y_resampled = cc.fit_sample(X, Y) - X_gt = np.array([[0.92923648, 0.76103773], [0.47104475, 0.44386323], - [0.13347175, 0.12167502], [0.06738818, -0.529627], - [0.17901516, 0.69860992], [0.094035, -2.55298982]]) - y_gt = np.array([0, 0, 0, 1, 1, 1]) + X_gt = np.array([[0.06738818, -0.529627], + [0.17901516, 0.69860992], + [0.094035, -2.55298982], + [0.92923648, 0.76103773], + [0.47104475, 0.44386323], + [0.13347175, 0.12167502]]) + y_gt = np.array([1, 1, 1, 0, 0, 0]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) From 8298fdce2c725aaaf7b9466d6ece9b49e73c1b80 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 14 Aug 2017 13:02:21 +0200 Subject: [PATCH 16/28] TST add common test --- .../prototype_generation/cluster_centroids.py | 3 +- imblearn/utils/estimator_checks.py | 42 ++++++++++++++++++- 2 files changed, 43 insertions(+), 2 deletions(-) diff --git a/imblearn/under_sampling/prototype_generation/cluster_centroids.py b/imblearn/under_sampling/prototype_generation/cluster_centroids.py index 5404ea892..0cfebb193 100644 --- a/imblearn/under_sampling/prototype_generation/cluster_centroids.py +++ b/imblearn/under_sampling/prototype_generation/cluster_centroids.py @@ -82,7 +82,8 @@ class ClusterCentroids(BaseUnderSampler): >>> cc = ClusterCentroids(random_state=42) >>> X_res, y_res = cc.fit_sample(X, y) >>> print('Resampled dataset shape {}'.format(Counter(y_res))) - Resampled dataset shape Counter({0: 100, 1: 100}) + ... 
# doctest: +ELLIPSIS + Resampled dataset shape Counter({...}) """ diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index cbc223f13..1bfb86701 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -12,6 +12,7 @@ from collections import Counter import numpy as np +from scipy import sparse from sklearn.datasets import make_classification from sklearn.utils.estimator_checks import _yield_all_checks \ @@ -20,7 +21,8 @@ from sklearn.exceptions import NotFittedError from sklearn.utils.testing import (assert_warns, assert_raises_regex, assert_true, set_random_state, - assert_equal) + assert_equal, assert_allclose_dense_sparse, + SkipTest) from imblearn.base import SamplerMixin from imblearn.over_sampling.base import BaseOverSampler @@ -36,6 +38,8 @@ def _yield_sampler_checks(name, Estimator): yield check_samplers_fit yield check_samplers_fit_sample yield check_samplers_ratio_fit_sample + yield check_samplers_sparse + yield check_samplers_pandas def _yield_all_checks(name, Estimator): @@ -253,3 +257,39 @@ def check_samplers_ratio_fit_sample(name, Sampler): X_res, y_res = sampler.fit_sample(X, y) y_ensemble = y_res[0] assert_equal(target_stats[1], Counter(y_ensemble)[1]) + + +def check_samplers_sparse(name, Sampler): + # check that sparse matrices can be passed through the sampler leading to + # the same results than dense + X, y = make_classification(n_samples=1000, n_classes=3, + n_informative=4, weights=[0.2, 0.3, 0.5], + random_state=0) + X_sparse = sparse.csr_matrix(X) + sampler = Sampler(random_state=0) + if not isinstance(sampler, BaseEnsembleSampler): + X_res_sparse, y_res_sparse = sampler.fit_sample(X_sparse, y) + assert_true(sparse.issparse(X_res_sparse)) + X_res, y_res = sampler.fit_sample(X, y) + assert_allclose_dense_sparse(X_res_sparse.A, X_res) + assert_allclose_dense_sparse(y_res_sparse, y_res) + + +def check_samplers_pandas(name, Sampler): + # Check that the samplers handle pandas dataframe and pandas series + X, y = make_classification(n_samples=1000, n_classes=3, + n_informative=4, weights=[0.2, 0.3, 0.5], + random_state=0) + try: + import pandas as pd + X_pd, y_pd = pd.DataFrame(X), pd.Series(y) + sampler = Sampler(random_state=0) + X_res_pd, y_res_pd = sampler.fit_sample(X_pd, y_pd) + X_res, y_res = sampler.fit_sample(X, y) + assert_allclose_dense_sparse(X_res_pd, X_res) + assert_allclose_dense_sparse(y_res_pd, y_res) + + except ImportError: + raise SkipTest("pandas is not installed: not testing for " + "input of type pandas.DataFrame / pandas.Series as" + " input.") From e4c6ebbb0cc9103e0bbde326a45a12772ab9fda6 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 14 Aug 2017 13:33:32 +0200 Subject: [PATCH 17/28] TST add ensemble --- appveyor.yml | 2 +- build_tools/travis/install.sh | 2 +- imblearn/utils/estimator_checks.py | 9 +++++++-- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index 5616316e5..b1c62ffe8 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -36,7 +36,7 @@ install: - "python -c \"import struct; print(struct.calcsize('P') * 8)\"" # Installed prebuilt dependencies from conda - - "conda install pip numpy scipy scikit-learn=0.19.0 nose wheel matplotlib -y -q" + - "conda install pip numpy scipy scikit-learn=0.19.0 pandas nose wheel matplotlib -y -q" # Install other nilearn dependencies - "pip install coverage nose-timer" diff --git a/build_tools/travis/install.sh b/build_tools/travis/install.sh index 843aa5088..1179ddaf9 100755 --- 
a/build_tools/travis/install.sh +++ b/build_tools/travis/install.sh @@ -38,7 +38,7 @@ if [[ "$DISTRIB" == "conda" ]]; then # provided versions conda create -n testenv --yes python=$PYTHON_VERSION pip source activate testenv - conda install --yes numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION + conda install --yes numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION pandas if [[ "$SKLEARN_VERSION" == "master" ]]; then conda install --yes cython diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index 1bfb86701..91b78670e 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -267,12 +267,17 @@ def check_samplers_sparse(name, Sampler): random_state=0) X_sparse = sparse.csr_matrix(X) sampler = Sampler(random_state=0) + X_res_sparse, y_res_sparse = sampler.fit_sample(X_sparse, y) + X_res, y_res = sampler.fit_sample(X, y) if not isinstance(sampler, BaseEnsembleSampler): - X_res_sparse, y_res_sparse = sampler.fit_sample(X_sparse, y) assert_true(sparse.issparse(X_res_sparse)) - X_res, y_res = sampler.fit_sample(X, y) assert_allclose_dense_sparse(X_res_sparse.A, X_res) assert_allclose_dense_sparse(y_res_sparse, y_res) + else: + for x_sp, x, y_sp, y in zip(X_res_sparse, X_res, y_res_sparse, y_res): + assert_true(sparse.issparse(x_sp)) + assert_allclose_dense_sparse(x_sp.A, x, rtol=1e-06, atol=1e-06) + assert_allclose_dense_sparse(y_sp, y) def check_samplers_pandas(name, Sampler): From 1226a91c5951a3e63d603a4f6d2ae77f0de1ad76 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 14 Aug 2017 13:46:29 +0200 Subject: [PATCH 18/28] TST use allclose --- imblearn/utils/estimator_checks.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index 91b78670e..9d218e5a0 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -21,7 +21,7 @@ from sklearn.exceptions import NotFittedError from sklearn.utils.testing import (assert_warns, assert_raises_regex, assert_true, set_random_state, - assert_equal, assert_allclose_dense_sparse, + assert_equal, assert_allclose, SkipTest) from imblearn.base import SamplerMixin @@ -271,13 +271,13 @@ def check_samplers_sparse(name, Sampler): X_res, y_res = sampler.fit_sample(X, y) if not isinstance(sampler, BaseEnsembleSampler): assert_true(sparse.issparse(X_res_sparse)) - assert_allclose_dense_sparse(X_res_sparse.A, X_res) - assert_allclose_dense_sparse(y_res_sparse, y_res) + assert_allclose(X_res_sparse.A, X_res) + assert_allclose(y_res_sparse, y_res) else: for x_sp, x, y_sp, y in zip(X_res_sparse, X_res, y_res_sparse, y_res): assert_true(sparse.issparse(x_sp)) - assert_allclose_dense_sparse(x_sp.A, x, rtol=1e-06, atol=1e-06) - assert_allclose_dense_sparse(y_sp, y) + assert_allclose(x_sp.A, x, rtol=1e-06, atol=1e-06) + assert_allclose(y_sp, y) def check_samplers_pandas(name, Sampler): @@ -291,8 +291,8 @@ def check_samplers_pandas(name, Sampler): sampler = Sampler(random_state=0) X_res_pd, y_res_pd = sampler.fit_sample(X_pd, y_pd) X_res, y_res = sampler.fit_sample(X, y) - assert_allclose_dense_sparse(X_res_pd, X_res) - assert_allclose_dense_sparse(y_res_pd, y_res) + assert_allclose(X_res_pd, X_res) + assert_allclose(y_res_pd, y_res) except ImportError: raise SkipTest("pandas is not installed: not testing for " From 68b16b5a35f643d061e25041768a852df9828749 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 14 Aug 2017 13:48:27 +0200 Subject: [PATCH 19/28] TST install conda 
with ubuntu container --- build_tools/travis/install.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build_tools/travis/install.sh b/build_tools/travis/install.sh index 1179ddaf9..2b590e860 100755 --- a/build_tools/travis/install.sh +++ b/build_tools/travis/install.sh @@ -59,7 +59,7 @@ elif [[ "$DISTRIB" == "ubuntu" ]]; then # Create a new virtualenv using system site packages for python, numpy virtualenv --system-site-packages testvenv source testvenv/bin/activate - pip install scikit-learn nose nose-timer pytest pytest-cov codecov + pip install scikit-learn pandas nose nose-timer pytest pytest-cov codecov fi From 35c638bd3a2a0b11d169bd5d9980d2b6c239fac6 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 14 Aug 2017 15:22:25 +0200 Subject: [PATCH 20/28] TST increase tolerance --- imblearn/utils/estimator_checks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index 9d218e5a0..ede06c658 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -271,7 +271,7 @@ def check_samplers_sparse(name, Sampler): X_res, y_res = sampler.fit_sample(X, y) if not isinstance(sampler, BaseEnsembleSampler): assert_true(sparse.issparse(X_res_sparse)) - assert_allclose(X_res_sparse.A, X_res) + assert_allclose(X_res_sparse.A, X_res, rtol=1e-06, atol=1e-06) assert_allclose(y_res_sparse, y_res) else: for x_sp, x, y_sp, y in zip(X_res_sparse, X_res, y_res_sparse, y_res): From 004f9203111848481912203d0c504009680ebf96 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 14 Aug 2017 15:31:21 +0200 Subject: [PATCH 21/28] TST increase tolerance --- imblearn/utils/estimator_checks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index ede06c658..ce17b6d8d 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -271,12 +271,12 @@ def check_samplers_sparse(name, Sampler): X_res, y_res = sampler.fit_sample(X, y) if not isinstance(sampler, BaseEnsembleSampler): assert_true(sparse.issparse(X_res_sparse)) - assert_allclose(X_res_sparse.A, X_res, rtol=1e-06, atol=1e-06) + assert_allclose(X_res_sparse.A, X_res, rtol=1e-05, atol=1e-05) assert_allclose(y_res_sparse, y_res) else: for x_sp, x, y_sp, y in zip(X_res_sparse, X_res, y_res_sparse, y_res): assert_true(sparse.issparse(x_sp)) - assert_allclose(x_sp.A, x, rtol=1e-06, atol=1e-06) + assert_allclose(x_sp.A, x, rtol=1e-05, atol=1e-05) assert_allclose(y_sp, y) From d3ceb5a2c946f0b6063af70ddce94e2a2368e6bf Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 14 Aug 2017 16:00:10 +0200 Subject: [PATCH 22/28] TST test all versions NearMiss and SMOTE --- imblearn/utils/estimator_checks.py | 57 ++++++++++++++++++++++-------- 1 file changed, 42 insertions(+), 15 deletions(-) diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index ce17b6d8d..2385f91bc 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -28,6 +28,8 @@ from imblearn.over_sampling.base import BaseOverSampler from imblearn.under_sampling.base import BaseCleaningSampler, BaseUnderSampler from imblearn.ensemble.base import BaseEnsembleSampler +from imblearn.over_sampling import SMOTE +from imblearn.under_sampling import NearMiss, ClusterCentroids def _yield_sampler_checks(name, Estimator): @@ -266,18 +268,33 @@ def check_samplers_sparse(name, Sampler): 
n_informative=4, weights=[0.2, 0.3, 0.5], random_state=0) X_sparse = sparse.csr_matrix(X) - sampler = Sampler(random_state=0) - X_res_sparse, y_res_sparse = sampler.fit_sample(X_sparse, y) - X_res, y_res = sampler.fit_sample(X, y) - if not isinstance(sampler, BaseEnsembleSampler): - assert_true(sparse.issparse(X_res_sparse)) - assert_allclose(X_res_sparse.A, X_res, rtol=1e-05, atol=1e-05) - assert_allclose(y_res_sparse, y_res) + if isinstance(Sampler(), SMOTE): + samplers = [Sampler(random_state=0, kind=kind) + for kind in ('regular', 'borderline1', + 'borderline2', 'svm')] + elif isinstance(Sampler(), NearMiss): + samplers = [Sampler(random_state=0, version=version) + for version in (1, 2, 3)] else: - for x_sp, x, y_sp, y in zip(X_res_sparse, X_res, y_res_sparse, y_res): - assert_true(sparse.issparse(x_sp)) - assert_allclose(x_sp.A, x, rtol=1e-05, atol=1e-05) - assert_allclose(y_sp, y) + samplers = [Sampler(random_state=0)] + for sampler in samplers: + X_res_sparse, y_res_sparse = sampler.fit_sample(X_sparse, y) + X_res, y_res = sampler.fit_sample(X, y) + if not isinstance(sampler, BaseEnsembleSampler): + if not isinstance(sampler, ClusterCentroids): + assert_true(sparse.issparse(X_res_sparse)) + assert_allclose(X_res_sparse.A, X_res) + assert_allclose(y_res_sparse, y_res) + else: + assert_true(sparse.issparse(X_res_sparse)) + assert_allclose(X_res_sparse.A, X_res, rtol=1e-4, atol=1e-4) + assert_allclose(y_res_sparse, y_res) + else: + for x_sp, x, y_sp, y in zip(X_res_sparse, X_res, + y_res_sparse, y_res): + assert_true(sparse.issparse(x_sp)) + assert_allclose(x_sp.A, x) + assert_allclose(y_sp, y) def check_samplers_pandas(name, Sampler): @@ -289,10 +306,20 @@ def check_samplers_pandas(name, Sampler): import pandas as pd X_pd, y_pd = pd.DataFrame(X), pd.Series(y) sampler = Sampler(random_state=0) - X_res_pd, y_res_pd = sampler.fit_sample(X_pd, y_pd) - X_res, y_res = sampler.fit_sample(X, y) - assert_allclose(X_res_pd, X_res) - assert_allclose(y_res_pd, y_res) + if isinstance(Sampler(), SMOTE): + samplers = [Sampler(random_state=0, kind=kind) + for kind in ('regular', 'borderline1', + 'borderline2', 'svm')] + elif isinstance(Sampler(), NearMiss): + samplers = [Sampler(random_state=0, version=version) + for version in (1, 2, 3)] + else: + samplers = [Sampler(random_state=0)] + for sampler in samplers: + X_res_pd, y_res_pd = sampler.fit_sample(X_pd, y_pd) + X_res, y_res = sampler.fit_sample(X, y) + assert_allclose(X_res_pd, X_res) + assert_allclose(y_res_pd, y_res) except ImportError: raise SkipTest("pandas is not installed: not testing for " From d9c4e555939a778ae7fb8d8c5ee937ae53b3a139 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 14 Aug 2017 17:48:30 +0200 Subject: [PATCH 23/28] TST set the algorithm of KMeans --- imblearn/utils/estimator_checks.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index 2385f91bc..9fe958bf2 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -15,6 +15,7 @@ from scipy import sparse from sklearn.datasets import make_classification +from sklearn.cluster import KMeans from sklearn.utils.estimator_checks import _yield_all_checks \ as sklearn_yield_all_checks, check_estimator \ as sklearn_check_estimator, check_parameters_default_constructible @@ -275,6 +276,11 @@ def check_samplers_sparse(name, Sampler): elif isinstance(Sampler(), NearMiss): samplers = [Sampler(random_state=0, version=version) for version in (1, 2, 3)] + elif 
isinstance(Sampler(), ClusterCentroids):
+        # set KMeans to full since it supports sparse and dense
+        samplers = [Sampler(random_state=0,
+                            estimator=KMeans(random_state=1,
+                                             algorithm='full'))]
     else:
         samplers = [Sampler(random_state=0)]
     for sampler in samplers:

From b4697472dedb77188f661619b994d93f76f3ccc2 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Mon, 14 Aug 2017 18:14:37 +0200
Subject: [PATCH 24/28] DOC add entry in user guide

---
 doc/introduction.rst      | 51 +++++++++++++++++++++++++++++++++++++++
 doc/problem_statement.rst | 20 ---------------
 doc/user_guide.rst        |  2 +-
 3 files changed, 52 insertions(+), 21 deletions(-)
 create mode 100644 doc/introduction.rst
 delete mode 100644 doc/problem_statement.rst

diff --git a/doc/introduction.rst b/doc/introduction.rst
new file mode 100644
index 000000000..3261c6321
--- /dev/null
+++ b/doc/introduction.rst
@@ -0,0 +1,51 @@
+.. _introduction:
+
+============
+Introduction
+============
+
+.. _api_imblearn:
+
+APIs of imbalanced-learn samplers
+----------------------------------
+
+The samplers available follow the scikit-learn API, using the estimator base
+object with an additional sample method:
+
+:Estimator:
+
+    The base object implements a ``fit`` method to learn from data::
+
+      estimator = obj.fit(data, targets)
+
+:Sampler:
+
+    To resample a data set, each sampler implements::
+
+      data_resampled, targets_resampled = obj.sample(data, targets)
+
+Imbalanced-learn samplers accept the same inputs as scikit-learn estimators:
+
+* ``data``: array-like (2-D list, pandas.DataFrame, numpy.array) or sparse
+  matrices;
+* ``targets``: array-like (1-D list, pandas.Series, numpy.array).
+
+.. _problem_statement:
+
+Problem statement regarding imbalanced data sets
+------------------------------------------------
+
+The learning phase and the subsequent prediction of machine learning algorithms
+can be affected by the problem of imbalanced data sets. The balancing issue
+corresponds to the difference of the number of samples in the different
+classes. We illustrate the effect of training a linear SVM classifier with
+different levels of class balancing.
+
+.. image:: ./auto_examples/over-sampling/images/sphx_glr_plot_comparison_over_sampling_001.png
+   :target: ./auto_examples/over-sampling/plot_comparison_over_sampling.html
+   :scale: 60
+   :align: center
+
+As expected, the decision function of the linear SVM is highly impacted. With a
+greater imbalanced ratio, the decision function favors the class with the larger
+number of samples, usually referred to as the majority class.
diff --git a/doc/problem_statement.rst b/doc/problem_statement.rst
deleted file mode 100644
index 7b1a87e88..000000000
--- a/doc/problem_statement.rst
+++ /dev/null
@@ -1,20 +0,0 @@
-.. _problem_statement:
-
-=================
-Problem statement
-=================
-
-The learning phase and the subsequent prediction of machine learning algorithms
-can be affected by the problem of imbalanced data set. The balancing issue
-corresponds to the difference of the number of samples in the different
-classes. We illustrate the effect of training a linear SVM classifier with
-different level of class balancing.
-
-.. image:: ./auto_examples/over-sampling/images/sphx_glr_plot_comparison_over_sampling_001.png
-   :target: ./auto_examples/over-sampling/plot_comparison_over_sampling.html
-   :scale: 60
-   :align: center
-
-As expected, the decision function of the linear SVM is highly impacted. With a
-greater imbalanced ratio, the decision function favor the class with the larger
-number of samples, usually referred as the majority class.
diff --git a/doc/user_guide.rst b/doc/user_guide.rst
index 88c5f8f92..7077f313f 100644
--- a/doc/user_guide.rst
+++ b/doc/user_guide.rst
@@ -9,7 +9,7 @@ User Guide
 .. toctree::
    :numbered:

-   problem_statement.rst
+   introduction.rst
    over_sampling.rst
    under_sampling.rst
    combine.rst
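As a concrete illustration of the fit/sample API documented in the patch above, here is a minimal sketch; the data set is a hypothetical toy problem, and the class and method names are the ones used throughout this series:

    from collections import Counter
    from sklearn.datasets import make_classification
    from imblearn.under_sampling import RandomUnderSampler

    # a toy imbalanced two-class problem
    X, y = make_classification(n_samples=1000, weights=[0.1, 0.9],
                               random_state=0)
    rus = RandomUnderSampler(random_state=0)
    rus.fit(X, y)                    # learn the class statistics
    X_res, y_res = rus.sample(X, y)  # perform the resampling
    # or, equivalently, chain both steps in one call
    X_res, y_res = rus.fit_sample(X, y)
    print(Counter(y_res))            # both classes reduced to the minority count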
From c05d0ba102c0ce0d52e8a4aefd28d48eff7d359f Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Mon, 14 Aug 2017 18:30:41 +0200
Subject: [PATCH 25/28] DOC add entry sparse for CC

---
 doc/introduction.rst   | 7 +++++++
 doc/under_sampling.rst | 6 ++++++
 2 files changed, 13 insertions(+)

diff --git a/doc/introduction.rst b/doc/introduction.rst
index 3261c6321..9bf1a6e0b 100644
--- a/doc/introduction.rst
+++ b/doc/introduction.rst
@@ -30,6 +30,13 @@ Imbalanced-learn samplers accept the same inputs as scikit-learn estimators:
   matrices;
 * ``targets``: array-like (1-D list, pandas.Series, numpy.array).

+.. topic:: Sparse input
+
+  For sparse input the data is **converted to the Compressed Sparse Row
+  representation** (see ``scipy.sparse.csr_matrix``) before being fed to the
+  sampler. To avoid unnecessary memory copies, it is recommended to choose the
+  CSR representation upstream.
+
 .. _problem_statement:

 Problem statement regarding imbalanced data sets
 ------------------------------------------------
diff --git a/doc/under_sampling.rst b/doc/under_sampling.rst
index cc292471c..78847122f 100644
--- a/doc/under_sampling.rst
+++ b/doc/under_sampling.rst
@@ -49,6 +49,12 @@ your data are grouped into clusters. In addition, the number of centroids
 should be set such that the under-sampled clusters are representative of the
 original one.

+.. warning::
+
+   :class:`ClusterCentroids` supports sparse matrices. However, the new samples
+   generated are not necessarily sparse. Therefore, even if the resulting
+   matrix is sparse, the algorithm will be inefficient in this regard.
+
 See :ref:`sphx_glr_auto_examples_under-sampling_plot_cluster_centroids.py` and
 :ref:`sphx_glr_auto_examples_under-sampling_plot_comparison_under_sampling.py`.

From 1625879d6dc3ded60ed2f02c17f1f0f9a4519bf6 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Mon, 14 Aug 2017 18:33:04 +0200
Subject: [PATCH 26/28] DOC whatsnew entry

---
 doc/whats_new.rst | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index b5f61c85b..54f6d441f 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -44,6 +44,9 @@ New features
 Enhancement
 ~~~~~~~~~~~

+- All samplers accept sparse matrices, defaulting to CSR type. By
+  `Guillaume Lemaitre`_.
+
 - :func:`datasets.make_imbalance` take a ratio similarly to other samplers.
   It supports multiclass. By `Guillaume Lemaitre`_.
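A hedged sketch of the sparse support recorded in the whatsnew entry above; the data is synthetic, and the behaviour shown is the one exercised by the common test ``check_samplers_sparse`` introduced earlier in this series:

    from scipy import sparse
    from sklearn.datasets import make_classification
    from imblearn.under_sampling import RandomUnderSampler

    X, y = make_classification(n_samples=1000, weights=[0.1, 0.9],
                               random_state=0)
    X_sparse = sparse.csr_matrix(X)  # CSR is the preferred input format
    rus = RandomUnderSampler(random_state=0)
    X_res, y_res = rus.fit_sample(X_sparse, y)
    print(sparse.issparse(X_res))    # True: the sampler does not densify X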
From 72a605d2b902bc8a085a751412b95c3af76e4e06 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Mon, 14 Aug 2017 20:03:25 +0200
Subject: [PATCH 27/28] EHN add voting parameter for ClusterCentroids

---
 doc/under_sampling.rst                         | 20 +++--
 doc/whats_new.rst                              |  4 +
 .../under-sampling/plot_cluster_centroids.py   | 40 +++++++---
 .../prototype_generation/cluster_centroids.py  | 70 +++++++++++++----
 .../tests/test_cluster_centroids.py            | 76 ++++++++++++++----
 5 files changed, 163 insertions(+), 47 deletions(-)

diff --git a/doc/under_sampling.rst b/doc/under_sampling.rst
index 78847122f..7e1d69e7f 100644
--- a/doc/under_sampling.rst
+++ b/doc/under_sampling.rst
@@ -49,11 +49,21 @@ your data are grouped into clusters. In addition, the number of centroids
 should be set such that the under-sampled clusters are representative of the
 original one.

-.. warning::
-
-   :class:`ClusterCentroids` supports sparse matrices. However, the new samples
-   generated are not necessarily sparse. Therefore, even if the resulting
-   matrix is sparse, the algorithm will be inefficient in this regard.
+:class:`ClusterCentroids` accepts sparse matrices. However, it is recommended
+not to set ``voting`` to ``'soft'`` since the centroids found by the
+clustering method will be used. Those centroids are not enforced to be sparse
+and thus the output will not be memory efficient. Note that by default
+``voting`` is set to ``'auto'``, which will automatically choose ``'hard'``
+voting instead of ``'soft'`` voting in the case of a sparse input.
+
+The effect of the ``voting`` parameter is illustrated in the figure below. When
+``voting`` is set to ``'hard'``, the nearest neighbors of the centroids are used
+instead of the centroids themselves, which are used with ``'soft'`` voting.
+
+.. image:: ./auto_examples/under-sampling/images/sphx_glr_plot_cluster_centroids_001.png
+   :target: ./auto_examples/under-sampling/plot_cluster_centroids.html
+   :scale: 60
+   :align: center

 See :ref:`sphx_glr_auto_examples_under-sampling_plot_cluster_centroids.py` and
 :ref:`sphx_glr_auto_examples_under-sampling_plot_comparison_under_sampling.py`.
diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index 54f6d441f..e24cf9fdf 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -35,6 +35,10 @@ Bug fixes
 New features
 ~~~~~~~~~~~~

+- :class:`under_sampling.ClusterCentroids` accepts a parameter ``voting``
+  allowing the use of nearest neighbors of the centroids instead of the
+  centroids themselves. It is more efficient for sparse input. By `Guillaume Lemaitre`_.
+
 - Turn off steps in :class:`pipeline.Pipeline` using the `None` object. By
   `Christos Aridas`_.
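For reference while reading the example and implementation diffs below, a sketch of the new ``voting`` parameter in use; the data set is hypothetical, and the ``voting_`` attribute is set during ``fit_sample``, as asserted by the tests at the end of this patch:

    from scipy import sparse
    from sklearn.datasets import make_classification
    from imblearn.under_sampling import ClusterCentroids

    X, y = make_classification(n_samples=1000, weights=[0.1, 0.9],
                               random_state=0)
    # explicit 'hard' voting: centroids are replaced by their nearest
    # real samples, so a sparse input yields a sparse output
    cc = ClusterCentroids(voting='hard', random_state=0)
    X_res, y_res = cc.fit_sample(sparse.csr_matrix(X), y)

    # with the default voting='auto', sparse input selects 'hard' voting
    cc_auto = ClusterCentroids(random_state=0)
    cc_auto.fit_sample(sparse.csr_matrix(X), y)
    print(cc_auto.voting_)           # 'hard'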
diff --git a/examples/under-sampling/plot_cluster_centroids.py b/examples/under-sampling/plot_cluster_centroids.py index d6f7eaf25..d13b669e3 100644 --- a/examples/under-sampling/plot_cluster_centroids.py +++ b/examples/under-sampling/plot_cluster_centroids.py @@ -24,7 +24,7 @@ X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, - n_samples=200, random_state=10) + n_samples=50, random_state=10) # Instanciate a PCA object for the sake of easy visualisation pca = PCA(n_components=2) @@ -34,10 +34,15 @@ # Apply Cluster Centroids cc = ClusterCentroids() X_resampled, y_resampled = cc.fit_sample(X, y) -X_res_vis = pca.transform(X_resampled) +X_res_vis_soft = pca.transform(X_resampled) + +# Use hard voting instead of soft voting +cc = ClusterCentroids(voting='hard') +X_resampled, y_resampled = cc.fit_sample(X, y) +X_res_vis_hard = pca.transform(X_resampled) # Two subplots, unpack the axes array immediately -f, (ax1, ax2) = plt.subplots(1, 2) +f, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5)) c0 = ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5) @@ -45,14 +50,30 @@ alpha=0.5) ax1.set_title('Original set') -ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1], +ax2.scatter(X_res_vis_soft[y_resampled == 0, 0], + X_res_vis_soft[y_resampled == 0, 1], + label="Class #0", alpha=.5) +ax2.scatter(X_res_vis_soft[y_resampled == 1, 0], + X_res_vis_soft[y_resampled == 1, 1], + label="Class #1", alpha=.5) +c2 = ax2.scatter(X_vis[y == 1, 0], + X_vis[y == 1, 1], label="Original #1", + alpha=0.2) +ax2.set_title('Cluster centroids with soft voting') + +ax3.scatter(X_res_vis_hard[y_resampled == 0, 0], + X_res_vis_hard[y_resampled == 0, 1], label="Class #0", alpha=.5) -ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1], +ax3.scatter(X_res_vis_hard[y_resampled == 1, 0], + X_res_vis_hard[y_resampled == 1, 1], label="Class #1", alpha=.5) -ax2.set_title('Cluster centroids') +ax3.scatter(X_vis[y == 1, 0], + X_vis[y == 1, 1], + alpha=0.2) +ax3.set_title('Cluster centroids with hard voting') # make nice plotting -for ax in (ax1, ax2): +for ax in (ax1, ax2, ax3): ax.spines['top'].set_visible(False) ax.spines['right'].set_visible(False) ax.get_xaxis().tick_bottom() @@ -62,7 +83,8 @@ ax.set_xlim([-6, 8]) ax.set_ylim([-6, 6]) -plt.figlegend((c0, c1), ('Class #0', 'Class #1'), loc='lower center', - ncol=2, labelspacing=0.) +plt.figlegend((c0, c1), ('Class #0', 'Class #1', 'Original Class #1'), + loc='lower center', + ncol=3, labelspacing=0.) plt.tight_layout(pad=3) plt.show() diff --git a/imblearn/under_sampling/prototype_generation/cluster_centroids.py b/imblearn/under_sampling/prototype_generation/cluster_centroids.py index 0cfebb193..aef3fdbe1 100644 --- a/imblearn/under_sampling/prototype_generation/cluster_centroids.py +++ b/imblearn/under_sampling/prototype_generation/cluster_centroids.py @@ -12,10 +12,13 @@ from scipy import sparse from sklearn.cluster import KMeans +from sklearn.neighbors import NearestNeighbors from sklearn.utils import safe_indexing from ..base import BaseUnderSampler +VOTING_KIND = ('auto', 'hard', 'soft') + class ClusterCentroids(BaseUnderSampler): """Perform under-sampling by generating centroids based on @@ -58,6 +61,18 @@ class ClusterCentroids(BaseUnderSampler): estimator : object, optional(default=KMeans()) Pass a :class:`sklearn.cluster.KMeans` estimator. 
+ voting : str, optional (default='auto') + Voting strategy to generate the new samples: + + - If ``'hard'``, the nearest-neighbors of the centroids found using the + clustering algorithm will be used. + - If ``'soft'``, the centroids found by the clustering algorithm will + be used. + - If ``'auto'``, if the input is sparse, it will default on ``'hard'`` + otherwise, ``'soft'`` will be used. + + .. versionadded:: 0.3.0 + n_jobs : int, optional (default=1) The number of threads to open if possible. @@ -91,10 +106,12 @@ def __init__(self, ratio='auto', random_state=None, estimator=None, + voting='auto', n_jobs=1): super(ClusterCentroids, self).__init__( ratio=ratio, random_state=random_state) self.estimator = estimator + self.voting = voting self.n_jobs = n_jobs def _validate_estimator(self): @@ -108,6 +125,22 @@ def _validate_estimator(self): raise ValueError('`estimator` has to be a KMeans clustering.' ' Got {} instead.'.format(type(self.estimator))) + def _generate_sample(self, X, y, centroids, target_class): + if self.voting_ == 'hard': + nearest_neighbors = NearestNeighbors(n_neighbors=1) + nearest_neighbors.fit(X, y) + indices = nearest_neighbors.kneighbors(centroids, + return_distance=False) + X_new = safe_indexing(X, np.squeeze(indices)) + else: + if sparse.issparse(X): + X_new = sparse.csr_matrix(centroids) + else: + X_new = centroids + y_new = np.array([target_class] * centroids.shape[0]) + + return X_new, y_new + def _sample(self, X, y): """Resample the dataset. @@ -131,28 +164,37 @@ def _sample(self, X, y): """ self._validate_estimator() - idx_under = np.empty((0, ), dtype=int) - centroids, y_resampled = [], [] + if self.voting == 'auto': + if sparse.issparse(X): + self.voting_ = 'hard' + else: + self.voting_ = 'soft' + else: + if self.voting in VOTING_KIND: + self.voting_ = self.voting + else: + raise ValueError("'voting' needs to be one of {}. 
diff --git a/imblearn/under_sampling/prototype_generation/tests/test_cluster_centroids.py b/imblearn/under_sampling/prototype_generation/tests/test_cluster_centroids.py
index d94a8070b..7a501e003 100644
--- a/imblearn/under_sampling/prototype_generation/tests/test_cluster_centroids.py
+++ b/imblearn/under_sampling/prototype_generation/tests/test_cluster_centroids.py
@@ -4,8 +4,10 @@
 from collections import Counter
 
 import numpy as np
+from scipy import sparse
 from sklearn.utils.testing import (assert_allclose, assert_array_equal,
-                                   assert_equal, assert_raises_regex)
+                                   assert_equal, assert_raises_regex,
+                                   assert_true)
 from sklearn.cluster import KMeans
 
 from imblearn.under_sampling import ClusterCentroids
@@ -20,17 +22,26 @@
 R_TOL = 1e-4
 
 
+def test_fit_sample_check_voting():
+    cc = ClusterCentroids(random_state=RND_SEED)
+    cc.fit_sample(X, Y)
+    assert_equal(cc.voting_, 'soft')
+    cc = ClusterCentroids(random_state=RND_SEED)
+    cc.fit_sample(sparse.csr_matrix(X), Y)
+    assert_equal(cc.voting_, 'hard')
+
+
 def test_fit_sample_auto():
     ratio = 'auto'
     cc = ClusterCentroids(ratio=ratio, random_state=RND_SEED)
     X_resampled, y_resampled = cc.fit_sample(X, Y)
-    X_gt = np.array([[0.06738818, -0.529627],
-                     [0.17901516, 0.69860992],
-                     [0.094035, -2.55298982],
-                     [0.92923648, 0.76103773],
+    X_gt = np.array([[0.92923648, 0.76103773],
                      [0.47104475, 0.44386323],
-                     [0.13347175, 0.12167502]])
-    y_gt = np.array([1, 1, 1, 0, 0, 0])
+                     [0.13347175, 0.12167502],
+                     [0.06738818, -0.529627],
+                     [0.17901516, 0.69860992],
+                     [0.094035, -2.55298982]])
+    y_gt = np.array([0, 0, 0, 1, 1, 1])
     assert_allclose(X_resampled, X_gt, rtol=R_TOL)
     assert_array_equal(y_resampled, y_gt)
 
@@ -39,16 +50,16 @@ def test_fit_sample_half():
     ratio = .5
     cc = ClusterCentroids(ratio=ratio, random_state=RND_SEED)
     X_resampled, y_resampled = cc.fit_sample(X, Y)
-    X_gt = np.array([[0.09125309, -0.85409574],
+    X_gt = np.array([[0.92923648, 0.76103773],
+                     [0.47104475, 0.44386323],
+                     [0.13347175, 0.12167502],
+                     [0.09125309, -0.85409574],
                      [0.19220316, 0.32337101],
                      [0.094035, -2.55298982],
                      [0.20792588, 1.49407907],
                      [0.04352327, -0.20515826],
-                     [0.12372842, 0.6536186],
-                     [0.92923648, 0.76103773],
-                     [0.47104475, 0.44386323],
-                     [0.13347175, 0.12167502]])
-    y_gt = np.array([1, 1, 1, 1, 1, 1, 0, 0, 0])
+                     [0.12372842, 0.6536186]])
+    y_gt = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1])
     assert_allclose(X_resampled, X_gt, rtol=R_TOL)
     assert_array_equal(y_resampled, y_gt)
 
@@ -72,21 +83,48 @@ def test_fit_sample_object():
         ratio=ratio, random_state=RND_SEED, estimator=cluster)
     X_resampled, y_resampled = cc.fit_sample(X, Y)
-    X_gt = np.array([[0.06738818, -0.529627],
+    X_gt = np.array([[0.92923648, 0.76103773],
+                     [0.47104475, 0.44386323],
+                     [0.13347175, 0.12167502],
+                     [0.06738818, -0.529627],
                      [0.17901516, 0.69860992],
-                     [0.094035, -2.55298982],
-                     [0.92923648, 0.76103773],
+                     [0.094035, -2.55298982]])
+    y_gt = np.array([0, 0, 0, 1, 1, 1])
+    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
+    assert_array_equal(y_resampled, y_gt)
+
+
+def test_fit_hard_voting():
+    ratio = 'auto'
+    voting = 'hard'
+    cluster = KMeans(random_state=RND_SEED)
+    cc = ClusterCentroids(
+        ratio=ratio, random_state=RND_SEED, estimator=cluster,
+        voting=voting)
+
+    X_resampled, y_resampled = cc.fit_sample(X, Y)
+    X_gt = np.array([[0.92923648, 0.76103773],
                      [0.47104475, 0.44386323],
-                     [0.13347175, 0.12167502]])
-    y_gt = np.array([1, 1, 1, 0, 0, 0])
+                     [0.13347175, 0.12167502],
+                     [0.09125309, -0.85409574],
+                     [0.12372842, 0.6536186],
+                     [0.094035, -2.55298982]])
+    y_gt = np.array([0, 0, 0, 1, 1, 1])
     assert_allclose(X_resampled, X_gt, rtol=R_TOL)
     assert_array_equal(y_resampled, y_gt)
+    for x in X_resampled:
+        assert_true(np.any(np.all(x == X, axis=1)))
 
 
-def test_fit_sample_wrong_object():
+def test_fit_sample_error():
     ratio = 'auto'
     cluster = 'rnd'
     cc = ClusterCentroids(
         ratio=ratio, random_state=RND_SEED, estimator=cluster)
     assert_raises_regex(ValueError, "has to be a KMeans clustering",
                         cc.fit_sample, X, Y)
+
+    voting = 'unknown'
+    cc = ClusterCentroids(ratio=ratio, voting=voting, random_state=RND_SEED)
+    assert_raises_regex(ValueError, "needs to be one of",
+                        cc.fit_sample, X, Y)
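The first new test above, `test_fit_sample_check_voting`, pins down the 'auto' resolution rule. Restated in isolation (a sketch under the same assumptions as the tests; the toy dataset here is illustrative, not the fixture the tests use):

from scipy import sparse
from sklearn.datasets import make_classification

from imblearn.under_sampling import ClusterCentroids

X, y = make_classification(weights=[0.2, 0.8], n_samples=100, random_state=0)

cc = ClusterCentroids(random_state=0)
cc.fit_sample(X, y)                      # dense input ...
assert cc.voting_ == 'soft'              # ... resolves 'auto' to soft voting

cc = ClusterCentroids(random_state=0)
cc.fit_sample(sparse.csr_matrix(X), y)   # sparse input ...
assert cc.voting_ == 'hard'              # ... resolves 'auto' to hard voting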
From e1ffb13a5ec6232dcbe743a7e1be985cf943aff9 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Mon, 14 Aug 2017 20:45:02 +0200
Subject: [PATCH 28/28] TST fix common test fixing voting

---
 imblearn/utils/estimator_checks.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py
index 9fe958bf2..7a851cc06 100644
--- a/imblearn/utils/estimator_checks.py
+++ b/imblearn/utils/estimator_checks.py
@@ -279,6 +279,7 @@ def check_samplers_sparse(name, Sampler):
     elif isinstance(Sampler(), ClusterCentroids):
         # set KMeans to full since it supports sparse and dense
         samplers = [Sampler(random_state=0,
+                            voting='soft',
                             estimator=KMeans(random_state=1,
                                              algorithm='full'))]
     else:
@@ -287,14 +288,9 @@ def check_samplers_sparse(name, Sampler):
         X_res_sparse, y_res_sparse = sampler.fit_sample(X_sparse, y)
         X_res, y_res = sampler.fit_sample(X, y)
         if not isinstance(sampler, BaseEnsembleSampler):
-            if not isinstance(sampler, ClusterCentroids):
                 assert_true(sparse.issparse(X_res_sparse))
                 assert_allclose(X_res_sparse.A, X_res)
                 assert_allclose(y_res_sparse, y_res)
-            else:
-                assert_true(sparse.issparse(X_res_sparse))
-                assert_allclose(X_res_sparse.A, X_res, rtol=1e-4, atol=1e-4)
-                assert_allclose(y_res_sparse, y_res)
         else:
             for x_sp, x, y_sp, y in zip(X_res_sparse, X_res,
                                         y_res_sparse, y_res):
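The contract this last fix restores, restated as a standalone check (a sketch of the common test's intent, not the test itself; the toy dataset and exact equality on it are assumptions): with `voting='soft'` and a KMeans that handles both input kinds, the sparse and dense paths should produce identical resampled data, so the ClusterCentroids special case with loosened tolerances can be dropped.

import numpy as np
from scipy import sparse
from sklearn.cluster import KMeans
from sklearn.datasets import make_classification

from imblearn.under_sampling import ClusterCentroids

X, y = make_classification(weights=[0.2, 0.8], n_samples=100, random_state=0)

sampler = ClusterCentroids(random_state=0, voting='soft',
                           estimator=KMeans(random_state=1,
                                            algorithm='full'))
X_sp, y_sp = sampler.fit_sample(sparse.csr_matrix(X), y)
X_dense, y_dense = sampler.fit_sample(X, y)

assert sparse.issparse(X_sp)                 # sparse in, sparse out
np.testing.assert_allclose(X_sp.A, X_dense)  # same values on both paths
np.testing.assert_allclose(y_sp, y_dense)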