From a68e8eb00e52a5698b2c61804fec015d143a3c54 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 12 Aug 2017 13:01:11 +0200 Subject: [PATCH 01/28] EHN POC sparse handling for RandomUnderSampler --- imblearn/base.py | 21 +++---- imblearn/over_sampling/base.py | 58 +++++++++++++++++++ .../random_under_sampler.py | 25 +++----- 3 files changed, 77 insertions(+), 27 deletions(-) diff --git a/imblearn/base.py b/imblearn/base.py index af3d0536d..08b1b6adf 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -38,24 +38,25 @@ def sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {array-like, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. - y_resampled : ndarray, shape (n_samples_new) + y_resampled : array-like, shape (n_samples_new) The corresponding label of `X_resampled` """ # Check the consistency of X and y - X, y = check_X_y(X, y) + X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) check_is_fitted(self, 'ratio_') self._check_X_y(X, y) @@ -70,7 +71,7 @@ def fit_sample(self, X, y): X : ndarray, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : ndarray, shape (n_samples,) Corresponding label for each sample in X. Returns @@ -78,7 +79,7 @@ def fit_sample(self, X, y): X_resampled : ndarray, shape (n_samples_new, n_features) The array containing the resampled data. - y_resampled : ndarray, shape (n_samples_new) + y_resampled : ndarray, shape (n_samples_new,) The corresponding label of `X_resampled` """ @@ -138,10 +139,10 @@ def fit(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns @@ -150,7 +151,7 @@ def fit(self, X, y): Return self. """ - X, y = check_X_y(X, y) + X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) y = check_target_type(y) self.X_hash_, self.y_hash_ = hash_X_y(X, y) # self.sampling_type is already checked in check_ratio diff --git a/imblearn/over_sampling/base.py b/imblearn/over_sampling/base.py index 9c1f6d51b..175ee4e47 100644 --- a/imblearn/over_sampling/base.py +++ b/imblearn/over_sampling/base.py @@ -5,6 +5,8 @@ # Christos Aridas # License: MIT +from sklearn.utils import check_X_y + from ..base import BaseSampler @@ -16,3 +18,59 @@ class BaseOverSampler(BaseSampler): """ _sampling_type = 'over-sampling' + + def fit(self, X, y): + """Find the classes statistics before to perform sampling. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Matrix containing the data which have to be sampled. + + y : array-like, shape (n_samples,) + Corresponding label for each sample in X. + + Returns + ------- + self : object, + Return self. + + Notes + ----- + Over-samplers do not accept sparse matrices. + + """ + # over-sampling method does not handle sparse matrix + X, y = check_X_y(X, y) + + return super(BaseOverSampler, self).fit(X, y) + + def sample(self, X, y): + """Resample the dataset. 
+ + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Matrix containing the data which have to be sampled. + + y : array-like, shape (n_samples,) + Corresponding label for each sample in X. + + Returns + ------- + X_resampled : array-like, shape (n_samples_new, n_features) + The array containing the resampled data. + + y_resampled : array-like, shape (n_samples_new,) + The corresponding label of `X_resampled` + + Notes + ----- + Over-samplers do not accept sparse matrices. + + """ + + # Check the consistency of X and y + X, y = check_X_y(X, y) + + return super(BaseOverSampler, self).sample(X, y) diff --git a/imblearn/under_sampling/prototype_selection/random_under_sampler.py b/imblearn/under_sampling/prototype_selection/random_under_sampler.py index 9fa242363..5adfb8055 100644 --- a/imblearn/under_sampling/prototype_selection/random_under_sampler.py +++ b/imblearn/under_sampling/prototype_selection/random_under_sampler.py @@ -7,7 +7,7 @@ from __future__ import division import numpy as np -from sklearn.utils import check_random_state +from sklearn.utils import check_random_state, safe_indexing from ..base import BaseUnderSampler @@ -110,10 +110,7 @@ def _sample(self, X, y): """ random_state = check_random_state(self.random_state) - X_resampled = np.empty((0, X.shape[1]), dtype=X.dtype) - y_resampled = np.empty((0, ), dtype=y.dtype) - if self.return_indices: - idx_under = np.empty((0, ), dtype=int) + idx_under = np.empty((0, ), dtype=int) for target_class in np.unique(y): if target_class in self.ratio_.keys(): @@ -125,18 +122,12 @@ def _sample(self, X, y): else: index_target_class = slice(None) - X_resampled = np.concatenate( - (X_resampled, X[y == target_class][index_target_class]), - axis=0) - y_resampled = np.concatenate( - (y_resampled, y[y == target_class][index_target_class]), - axis=0) - if self.return_indices: - idx_under = np.concatenate( - (idx_under, np.flatnonzero(y == target_class)[ - index_target_class]), axis=0) + idx_under = np.concatenate( + (idx_under, np.flatnonzero(y == target_class)[ + index_target_class]), axis=0) if self.return_indices: - return X_resampled, y_resampled, idx_under + return (safe_indexing(X, idx_under), safe_indexing(y, idx_under), + idx_under) else: - return X_resampled, y_resampled + return safe_indexing(X, idx_under), safe_indexing(y, idx_under) From 0062d6d6c90119628b0abab51e9afa36a79db46f Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 12 Aug 2017 13:49:11 +0200 Subject: [PATCH 02/28] EHN support sparse ENN --- .../edited_nearest_neighbours.py | 32 ++++++++----------- 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/imblearn/under_sampling/prototype_selection/edited_nearest_neighbours.py b/imblearn/under_sampling/prototype_selection/edited_nearest_neighbours.py index 5d5011542..afc6cdbc9 100644 --- a/imblearn/under_sampling/prototype_selection/edited_nearest_neighbours.py +++ b/imblearn/under_sampling/prototype_selection/edited_nearest_neighbours.py @@ -14,6 +14,8 @@ import numpy as np from scipy.stats import mode +from sklearn.utils import safe_indexing + from ..base import BaseCleaningSampler from ...utils import check_neighbors_object from ...utils.deprecation import deprecate_parameter @@ -167,20 +169,20 @@ def _sample(self, X, y): """ self._validate_estimator() - X_resampled = np.empty((0, X.shape[1]), dtype=X.dtype) - y_resampled = np.empty((0, ), dtype=y.dtype) - if self.return_indices: - idx_under = np.empty((0, ), dtype=int) + idx_under = np.empty((0, ), dtype=int) self.nn_.fit(X) for 
target_class in np.unique(y): if target_class in self.ratio_.keys(): - X_class = X[y == target_class] - y_class = y[y == target_class] + target_class_indices = np.flatnonzero(y == target_class) + X_class = safe_indexing(X, target_class_indices) + y_class = safe_indexing(y, target_class_indices) + print(target_class_indices) nnhood_idx = self.nn_.kneighbors( X_class, return_distance=False)[:, 1:] nnhood_label = y[nnhood_idx] + print(nnhood_idx) if self.kind_sel == 'mode': nnhood_label, _ = mode(nnhood_label, axis=1) nnhood_bool = np.ravel(nnhood_label) == y_class @@ -191,21 +193,15 @@ def _sample(self, X, y): else: index_target_class = slice(None) - X_resampled = np.concatenate( - (X_resampled, X[y == target_class][index_target_class]), - axis=0) - y_resampled = np.concatenate( - (y_resampled, y[y == target_class][index_target_class]), - axis=0) - if self.return_indices: - idx_under = np.concatenate( - (idx_under, np.flatnonzero(y == target_class)[ - index_target_class]), axis=0) + idx_under = np.concatenate( + (idx_under, np.flatnonzero(y == target_class)[ + index_target_class]), axis=0) if self.return_indices: - return X_resampled, y_resampled, idx_under + return (safe_indexing(X, idx_under), safe_indexing(y, idx_under), + idx_under) else: - return X_resampled, y_resampled + return safe_indexing(X, idx_under), safe_indexing(y, idx_under) class RepeatedEditedNearestNeighbours(BaseCleaningSampler): From 6197d80539fa557e21edeb7fa3b80d7426892ae6 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 12 Aug 2017 15:53:00 +0200 Subject: [PATCH 03/28] iter --- .../prototype_selection/edited_nearest_neighbours.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/imblearn/under_sampling/prototype_selection/edited_nearest_neighbours.py b/imblearn/under_sampling/prototype_selection/edited_nearest_neighbours.py index afc6cdbc9..aa7fda65c 100644 --- a/imblearn/under_sampling/prototype_selection/edited_nearest_neighbours.py +++ b/imblearn/under_sampling/prototype_selection/edited_nearest_neighbours.py @@ -178,11 +178,9 @@ def _sample(self, X, y): target_class_indices = np.flatnonzero(y == target_class) X_class = safe_indexing(X, target_class_indices) y_class = safe_indexing(y, target_class_indices) - print(target_class_indices) nnhood_idx = self.nn_.kneighbors( X_class, return_distance=False)[:, 1:] nnhood_label = y[nnhood_idx] - print(nnhood_idx) if self.kind_sel == 'mode': nnhood_label, _ = mode(nnhood_label, axis=1) nnhood_bool = np.ravel(nnhood_label) == y_class From f6698433250323bbe7a401f3e6e341092267c5d7 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 12 Aug 2017 16:35:27 +0200 Subject: [PATCH 04/28] EHN sparse indexing IHT --- .../instance_hardness_threshold.py | 30 ++++++++----------- 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/imblearn/under_sampling/prototype_selection/instance_hardness_threshold.py b/imblearn/under_sampling/prototype_selection/instance_hardness_threshold.py index 7de2f9cdb..551751599 100644 --- a/imblearn/under_sampling/prototype_selection/instance_hardness_threshold.py +++ b/imblearn/under_sampling/prototype_selection/instance_hardness_threshold.py @@ -16,6 +16,7 @@ from sklearn.base import ClassifierMixin from sklearn.ensemble import RandomForestClassifier from sklearn.externals.six import string_types +from sklearn.utils import safe_indexing from ..base import BaseCleaningSampler @@ -219,8 +220,10 @@ def _sample(self, X, y): probabilities = np.zeros(y.shape[0], dtype=float) for train_index, test_index in skf: - X_train, X_test = 
X[train_index], X[test_index] - y_train, y_test = y[train_index], y[test_index] + X_train = safe_indexing(X, train_index) + X_test = safe_indexing(X, test_index) + y_train = safe_indexing(y, train_index) + y_test = safe_indexing(y, test_index) self.estimator_.fit(X_train, y_train) @@ -231,10 +234,7 @@ def _sample(self, X, y): for l, c in enumerate(y_test) ] - X_resampled = np.empty((0, X.shape[1]), dtype=X.dtype) - y_resampled = np.empty((0, ), dtype=y.dtype) - if self.return_indices: - idx_under = np.empty((0, ), dtype=int) + idx_under = np.empty((0, ), dtype=int) for target_class in np.unique(y): if target_class in self.ratio_.keys(): @@ -247,18 +247,12 @@ def _sample(self, X, y): else: index_target_class = slice(None) - X_resampled = np.concatenate( - (X_resampled, X[y == target_class][index_target_class]), - axis=0) - y_resampled = np.concatenate( - (y_resampled, y[y == target_class][index_target_class]), - axis=0) - if self.return_indices: - idx_under = np.concatenate( - (idx_under, np.flatnonzero(y == target_class)[ - index_target_class]), axis=0) + idx_under = np.concatenate( + (idx_under, np.flatnonzero(y == target_class)[ + index_target_class]), axis=0) if self.return_indices: - return X_resampled, y_resampled, idx_under + return (safe_indexing(X, idx_under), safe_indexing(y, idx_under), + idx_under) else: - return X_resampled, y_resampled + return safe_indexing(X, idx_under), safe_indexing(y, idx_under) From 4adc6dbe3c3e7dfb6b65a3855496398978c85b37 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 12 Aug 2017 16:50:33 +0200 Subject: [PATCH 05/28] EHN sparse support nearmiss --- .../prototype_selection/nearmiss.py | 43 +++++++++---------- 1 file changed, 20 insertions(+), 23 deletions(-) diff --git a/imblearn/under_sampling/prototype_selection/nearmiss.py b/imblearn/under_sampling/prototype_selection/nearmiss.py index c7622ee67..2c5771397 100644 --- a/imblearn/under_sampling/prototype_selection/nearmiss.py +++ b/imblearn/under_sampling/prototype_selection/nearmiss.py @@ -11,6 +11,8 @@ import numpy as np +from sklearn.utils import safe_indexing + from ..base import BaseUnderSampler from ...utils import check_neighbors_object from ...utils.deprecation import deprecate_parameter @@ -181,7 +183,9 @@ def _selection_dist_based(self, # Compute the distance considering the farthest neighbour dist_avg_vec = np.sum(dist_vec[:, -self.nn_.n_neighbors:], axis=1) - if dist_vec.shape[0] != X[y == key].shape[0]: + target_class_indices = np.flatnonzero(y == key) + if (dist_vec.shape[0] != safe_indexing(X, + target_class_indices).shape[0]): raise RuntimeError('The samples to be selected do not correspond' ' to the distance matrix given. 
Ensure that' ' both `X[y == key]` and `dist_vec` are' @@ -257,21 +261,20 @@ def _sample(self, X, y): """ self._validate_estimator() - X_resampled = np.empty((0, X.shape[1]), dtype=X.dtype) - y_resampled = np.empty((0, ), dtype=y.dtype) - if self.return_indices: - idx_under = np.empty((0, ), dtype=int) + idx_under = np.empty((0, ), dtype=int) target_stats = Counter(y) class_minority = min(target_stats, key=target_stats.get) + minority_class_indices = np.flatnonzero(y == class_minority) - self.nn_.fit(X[y == class_minority]) + self.nn_.fit(safe_indexing(X, minority_class_indices)) for target_class in np.unique(y): if target_class in self.ratio_.keys(): n_samples = self.ratio_[target_class] - X_class = X[y == target_class] - y_class = y[y == target_class] + target_class_indices = np.flatnonzero(y == target_class) + X_class = safe_indexing(X, target_class_indices) + y_class = safe_indexing(y, target_class_indices) if self.version == 1: dist_vec, idx_vec = self.nn_.kneighbors( @@ -288,10 +291,10 @@ def _sample(self, X, y): elif self.version == 3: self.nn_ver3_.fit(X_class) dist_vec, idx_vec = self.nn_ver3_.kneighbors( - X[y == class_minority]) + safe_indexing(X, minority_class_indices)) idx_vec_farthest = np.unique(idx_vec.reshape(-1)) - X_class_selected = X_class[idx_vec_farthest, :] - y_class_selected = y_class[idx_vec_farthest] + X_class_selected = safe_indexing(X_class, idx_vec_farthest) + y_class_selected = safe_indexing(y_class, idx_vec_farthest) dist_vec, idx_vec = self.nn_.kneighbors( X_class_selected, n_neighbors=self.nn_.n_neighbors) @@ -304,18 +307,12 @@ def _sample(self, X, y): else: index_target_class = slice(None) - X_resampled = np.concatenate( - (X_resampled, X[y == target_class][index_target_class]), - axis=0) - y_resampled = np.concatenate( - (y_resampled, y[y == target_class][index_target_class]), - axis=0) - if self.return_indices: - idx_under = np.concatenate( - (idx_under, np.flatnonzero(y == target_class)[ - index_target_class]), axis=0) + idx_under = np.concatenate( + (idx_under, np.flatnonzero(y == target_class)[ + index_target_class]), axis=0) if self.return_indices: - return X_resampled, y_resampled, idx_under + return (safe_indexing(X, idx_under), safe_indexing(y, idx_under), + idx_under) else: - return X_resampled, y_resampled + return safe_indexing(X, idx_under), safe_indexing(y, idx_under) From bba7835ce56b3b8118825d6c9dff48373b1d6501 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 13 Aug 2017 19:35:37 +0200 Subject: [PATCH 06/28] EHN support sparse matrices for NCR --- .../neighbourhood_cleaning_rule.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/imblearn/under_sampling/prototype_selection/neighbourhood_cleaning_rule.py b/imblearn/under_sampling/prototype_selection/neighbourhood_cleaning_rule.py index 45d19e34b..29ac7b6cf 100644 --- a/imblearn/under_sampling/prototype_selection/neighbourhood_cleaning_rule.py +++ b/imblearn/under_sampling/prototype_selection/neighbourhood_cleaning_rule.py @@ -11,6 +11,8 @@ import numpy as np from scipy.stats import mode +from sklearn.utils import safe_indexing + from ..base import BaseCleaningSampler from .edited_nearest_neighbours import EditedNearestNeighbours from ...utils import check_neighbors_object @@ -187,8 +189,9 @@ def _sample(self, X, y): (n_samples > X.shape[0] * self.threshold_cleaning))] self.nn_.fit(X) - X_class = X[y == class_minority] - y_class = y[y == class_minority] + class_minority_indices = y == class_minority + X_class = safe_indexing(X, 
class_minority_indices) + y_class = safe_indexing(y, class_minority_indices) nnhood_idx = self.nn_.kneighbors( X_class, return_distance=False)[:, 1:] nnhood_label = y[nnhood_idx] @@ -210,6 +213,15 @@ def _sample(self, X, y): selected_samples[union_a1_a2] = False index_target_class = np.flatnonzero(selected_samples) + if self.return_indices: + return (safe_indexing(X, index_target_class), + safe_indexing(y, index_target_class), + index_target_class) + else: + return (safe_indexing(X, index_target_class), + safe_indexing(y, index_target_class)) + + if self.return_indices: return (X[index_target_class], y[index_target_class], index_target_class) From 9cd917b7b606d980d3cabae92fd97806c7cc8bde Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 13 Aug 2017 20:14:33 +0200 Subject: [PATCH 07/28] EHN support sparse Tomek and OSS --- .../neighbourhood_cleaning_rule.py | 2 +- .../one_sided_selection.py | 73 +++++++------------ .../prototype_selection/tomek_links.py | 10 ++- 3 files changed, 36 insertions(+), 49 deletions(-) diff --git a/imblearn/under_sampling/prototype_selection/neighbourhood_cleaning_rule.py b/imblearn/under_sampling/prototype_selection/neighbourhood_cleaning_rule.py index 29ac7b6cf..e228d6794 100644 --- a/imblearn/under_sampling/prototype_selection/neighbourhood_cleaning_rule.py +++ b/imblearn/under_sampling/prototype_selection/neighbourhood_cleaning_rule.py @@ -189,7 +189,7 @@ def _sample(self, X, y): (n_samples > X.shape[0] * self.threshold_cleaning))] self.nn_.fit(X) - class_minority_indices = y == class_minority + class_minority_indices = np.flatnonzero(y == class_minority) X_class = safe_indexing(X, class_minority_indices) y_class = safe_indexing(y, class_minority_indices) nnhood_idx = self.nn_.kneighbors( diff --git a/imblearn/under_sampling/prototype_selection/one_sided_selection.py b/imblearn/under_sampling/prototype_selection/one_sided_selection.py index 1545300a4..698d634aa 100644 --- a/imblearn/under_sampling/prototype_selection/one_sided_selection.py +++ b/imblearn/under_sampling/prototype_selection/one_sided_selection.py @@ -10,7 +10,7 @@ import numpy as np from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors -from sklearn.utils import check_random_state +from sklearn.utils import check_random_state, safe_indexing from ..base import BaseCleaningSampler from .tomek_links import TomekLinks @@ -174,10 +174,7 @@ def _sample(self, X, y): target_stats = Counter(y) class_minority = min(target_stats, key=target_stats.get) - X_resampled = np.empty((0, X.shape[1]), dtype=X.dtype) - y_resampled = np.empty((0, ), dtype=y.dtype) - if self.return_indices: - idx_under = np.empty((0, ), dtype=int) + idx_under = np.empty((0, ), dtype=int) for target_class in np.unique(y): if target_class in self.ratio_.keys(): @@ -186,56 +183,42 @@ def _sample(self, X, y): idx_maj_sample = idx_maj[random_state.randint( low=0, high=target_stats[target_class], size=self.n_seeds_S)] - maj_sample = X[idx_maj_sample] + + minority_class_indices = np.flatnonzero(y == class_minority) + C_indices = np.append(minority_class_indices, idx_maj_sample) # create the set composed of all minority samples and one # sample from the current class. 
- C_x = np.append(X[y == class_minority], maj_sample, axis=0) - C_y = np.append(y[y == class_minority], [target_class] * - self.n_seeds_S) + C_x = safe_indexing(X, C_indices) + C_y = safe_indexing(y, C_indices) # create the set S with removing the seed from S # since that it will be added anyway idx_maj_extracted = np.delete(idx_maj, idx_maj_sample, axis=0) - S_x = X[idx_maj_extracted] - S_y = y[idx_maj_extracted] + S_x = safe_indexing(X, idx_maj_extracted) + S_y = safe_indexing(y, idx_maj_extracted) self.estimator_.fit(C_x, C_y) pred_S_y = self.estimator_.predict(S_x) - sel_x = S_x[np.flatnonzero(pred_S_y != S_y), :] - sel_y = S_y[np.flatnonzero(pred_S_y != S_y)] - if self.return_indices: - idx_tmp = idx_maj_extracted[ - np.flatnonzero(pred_S_y != S_y)] - idx_under = np.concatenate( - (idx_under, idx_maj_sample, idx_tmp), axis=0) - X_resampled = np.concatenate( - (X_resampled, maj_sample, sel_x), axis=0) - y_resampled = np.concatenate( - (y_resampled, [target_class] * self.n_seeds_S, sel_y), - axis=0) + S_misclassified_indices = np.flatnonzero(pred_S_y != S_y) + idx_tmp = idx_maj_extracted[S_misclassified_indices] + idx_under = np.concatenate( + (idx_under, idx_maj_sample, idx_tmp), axis=0) else: - X_resampled = np.concatenate( - (X_resampled, X[y == target_class]), axis=0) - y_resampled = np.concatenate( - (y_resampled, y[y == target_class]), axis=0) - if self.return_indices: - idx_under = np.concatenate( - (idx_under, np.flatnonzero(y == target_class)), axis=0) - - # find the nearest neighbour of every point - nn = NearestNeighbors(n_neighbors=2, n_jobs=self.n_jobs) - nn.fit(X_resampled) - nns = nn.kneighbors(X_resampled, return_distance=False)[:, 1] - - links = TomekLinks.is_tomek(y_resampled, nns, - [c for c in np.unique(y) - if (c != class_minority and - c in self.ratio_.keys())]) + idx_under = np.concatenate( + (idx_under, np.flatnonzero(y == target_class)), axis=0) + + X_resampled = safe_indexing(X, idx_under) + y_resampled = safe_indexing(y, idx_under) + + # apply Tomek cleaning + tl = TomekLinks(ratio='not minority', return_indices=True, + random_state=self.random_state) + X_cleaned, y_cleaned, idx_cleaned = tl.fit_sample(X_resampled, + y_resampled) + + idx_under = safe_indexing(idx_under, idx_cleaned) if self.return_indices: - return (X_resampled[np.logical_not(links)], - y_resampled[np.logical_not(links)], - idx_under[np.logical_not(links)]) + return (X_cleaned, y_cleaned, idx_under) else: - return (X_resampled[np.logical_not(links)], - y_resampled[np.logical_not(links)]) + return X_cleaned, y_cleaned diff --git a/imblearn/under_sampling/prototype_selection/tomek_links.py b/imblearn/under_sampling/prototype_selection/tomek_links.py index dba47ccb9..8d8a50067 100644 --- a/imblearn/under_sampling/prototype_selection/tomek_links.py +++ b/imblearn/under_sampling/prototype_selection/tomek_links.py @@ -9,6 +9,7 @@ import numpy as np from sklearn.neighbors import NearestNeighbors +from sklearn.utils import safe_indexing from ..base import BaseCleaningSampler @@ -169,9 +170,12 @@ def _sample(self, X, y): nns = nn.kneighbors(X, return_distance=False)[:, 1] links = self.is_tomek(y, nns, self.ratio_) + idx_under = np.flatnonzero(np.logical_not(links)) if self.return_indices: - return (X[np.logical_not(links)], y[np.logical_not(links)], - np.flatnonzero(np.logical_not(links))) + return (safe_indexing(X, idx_under), + safe_indexing(y, idx_under), + idx_under) else: - return X[np.logical_not(links)], y[np.logical_not(links)] + return (safe_indexing(X, idx_under), + safe_indexing(y, 
idx_under)) From c3ba30752338d0a348cb2b5ac2d4beac1318d8e5 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 13 Aug 2017 21:31:23 +0200 Subject: [PATCH 08/28] EHN support sparsity for CNN --- .../condensed_nearest_neighbour.py | 79 ++++++++----------- .../neighbourhood_cleaning_rule.py | 10 +-- .../one_sided_selection.py | 2 +- 3 files changed, 33 insertions(+), 58 deletions(-) diff --git a/imblearn/under_sampling/prototype_selection/condensed_nearest_neighbour.py b/imblearn/under_sampling/prototype_selection/condensed_nearest_neighbour.py index f7115176f..c3f743647 100644 --- a/imblearn/under_sampling/prototype_selection/condensed_nearest_neighbour.py +++ b/imblearn/under_sampling/prototype_selection/condensed_nearest_neighbour.py @@ -10,8 +10,11 @@ from collections import Counter import numpy as np + +from scipy.sparse import issparse + from sklearn.neighbors import KNeighborsClassifier -from sklearn.utils import check_random_state +from sklearn.utils import check_random_state, safe_indexing from ..base import BaseCleaningSampler from ...utils.deprecation import deprecate_parameter @@ -179,29 +182,27 @@ def _sample(self, X, y): random_state = check_random_state(self.random_state) target_stats = Counter(y) class_minority = min(target_stats, key=target_stats.get) - - X_resampled = np.empty((0, X.shape[1]), dtype=X.dtype) - y_resampled = np.empty((0, ), dtype=y.dtype) - if self.return_indices: - idx_under = np.empty((0, ), dtype=int) + idx_under = np.empty((0, ), dtype=int) for target_class in np.unique(y): if target_class in self.ratio_.keys(): # Randomly get one sample from the majority class # Generate the index to select - idx_maj_sample = random_state.randint( - low=0, high=target_stats[target_class], - size=self.n_seeds_S) - maj_sample = X[y == target_class][idx_maj_sample] + idx_maj = np.flatnonzero(y == target_class) + idx_maj_sample = idx_maj[random_state.randint( + low=0, high=target_stats[target_class], + size=self.n_seeds_S)] # Create the set C - One majority samples and all minority - C_x = np.append(X[y == class_minority], maj_sample, axis=0) - C_y = np.append(y[y == class_minority], - np.array([target_class] * self.n_seeds_S)) + C_indices = np.append(np.flatnonzero(y == class_minority), + idx_maj_sample) + C_x = safe_indexing(X, C_indices) + C_y = safe_indexing(y, C_indices) # Create the set S - all majority samples - S_x = X[y == target_class] - S_y = y[y == target_class] + S_indices = np.flatnonzero(y == target_class) + S_x = safe_indexing(X, S_indices) + S_y = safe_indexing(y, S_indices) # fit knn on C self.estimator_.fit(C_x, C_y) @@ -215,21 +216,21 @@ def _sample(self, X, y): continue # Classify on S - pred_y = self.estimator_.predict(x_sam.reshape(1, -1)) + if not issparse(x_sam): + x_sam = x_sam.reshape(1, -1) + pred_y = self.estimator_.predict(x_sam) # If the prediction do not agree with the true label # append it in C_x if y_sam != pred_y: # Keep the index for later - idx_maj_sample = np.append(idx_maj_sample, idx_sam) + idx_maj_sample = np.append(idx_maj_sample, + idx_maj[idx_sam]) # Update C - C_x = np.append(X[y == class_minority], - X[y == target_class][idx_maj_sample], - axis=0) - C_y = np.append(y[y == class_minority], - np.array([target_class] * - idx_maj_sample.size)) + C_indices = np.append(C_indices, idx_maj[idx_sam]) + C_x = safe_indexing(X, C_indices) + C_y = safe_indexing(y, C_indices) # fit a knn on C self.estimator_.fit(C_x, C_y) @@ -242,32 +243,14 @@ def _sample(self, X, y): np.append(idx_maj_sample, np.flatnonzero(pred_S_y == S_y))) - # 
Find the misclassified S_y - sel_x = S_x[idx_maj_sample, :] - sel_y = S_y[idx_maj_sample] - - # The indexes found are relative to the current class, we need - # to find the absolute value Build the array with the absolute - # position - abs_pos = np.flatnonzero(y == target_class) - idx_maj_sample = abs_pos[idx_maj_sample] - - # If we need to offer support for the indices selected - if self.return_indices: - idx_under = np.concatenate((idx_under, idx_maj_sample), - axis=0) - X_resampled = np.concatenate((X_resampled, sel_x), axis=0) - y_resampled = np.concatenate((y_resampled, sel_y), axis=0) + idx_under = np.concatenate((idx_under, idx_maj_sample), + axis=0) else: - X_resampled = np.concatenate( - (X_resampled, X[y == target_class]), axis=0) - y_resampled = np.concatenate( - (y_resampled, y[y == target_class]), axis=0) - if self.return_indices: - idx_under = np.concatenate( - (idx_under, np.flatnonzero(y == target_class)), axis=0) + idx_under = np.concatenate( + (idx_under, np.flatnonzero(y == target_class)), axis=0) if self.return_indices: - return X_resampled, y_resampled, idx_under + return (safe_indexing(X, idx_under), safe_indexing(y, idx_under), + idx_under) else: - return X_resampled, y_resampled + return safe_indexing(X, idx_under), safe_indexing(y, idx_under) diff --git a/imblearn/under_sampling/prototype_selection/neighbourhood_cleaning_rule.py b/imblearn/under_sampling/prototype_selection/neighbourhood_cleaning_rule.py index e228d6794..7bd31bb71 100644 --- a/imblearn/under_sampling/prototype_selection/neighbourhood_cleaning_rule.py +++ b/imblearn/under_sampling/prototype_selection/neighbourhood_cleaning_rule.py @@ -15,7 +15,7 @@ from ..base import BaseCleaningSampler from .edited_nearest_neighbours import EditedNearestNeighbours -from ...utils import check_neighbors_object +from ...utils import check_neighbors_object, check_ratio SEL_KIND = ('all', 'mode') @@ -168,7 +168,6 @@ def _sample(self, X, y): """ self._validate_estimator() - enn = EditedNearestNeighbours(ratio=self.ratio, return_indices=True, random_state=self.random_state, size_ngh=self.size_ngh, @@ -220,10 +219,3 @@ def _sample(self, X, y): else: return (safe_indexing(X, index_target_class), safe_indexing(y, index_target_class)) - - - if self.return_indices: - return (X[index_target_class], y[index_target_class], - index_target_class) - else: - return X[index_target_class], y[index_target_class] diff --git a/imblearn/under_sampling/prototype_selection/one_sided_selection.py b/imblearn/under_sampling/prototype_selection/one_sided_selection.py index 698d634aa..ebba708ab 100644 --- a/imblearn/under_sampling/prototype_selection/one_sided_selection.py +++ b/imblearn/under_sampling/prototype_selection/one_sided_selection.py @@ -212,7 +212,7 @@ def _sample(self, X, y): y_resampled = safe_indexing(y, idx_under) # apply Tomek cleaning - tl = TomekLinks(ratio='not minority', return_indices=True, + tl = TomekLinks(ratio=self.ratio_, return_indices=True, random_state=self.random_state) X_cleaned, y_cleaned, idx_cleaned = tl.fit_sample(X_resampled, y_resampled) From d195868e3139b89a0327318eb44403cb03bd1ea9 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 13 Aug 2017 23:42:09 +0200 Subject: [PATCH 09/28] EHN support sparse for SMOTE --- imblearn/over_sampling/base.py | 80 +++++------ imblearn/over_sampling/random_over_sampler.py | 20 ++- imblearn/over_sampling/smote.py | 128 ++++++++++++------ 3 files changed, 138 insertions(+), 90 deletions(-) diff --git a/imblearn/over_sampling/base.py b/imblearn/over_sampling/base.py 
index 175ee4e47..50efc6e74 100644 --- a/imblearn/over_sampling/base.py +++ b/imblearn/over_sampling/base.py @@ -19,58 +19,58 @@ class BaseOverSampler(BaseSampler): _sampling_type = 'over-sampling' - def fit(self, X, y): - """Find the classes statistics before to perform sampling. + # def fit(self, X, y): + # """Find the classes statistics before to perform sampling. - Parameters - ---------- - X : array-like, shape (n_samples, n_features) - Matrix containing the data which have to be sampled. + # Parameters + # ---------- + # X : array-like, shape (n_samples, n_features) + # Matrix containing the data which have to be sampled. - y : array-like, shape (n_samples,) - Corresponding label for each sample in X. + # y : array-like, shape (n_samples,) + # Corresponding label for each sample in X. - Returns - ------- - self : object, - Return self. + # Returns + # ------- + # self : object, + # Return self. - Notes - ----- - Over-samplers do not accept sparse matrices. + # Notes + # ----- + # Over-samplers do not accept sparse matrices. - """ - # over-sampling method does not handle sparse matrix - X, y = check_X_y(X, y) + # """ + # # over-sampling method does not handle sparse matrix + # X, y = check_X_y(X, y) - return super(BaseOverSampler, self).fit(X, y) + # return super(BaseOverSampler, self).fit(X, y) - def sample(self, X, y): - """Resample the dataset. + # def sample(self, X, y): + # """Resample the dataset. - Parameters - ---------- - X : array-like, shape (n_samples, n_features) - Matrix containing the data which have to be sampled. + # Parameters + # ---------- + # X : array-like, shape (n_samples, n_features) + # Matrix containing the data which have to be sampled. - y : array-like, shape (n_samples,) - Corresponding label for each sample in X. + # y : array-like, shape (n_samples,) + # Corresponding label for each sample in X. - Returns - ------- - X_resampled : array-like, shape (n_samples_new, n_features) - The array containing the resampled data. + # Returns + # ------- + # X_resampled : array-like, shape (n_samples_new, n_features) + # The array containing the resampled data. - y_resampled : array-like, shape (n_samples_new,) - The corresponding label of `X_resampled` + # y_resampled : array-like, shape (n_samples_new,) + # The corresponding label of `X_resampled` - Notes - ----- - Over-samplers do not accept sparse matrices. + # Notes + # ----- + # Over-samplers do not accept sparse matrices. 
- """ + # """ - # Check the consistency of X and y - X, y = check_X_y(X, y) + # # Check the consistency of X and y + # X, y = check_X_y(X, y) - return super(BaseOverSampler, self).sample(X, y) + # return super(BaseOverSampler, self).sample(X, y) diff --git a/imblearn/over_sampling/random_over_sampler.py b/imblearn/over_sampling/random_over_sampler.py index 9b164eee7..d70b45d32 100644 --- a/imblearn/over_sampling/random_over_sampler.py +++ b/imblearn/over_sampling/random_over_sampler.py @@ -8,7 +8,7 @@ from collections import Counter import numpy as np -from sklearn.utils import check_random_state +from sklearn.utils import check_random_state, safe_indexing from .base import BaseOverSampler @@ -102,19 +102,15 @@ def _sample(self, X, y): random_state = check_random_state(self.random_state) target_stats = Counter(y) - X_resampled = X.copy() - y_resampled = y.copy() + sample_indices = range(X.shape[0]) for class_sample, num_samples in self.ratio_.items(): - index_samples = random_state.randint( + target_class_indices = np.flatnonzero(y == class_sample) + indices = random_state.randint( low=0, high=target_stats[class_sample], size=num_samples) - X_resampled = np.concatenate((X_resampled, - X[y == class_sample][index_samples]), - axis=0) + sample_indices = np.append(sample_indices, + target_class_indices[indices]) - y_resampled = np.concatenate((y_resampled, - y[y == class_sample][index_samples]), - axis=0) - - return X_resampled, y_resampled + return (safe_indexing(X, sample_indices), + safe_indexing(y, sample_indices)) diff --git a/imblearn/over_sampling/smote.py b/imblearn/over_sampling/smote.py index 7902d178d..c1a8d7477 100644 --- a/imblearn/over_sampling/smote.py +++ b/imblearn/over_sampling/smote.py @@ -8,8 +8,11 @@ from __future__ import division import numpy as np + +from scipy import sparse + from sklearn.svm import SVC -from sklearn.utils import check_random_state +from sklearn.utils import check_random_state, safe_indexing from .base import BaseOverSampler from ..exceptions import raise_isinstance_error @@ -253,18 +256,34 @@ def _make_samples(self, """ random_state = check_random_state(self.random_state) - X_new = np.zeros((n_samples, X.shape[1])) - samples = random_state.randint( + samples_indices = random_state.randint( low=0, high=len(nn_num.flatten()), size=n_samples) steps = step_size * random_state.uniform(size=n_samples) - rows = np.floor_divide(samples, nn_num.shape[1]) - cols = np.mod(samples, nn_num.shape[1]) - for i, (sample, row, col, step) in enumerate(zip(samples, rows, - cols, steps)): - X_new[i] = X[row] - step * (X[row] - nn_data[nn_num[row, col]]) - y_new = np.array([y_type] * len(X_new)) + rows = np.floor_divide(samples_indices, nn_num.shape[1]) + cols = np.mod(samples_indices, nn_num.shape[1]) + + if sparse.issparse(X): + row_indices, col_indices, samples = [], [], [] + for i, (row, col, step) in enumerate(zip(rows, cols, steps)): + if X[row].nnz: + sample = X[row] - step * (X[row] - + nn_data[nn_num[row, col]]) + row_indices += [i] * len(sample.indices) + col_indices += sample.indices.tolist() + samples += sample.data.tolist() + else: + X_new = np.zeros((n_samples, X.shape[1])) + for i, (row, col, step) in enumerate(zip(rows, cols, steps)): + X_new[i] = X[row] - step * (X[row] - nn_data[nn_num[row, col]]) - return X_new, y_new + y_new = np.array([y_type] * len(samples_indices)) + + if sparse.issparse(X): + return (sparse.csr_matrix((samples, (row_indices, col_indices)), + [len(samples_indices), X.shape[1]]), + y_new) + else: + return X_new, y_new def 
_validate_estimator(self): """Create the necessary objects for SMOTE.""" @@ -326,21 +345,26 @@ def _sample_regular(self, X, y): intelligence research, 321-357, 2002. """ + X_resampled = X.copy() y_resampled = y.copy() for class_sample, n_samples in self.ratio_.items(): if n_samples == 0: continue - X_class = X[y == class_sample] + target_class_indices = np.flatnonzero(y == class_sample) + X_class = safe_indexing(X, target_class_indices) self.nn_k_.fit(X_class) nns = self.nn_k_.kneighbors(X_class, return_distance=False)[:, 1:] X_new, y_new = self._make_samples(X_class, class_sample, X_class, nns, n_samples, 1.0) - X_resampled = np.concatenate((X_resampled, X_new), axis=0) - y_resampled = np.concatenate((y_resampled, y_new), axis=0) + if sparse.issparse(X_new): + X_resampled = sparse.vstack([X_resampled, X_new]) + else: + X_resampled = np.vstack((X_resampled, X_new)) + y_resampled = np.hstack((y_resampled, y_new)) return X_resampled, y_resampled @@ -381,7 +405,8 @@ def _sample_borderline(self, X, y): for class_sample, n_samples in self.ratio_.items(): if n_samples == 0: continue - X_class = X[y == class_sample] + target_class_indices = np.flatnonzero(y == class_sample) + X_class = safe_indexing(X, target_class_indices) self.nn_m_.fit(X) danger_index = self._in_danger_noise(X_class, class_sample, y, @@ -391,16 +416,21 @@ def _sample_borderline(self, X, y): self.nn_k_.fit(X_class) nns = self.nn_k_.kneighbors( - X_class[danger_index], return_distance=False)[:, 1:] + safe_indexing(X_class, danger_index), + return_distance=False)[:, 1:] # divergence between borderline-1 and borderline-2 if self.kind == 'borderline1': # Create synthetic samples for borderline points. - X_new, y_new = self._make_samples(X_class[danger_index], + X_new, y_new = self._make_samples(safe_indexing(X_class, + danger_index), class_sample, X_class, nns, n_samples) - X_resampled = np.concatenate((X_resampled, X_new), axis=0) - y_resampled = np.concatenate((y_resampled, y_new), axis=0) + if sparse.issparse(X_new): + X_resampled = sparse.vstack([X_resampled, X_new]) + else: + X_resampled = np.vstack((X_resampled, X_new)) + y_resampled = np.hstack((y_resampled, y_new)) else: random_state = check_random_state(self.random_state) @@ -408,22 +438,26 @@ def _sample_borderline(self, X, y): # only minority X_new_1, y_new_1 = self._make_samples( - X_class[danger_index], class_sample, X_class, nns, + safe_indexing(X_class, danger_index), class_sample, + X_class, nns, int(fractions * (n_samples + 1)), step_size=1.) # we use a one-vs-rest policy to handle the multiclass in which # new samples will be created considering not only the majority # class but all over classes. 
X_new_2, y_new_2 = self._make_samples( - X_class[danger_index], class_sample, X[y != class_sample], + safe_indexing(X_class, danger_index), class_sample, + safe_indexing(X, np.flatnonzero(y != class_sample)), nns, int((1 - fractions) * n_samples), step_size=0.5) - # Concatenate the newly generated samples to the original - # data set - X_resampled = np.concatenate((X_resampled, X_new_1, X_new_2), - axis=0) - y_resampled = np.concatenate((y_resampled, y_new_1, y_new_2), - axis=0) + if sparse.issparse(X_resampled): + X_resampled = sparse.vstack([X_resampled, + X_new_1, X_new_2]) + else: + X_resampled = np.vstack((X_resampled, + X_new_1, X_new_2)) + y_resampled = np.hstack((y_resampled, + y_new_1, y_new_2)) return X_resampled, y_resampled @@ -463,17 +497,20 @@ def _sample_svm(self, X, y): for class_sample, n_samples in self.ratio_.items(): if n_samples == 0: continue - X_class = X[y == class_sample] + target_class_indices = np.flatnonzero(y == class_sample) + X_class = safe_indexing(X, target_class_indices) self.svm_estimator_.fit(X, y) support_index = self.svm_estimator_.support_[ y[self.svm_estimator_.support_] == class_sample] - support_vector = X[support_index] + support_vector = safe_indexing(X, support_index) self.nn_m_.fit(X) noise_bool = self._in_danger_noise(support_vector, class_sample, y, kind='noise') - support_vector = support_vector[np.logical_not(noise_bool)] + support_vector = safe_indexing( + support_vector, + np.flatnonzero(np.logical_not(noise_bool))) danger_bool = self._in_danger_noise(support_vector, class_sample, y, kind='danger') safety_bool = np.logical_not(danger_bool) @@ -481,33 +518,48 @@ def _sample_svm(self, X, y): self.nn_k_.fit(X_class) fractions = random_state.beta(10, 10) if np.count_nonzero(danger_bool) > 0: - nns = self.nn_k_.kneighbors(support_vector[danger_bool], + nns = self.nn_k_.kneighbors(safe_indexing( + support_vector, + np.flatnonzero(danger_bool)), return_distance=False)[:, 1:] X_new_1, y_new_1 = self._make_samples( - support_vector[danger_bool], class_sample, X_class, + safe_indexing(support_vector, np.flatnonzero(danger_bool)), + class_sample, X_class, nns, int(fractions * (n_samples + 1)), step_size=1.) 
if np.count_nonzero(safety_bool) > 0: - nns = self.nn_k_.kneighbors(support_vector[safety_bool], - return_distance=False)[:, 1:] + nns = self.nn_k_.kneighbors( + safe_indexing(support_vector, np.flatnonzero(safety_bool)), + return_distance=False)[:, 1:] X_new_2, y_new_2 = self._make_samples( - support_vector[safety_bool], class_sample, X_class, + safe_indexing(support_vector, np.flatnonzero(safety_bool)), + class_sample, X_class, nns, int((1 - fractions) * n_samples), step_size=-self.out_step) if (np.count_nonzero(danger_bool) > 0 and np.count_nonzero(safety_bool) > 0): - X_resampled = np.concatenate((X_resampled, X_new_1, X_new_2), - axis=0) + if sparse.issparse(X_resampled): + X_resampled = sparse.vstack([X_resampled, + X_new_1, X_new_2]) + else: + X_resampled = np.vstack((X_resampled, + X_new_1, X_new_2)) y_resampled = np.concatenate((y_resampled, y_new_1, y_new_2), axis=0) elif np.count_nonzero(danger_bool) == 0: - X_resampled = np.concatenate((X_resampled, X_new_2), axis=0) + if sparse.issparse(X_resampled): + X_resampled = sparse.vstack([X_resampled, X_new_2]) + else: + X_resampled = np.vstack((X_resampled, X_new_2)) y_resampled = np.concatenate((y_resampled, y_new_2), axis=0) elif np.count_nonzero(safety_bool) == 0: - X_resampled = np.concatenate((X_resampled, X_new_1), axis=0) + if sparse.issparse(X_resampled): + X_resampled = sparse.vstack([X_resampled, X_new_1]) + else: + X_resampled = np.vstack((X_resampled, X_new_1)) y_resampled = np.concatenate((y_resampled, y_new_1), axis=0) return X_resampled, y_resampled From bcf44ab3e3d22a3660ba59e50ec2e7933b8036bb Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 14 Aug 2017 00:12:47 +0200 Subject: [PATCH 10/28] EHN support sparse adasyn --- imblearn/over_sampling/adasyn.py | 69 +++++++++++++++------ imblearn/over_sampling/tests/test_adasyn.py | 29 +-------- 2 files changed, 54 insertions(+), 44 deletions(-) diff --git a/imblearn/over_sampling/adasyn.py b/imblearn/over_sampling/adasyn.py index 3f16d0d53..e5c439b61 100644 --- a/imblearn/over_sampling/adasyn.py +++ b/imblearn/over_sampling/adasyn.py @@ -7,7 +7,9 @@ from __future__ import division import numpy as np -from sklearn.utils import check_random_state +from scipy import sparse + +from sklearn.utils import check_random_state, safe_indexing from .base import BaseOverSampler from ..utils import check_neighbors_object @@ -154,7 +156,8 @@ def _sample(self, X, y): for class_sample, n_samples in self.ratio_.items(): if n_samples == 0: continue - X_class = X[y == class_sample] + target_class_indices = np.flatnonzero(y == class_sample) + X_class = safe_indexing(X, target_class_indices) self.nn_.fit(X) _, nn_index = self.nn_.kneighbors(X_class) @@ -171,27 +174,57 @@ def _sample(self, X, y): ' Use SMOTE instead.') ratio_nn /= np.sum(ratio_nn) n_samples_generate = np.rint(ratio_nn * n_samples).astype(int) + if not np.sum(n_samples_generate): + raise ValueError("No samples will be generated with the" + " provided ratio settings.") # the nearest neighbors need to be fitted only on the current class # to find the class NN to generate new samples self.nn_.fit(X_class) _, nn_index = self.nn_.kneighbors(X_class) - x_class_gen = [] - for x_i, x_i_nn, num_sample_i in zip(X_class, nn_index, - n_samples_generate): - if num_sample_i == 0: - continue - nn_zs = random_state.randint( - 1, high=self.nn_.n_neighbors, size=num_sample_i) - steps = random_state.uniform(size=len(nn_zs)) - x_class_gen.append([x_i + step * (X[x_i_nn[nn_z], :] - x_i) - for step, nn_z in zip(steps, nn_zs)]) - - if 
len(x_class_gen) > 0: - X_resampled = np.vstack((X_resampled, - np.concatenate(x_class_gen))) - y_resampled = np.hstack((y_resampled, [class_sample] * - np.sum(n_samples_generate))) + if sparse.issparse(X): + row_indices, col_indices, samples = [], [], [] + n_samples_generated = 0 + for x_i, x_i_nn, num_sample_i in zip(X_class, nn_index, + n_samples_generate): + if num_sample_i == 0: + continue + nn_zs = random_state.randint( + 1, high=self.nn_.n_neighbors, size=num_sample_i) + steps = random_state.uniform(size=len(nn_zs)) + if x_i.nnz: + for step, nn_z in zip(steps, nn_zs): + sample = x_i + step * (X[x_i_nn[nn_z], :] - x_i) + row_indices += ([n_samples_generated] * + len(sample.indices)) + col_indices += sample.indices.tolist() + samples += sample.data.tolist() + n_samples_generated += 1 + X_new = (sparse.csr_matrix((samples, + (row_indices, col_indices)), + [np.sum(n_samples_generate), + X.shape[1]])) + y_new = np.array([class_sample] * np.sum(n_samples_generate)) + else: + x_class_gen = [] + for x_i, x_i_nn, num_sample_i in zip(X_class, nn_index, + n_samples_generate): + if num_sample_i == 0: + continue + nn_zs = random_state.randint( + 1, high=self.nn_.n_neighbors, size=num_sample_i) + steps = random_state.uniform(size=len(nn_zs)) + x_class_gen.append([x_i + step * (X[x_i_nn[nn_z], :] - x_i) + for step, nn_z in zip(steps, nn_zs)]) + + X_new = np.concatenate(x_class_gen) + y_new = np.array([class_sample] * np.sum(n_samples_generate)) + + if sparse.issparse(X_new): + X_resampled = sparse.vstack([X_resampled, X_new]) + else: + X_resampled = np.vstack((X_resampled, X_new)) + y_resampled = np.hstack((y_resampled, y_new)) return X_resampled, y_resampled diff --git a/imblearn/over_sampling/tests/test_adasyn.py b/imblearn/over_sampling/tests/test_adasyn.py index eb68dd06c..663c60af2 100644 --- a/imblearn/over_sampling/tests/test_adasyn.py +++ b/imblearn/over_sampling/tests/test_adasyn.py @@ -73,34 +73,11 @@ def test_ada_fit_sample(): assert_array_equal(y_resampled, y_gt) -def test_ada_fit_sample_half(): +def test_ada_fit_ratio_error(): ratio = 0.8 ada = ADASYN(ratio=ratio, random_state=RND_SEED) - X_resampled, y_resampled = ada.fit_sample(X, Y) - X_gt = np.array([[0.11622591, -0.0317206], - [0.77481731, 0.60935141], - [1.25192108, -0.22367336], - [0.53366841, -0.30312976], - [1.52091956, -0.49283504], - [-0.28162401, -2.10400981], - [0.83680821, 1.72827342], - [0.3084254, 0.33299982], - [0.70472253, -0.73309052], - [0.28893132, -0.38761769], - [1.15514042, 0.0129463], - [0.88407872, 0.35454207], - [1.31301027, -0.92648734], - [-1.11515198, -0.93689695], - [-0.18410027, -0.45194484], - [0.9281014, 0.53085498], - [-0.14374509, 0.27370049], - [-0.41635887, -0.38299653], - [0.08711622, 0.93259929], - [1.70580611, -0.11219234]]) - y_gt = np.array( - [0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0]) - assert_allclose(X_resampled, X_gt, rtol=R_TOL) - assert_array_equal(y_resampled, y_gt) + assert_raises_regex(ValueError, "No samples will be generated.", + ada.fit_sample, X, Y) def test_ada_fit_sample_nn_obj(): From c405aa997c1bc299b7454e6a22cc9d760b216e28 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 14 Aug 2017 00:19:09 +0200 Subject: [PATCH 11/28] EHN support sparsity for sombine methods --- imblearn/combine/smote_enn.py | 2 +- imblearn/combine/smote_tomek.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/imblearn/combine/smote_enn.py b/imblearn/combine/smote_enn.py index 32ce1e49d..7a779e127 100644 --- a/imblearn/combine/smote_enn.py +++ 
b/imblearn/combine/smote_enn.py @@ -293,7 +293,7 @@ def fit(self, X, y): Return self. """ - X, y = check_X_y(X, y) + X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) y = check_target_type(y) self.ratio_ = self.ratio self.X_hash_, self.y_hash_ = hash_X_y(X, y) diff --git a/imblearn/combine/smote_tomek.py b/imblearn/combine/smote_tomek.py index 08c9f20fd..af55f8b4f 100644 --- a/imblearn/combine/smote_tomek.py +++ b/imblearn/combine/smote_tomek.py @@ -244,7 +244,7 @@ def fit(self, X, y): Return self. """ - X, y = check_X_y(X, y) + X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) y = check_target_type(y) self.ratio_ = self.ratio self.X_hash_, self.y_hash_ = hash_X_y(X, y) From 79637d78ec439ea69161320a80c96ad72bb04b0d Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 14 Aug 2017 01:48:22 +0200 Subject: [PATCH 12/28] EHN support sparsity BC --- imblearn/ensemble/balance_cascade.py | 60 ++++++++++------------------ 1 file changed, 21 insertions(+), 39 deletions(-) diff --git a/imblearn/ensemble/balance_cascade.py b/imblearn/ensemble/balance_cascade.py index f88c873ed..8adda9004 100644 --- a/imblearn/ensemble/balance_cascade.py +++ b/imblearn/ensemble/balance_cascade.py @@ -12,7 +12,7 @@ from sklearn.base import ClassifierMixin from sklearn.neighbors import KNeighborsClassifier -from sklearn.utils import check_random_state +from sklearn.utils import check_random_state, safe_indexing from sklearn.externals.six import string_types from sklearn.model_selection import cross_val_predict @@ -249,22 +249,16 @@ def _sample(self, X, y): samples_mask = np.ones(y.shape, dtype=bool) # where the different set will be stored - X_resampled = [] - y_resampled = [] idx_under = [] n_subsets = 0 b_subset_search = True while b_subset_search: - target_stats = Counter(y[samples_mask]) - # build the data set to be classified - X_subset = np.empty((0, X.shape[1]), dtype=X.dtype) - y_subset = np.empty((0, ), dtype=y.dtype) + target_stats = Counter(safe_indexing( + y, np.flatnonzero(samples_mask))) # store the index of the data to under-sample index_under_sample = np.empty((0, ), dtype=y.dtype) # value which will be picked at each round - X_constant = np.empty((0, X.shape[1]), dtype=X.dtype) - y_constant = np.empty((0, ), dtype=y.dtype) index_constant = np.empty((0, ), dtype=y.dtype) for target_class in target_stats.keys(): if target_class in self.ratio_.keys(): @@ -274,29 +268,15 @@ def _sample(self, X, y): index_class = np.flatnonzero(y == target_class) index_class_interest = index_class[samples_mask[ y == target_class]] - X_class = X[index_class_interest] - y_class = y[index_class_interest] + y_class = safe_indexing(y, index_class_interest) # select randomly the desired features index_target_class = random_state.choice( range(y_class.size), size=n_samples, replace=False) - X_subset = np.concatenate((X_subset, - X_class[index_target_class]), - axis=0) - y_subset = np.concatenate((y_subset, - y_class[index_target_class]), - axis=0) - # index of the data index_under_sample = np.concatenate( (index_under_sample, index_class_interest[index_target_class]), axis=0) else: - X_constant = np.concatenate((X_constant, - X[y == target_class]), - axis=0) - y_constant = np.concatenate((y_constant, - y[y == target_class]), - axis=0) index_constant = np.concatenate( (index_constant, np.flatnonzero(y == target_class)), @@ -304,23 +284,18 @@ def _sample(self, X, y): # store the set created n_subsets += 1 - X_resampled.append(np.concatenate((X_subset, X_constant), - axis=0)) - y_resampled.append(np.concatenate((y_subset, 
y_constant), - axis=0)) - idx_under.append(np.concatenate((index_under_sample, - index_constant), - axis=0)) + subset_indices = np.concatenate((index_under_sample, + index_constant), axis=0) + idx_under.append(subset_indices) # fit and predict using cross validation - pred = cross_val_predict(self.estimator_, - np.concatenate((X_subset, X_constant), - axis=0), - np.concatenate((y_subset, y_constant), - axis=0)) + X_subset = safe_indexing(X, subset_indices) + y_subset = safe_indexing(y, subset_indices) + pred = cross_val_predict(self.estimator_, X_subset, y_subset) # extract the prediction about the targeted classes only - pred_target = pred[:y_subset.size] - index_classified = index_under_sample[pred_target == y_subset] + pred_target = pred[:index_under_sample.size] + index_classified = index_under_sample[ + pred_target == y_subset[:index_under_sample.size]] samples_mask[index_classified] = False # check the stopping criterion @@ -328,11 +303,18 @@ def _sample(self, X, y): if n_subsets == self.n_max_subset: b_subset_search = False # check that there is enough samples for another round - target_stats = Counter(y[samples_mask]) + target_stats = Counter(safe_indexing( + y, np.flatnonzero(samples_mask))) for target_class in self.ratio_.keys(): + print(target_stats[target_class], self.ratio_[target_class]) if target_stats[target_class] < self.ratio_[target_class]: b_subset_search = False + X_resampled, y_resampled = [], [] + for indices in idx_under: + X_resampled.append(safe_indexing(X, indices)) + y_resampled.append(safe_indexing(y, indices)) + if self.return_indices: return (np.array(X_resampled), np.array(y_resampled), np.array(idx_under)) From c199af9b8575203a93a62d007e73e514b7745a69 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 14 Aug 2017 02:47:00 +0200 Subject: [PATCH 13/28] DOC update docstring --- imblearn/base.py | 25 +++++---- imblearn/combine/smote_enn.py | 11 ++-- imblearn/combine/smote_tomek.py | 13 +++-- imblearn/ensemble/balance_cascade.py | 12 ++-- imblearn/ensemble/easy_ensemble.py | 7 ++- imblearn/over_sampling/adasyn.py | 10 ++-- imblearn/over_sampling/base.py | 56 ------------------- imblearn/over_sampling/random_over_sampler.py | 9 +-- imblearn/over_sampling/smote.py | 54 +++++++++--------- .../prototype_generation/cluster_centroids.py | 41 ++++++++------ .../condensed_nearest_neighbour.py | 9 +-- .../edited_nearest_neighbours.py | 27 +++++---- .../instance_hardness_threshold.py | 9 +-- .../prototype_selection/nearmiss.py | 21 +++---- .../neighbourhood_cleaning_rule.py | 9 +-- .../random_under_sampler.py | 11 ++-- .../prototype_selection/tomek_links.py | 9 +-- 17 files changed, 150 insertions(+), 183 deletions(-) diff --git a/imblearn/base.py b/imblearn/base.py index 08b1b6adf..05d79f35a 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -38,7 +38,7 @@ def sample(self, X, y): Parameters ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. y : array-like, shape (n_samples,) @@ -46,11 +46,11 @@ def sample(self, X, y): Returns ------- - X_resampled : {array-like, sparse matrix}, shape \ + X_resampled : {ndarray, sparse matrix}, shape \ (n_samples_new, n_features) The array containing the resampled data. 
- y_resampled : array-like, shape (n_samples_new) + y_resampled : ndarray, shape (n_samples_new) The corresponding label of `X_resampled` """ @@ -68,18 +68,19 @@ def fit_sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples,) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {array-like, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. - y_resampled : ndarray, shape (n_samples_new,) + y_resampled : array-like, shape (n_samples_new,) The corresponding label of `X_resampled` """ @@ -92,19 +93,21 @@ def _sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. - y_resampled : ndarray, shape (n_samples_new) + y_resampled : ndarray, shape (n_samples_new,) The corresponding label of `X_resampled` + """ pass diff --git a/imblearn/combine/smote_enn.py b/imblearn/combine/smote_enn.py index 7a779e127..e1e094c32 100644 --- a/imblearn/combine/smote_enn.py +++ b/imblearn/combine/smote_enn.py @@ -281,10 +281,10 @@ def fit(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns @@ -305,15 +305,16 @@ def _sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. y_resampled : ndarray, shape (n_samples_new) diff --git a/imblearn/combine/smote_tomek.py b/imblearn/combine/smote_tomek.py index af55f8b4f..82821df0c 100644 --- a/imblearn/combine/smote_tomek.py +++ b/imblearn/combine/smote_tomek.py @@ -232,10 +232,10 @@ def fit(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns @@ -256,18 +256,19 @@ def _sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. 
Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. - y_resampled : ndarray, shape (n_samples_new) + y_resampled : ndarray, shape (n_samples_new,) The corresponding label of `X_resampled` """ diff --git a/imblearn/ensemble/balance_cascade.py b/imblearn/ensemble/balance_cascade.py index 8adda9004..274b86759 100644 --- a/imblearn/ensemble/balance_cascade.py +++ b/imblearn/ensemble/balance_cascade.py @@ -149,10 +149,10 @@ def fit(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns @@ -222,15 +222,16 @@ def _sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_subset, n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_subset, n_samples_new, n_features) The array containing the resampled data. y_resampled : ndarray, shape (n_subset, n_samples_new) @@ -306,7 +307,6 @@ def _sample(self, X, y): target_stats = Counter(safe_indexing( y, np.flatnonzero(samples_mask))) for target_class in self.ratio_.keys(): - print(target_stats[target_class], self.ratio_[target_class]) if target_stats[target_class] < self.ratio_[target_class]: b_subset_search = False diff --git a/imblearn/ensemble/easy_ensemble.py b/imblearn/ensemble/easy_ensemble.py index 9a3fff860..5fc018167 100644 --- a/imblearn/ensemble/easy_ensemble.py +++ b/imblearn/ensemble/easy_ensemble.py @@ -112,15 +112,16 @@ def _sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_subset, n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_subset, n_samples_new, n_features) The array containing the resampled data. y_resampled : ndarray, shape (n_subset, n_samples_new) diff --git a/imblearn/over_sampling/adasyn.py b/imblearn/over_sampling/adasyn.py index e5c439b61..e15bfa62b 100644 --- a/imblearn/over_sampling/adasyn.py +++ b/imblearn/over_sampling/adasyn.py @@ -132,20 +132,22 @@ def _sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. 
- y_resampled : ndarray, shape (n_samples_new) + y_resampled : ndarray, shape (n_samples_new,) The corresponding label of `X_resampled` + """ self._validate_estimator() random_state = check_random_state(self.random_state) diff --git a/imblearn/over_sampling/base.py b/imblearn/over_sampling/base.py index 50efc6e74..883fd9be2 100644 --- a/imblearn/over_sampling/base.py +++ b/imblearn/over_sampling/base.py @@ -18,59 +18,3 @@ class BaseOverSampler(BaseSampler): """ _sampling_type = 'over-sampling' - - # def fit(self, X, y): - # """Find the classes statistics before to perform sampling. - - # Parameters - # ---------- - # X : array-like, shape (n_samples, n_features) - # Matrix containing the data which have to be sampled. - - # y : array-like, shape (n_samples,) - # Corresponding label for each sample in X. - - # Returns - # ------- - # self : object, - # Return self. - - # Notes - # ----- - # Over-samplers do not accept sparse matrices. - - # """ - # # over-sampling method does not handle sparse matrix - # X, y = check_X_y(X, y) - - # return super(BaseOverSampler, self).fit(X, y) - - # def sample(self, X, y): - # """Resample the dataset. - - # Parameters - # ---------- - # X : array-like, shape (n_samples, n_features) - # Matrix containing the data which have to be sampled. - - # y : array-like, shape (n_samples,) - # Corresponding label for each sample in X. - - # Returns - # ------- - # X_resampled : array-like, shape (n_samples_new, n_features) - # The array containing the resampled data. - - # y_resampled : array-like, shape (n_samples_new,) - # The corresponding label of `X_resampled` - - # Notes - # ----- - # Over-samplers do not accept sparse matrices. - - # """ - - # # Check the consistency of X and y - # X, y = check_X_y(X, y) - - # return super(BaseOverSampler, self).sample(X, y) diff --git a/imblearn/over_sampling/random_over_sampler.py b/imblearn/over_sampling/random_over_sampler.py index d70b45d32..271f1f6e8 100644 --- a/imblearn/over_sampling/random_over_sampler.py +++ b/imblearn/over_sampling/random_over_sampler.py @@ -84,18 +84,19 @@ def _sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. - y_resampled : ndarray, shape (n_samples_new) + y_resampled : ndarray, shape (n_samples_new,) The corresponding label of `X_resampled` """ diff --git a/imblearn/over_sampling/smote.py b/imblearn/over_sampling/smote.py index c1a8d7477..fabe63b42 100644 --- a/imblearn/over_sampling/smote.py +++ b/imblearn/over_sampling/smote.py @@ -178,13 +178,13 @@ def _in_danger_noise(self, samples, target_class, y, kind='danger'): Parameters ---------- - samples : ndarray, shape (n_samples, n_features) + samples : {array-like, sparse matrix}, shape (n_samples, n_features) The samples to check if either they are in danger or not. target_class : int or str, The target corresponding class being over-sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) The true label in order to check the neighbour labels. 
kind : str, optional (default='danger') @@ -195,7 +195,7 @@ def _in_danger_noise(self, samples, target_class, y, kind='danger'): Returns ------- - output : ndarray, shape (n_samples, ) + output : ndarray, shape (n_samples,) A boolean array where True refer to samples in danger or noise. """ @@ -226,7 +226,7 @@ def _make_samples(self, Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Points from which the points will be created. y_type : str or int @@ -248,10 +248,10 @@ def _make_samples(self, Returns ------- - X_new : ndarray, shape (n_samples_new, n_features) + X_new : {ndarray, sparse matrix}, shape (n_samples_new, n_features) Synthetically generated samples. - y_new : ndarray, shape (n_samples_new, ) + y_new : ndarray, shape (n_samples_new,) Target values for synthetic samples. """ @@ -324,19 +324,20 @@ def _sample_regular(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. - y_resampled : ndarray, shape (n_samples_new) - The corresponding label of `X_resampled`. + y_resampled : ndarray, shape (n_samples_new,) + The corresponding label of `X_resampled` References ---------- @@ -378,19 +379,20 @@ def _sample_borderline(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. - y_resampled : ndarray, shape (n_samples_new) - The corresponding label of `X_resampled`. + y_resampled : ndarray, shape (n_samples_new,) + The corresponding label of `X_resampled` References ---------- @@ -469,19 +471,20 @@ def _sample_svm(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. - y_resampled : ndarray, shape (n_samples_new) - The corresponding label of `X_resampled`. + y_resampled : ndarray, shape (n_samples_new,) + The corresponding label of `X_resampled` References ---------- @@ -569,18 +572,19 @@ def _sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. 
Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. - y_resampled : ndarray, shape (n_samples_new) + y_resampled : ndarray, shape (n_samples_new,) The corresponding label of `X_resampled` """ diff --git a/imblearn/under_sampling/prototype_generation/cluster_centroids.py b/imblearn/under_sampling/prototype_generation/cluster_centroids.py index 0eef20cde..50eb14181 100644 --- a/imblearn/under_sampling/prototype_generation/cluster_centroids.py +++ b/imblearn/under_sampling/prototype_generation/cluster_centroids.py @@ -9,7 +9,10 @@ from __future__ import division, print_function import numpy as np +from scipy import sparse + from sklearn.cluster import KMeans +from sklearn.utils import safe_indexing from ..base import BaseUnderSampler @@ -109,42 +112,46 @@ def _sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. - y_resampled : ndarray, shape (n_samples_new) + y_resampled : ndarray, shape (n_samples_new,) The corresponding label of `X_resampled` """ self._validate_estimator() - X_resampled = np.empty((0, X.shape[1]), dtype=X.dtype) - y_resampled = np.empty((0, ), dtype=y.dtype) - + idx_under = np.empty((0, ), dtype=int) + centroids, y_resampled = [], [] for target_class in np.unique(y): if target_class in self.ratio_.keys(): n_samples = self.ratio_[target_class] self.estimator_.set_params(**{'n_clusters': n_samples}) self.estimator_.fit(X[y == target_class]) - centroids = self.estimator_.cluster_centers_ + centroids.append(self.estimator_.cluster_centers_) + y_resampled += [target_class] * n_samples - X_resampled = np.concatenate((X_resampled, centroids), axis=0) - y_resampled = np.concatenate( - (y_resampled, np.array([target_class] * n_samples)), - axis=0) else: + target_class_indices = np.flatnonzero(y == target_class) + idx_under = np.concatenate( + (idx_under, target_class_indices), axis=0) + y_resampled += [target_class] * target_class_indices.size - X_resampled = np.concatenate( - (X_resampled, X[y == target_class]), axis=0) - y_resampled = np.concatenate( - (y_resampled, y[y == target_class]), axis=0) + X_resampled = np.concatenate((centroids)) + + if sparse.issparse(X): + X_resampled = sparse.vstack([sparse.csr_matrix(X_resampled), + safe_indexing(X, idx_under)]) + else: + X_resampled = np.vstack((X_resampled, safe_indexing(X, idx_under))) - return X_resampled, y_resampled + return X_resampled, np.array(y_resampled) diff --git a/imblearn/under_sampling/prototype_selection/condensed_nearest_neighbour.py b/imblearn/under_sampling/prototype_selection/condensed_nearest_neighbour.py index c3f743647..1d03eba9a 100644 --- a/imblearn/under_sampling/prototype_selection/condensed_nearest_neighbour.py +++ b/imblearn/under_sampling/prototype_selection/condensed_nearest_neighbour.py @@ -158,18 +158,19 @@ def _sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be 
sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. - y_resampled : ndarray, shape (n_samples_new) + y_resampled : ndarray, shape (n_samples_new,) The corresponding label of `X_resampled` idx_under : ndarray, shape (n_samples, ) diff --git a/imblearn/under_sampling/prototype_selection/edited_nearest_neighbours.py b/imblearn/under_sampling/prototype_selection/edited_nearest_neighbours.py index a20a18b70..87c4cb250 100644 --- a/imblearn/under_sampling/prototype_selection/edited_nearest_neighbours.py +++ b/imblearn/under_sampling/prototype_selection/edited_nearest_neighbours.py @@ -161,18 +161,19 @@ def _sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. - y_resampled : ndarray, shape (n_samples_new) + y_resampled : ndarray, shape (n_samples_new,) The corresponding label of `X_resampled` idx_under : ndarray, shape (n_samples, ) @@ -362,18 +363,19 @@ def _sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. - y_resampled : ndarray, shape (n_samples_new) + y_resampled : ndarray, shape (n_samples_new,) The corresponding label of `X_resampled` idx_under : ndarray, shape (n_samples, ) @@ -585,18 +587,19 @@ def _sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. 
- y_resampled : ndarray, shape (n_samples_new) + y_resampled : ndarray, shape (n_samples_new,) The corresponding label of `X_resampled` idx_under : ndarray, shape (n_samples, ) diff --git a/imblearn/under_sampling/prototype_selection/instance_hardness_threshold.py b/imblearn/under_sampling/prototype_selection/instance_hardness_threshold.py index a7f8199f5..637323164 100644 --- a/imblearn/under_sampling/prototype_selection/instance_hardness_threshold.py +++ b/imblearn/under_sampling/prototype_selection/instance_hardness_threshold.py @@ -204,18 +204,19 @@ def _sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. - y_resampled : ndarray, shape (n_samples_new) + y_resampled : ndarray, shape (n_samples_new,) The corresponding label of `X_resampled` idx_under : ndarray, shape (n_samples, ) diff --git a/imblearn/under_sampling/prototype_selection/nearmiss.py b/imblearn/under_sampling/prototype_selection/nearmiss.py index 3475d3987..4a5475317 100644 --- a/imblearn/under_sampling/prototype_selection/nearmiss.py +++ b/imblearn/under_sampling/prototype_selection/nearmiss.py @@ -156,10 +156,10 @@ def _selection_dist_based(self, Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Original samples. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Associated label to X. dist_vec : ndarray, shape (n_samples, ) @@ -176,13 +176,7 @@ def _selection_dist_based(self, Returns ------- - X_sel : ndarray, shape (num_samples, n_features) - Selected samples. - - y_sel : ndarray, shape (num_samples, ) - The associated label. - - idx_sel : ndarray, shape (num_samples, ) + idx_sel : ndarray, shape (num_samples,) The list of the indices of the selected samples. """ @@ -247,18 +241,19 @@ def _sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. - y_resampled : ndarray, shape (n_samples_new) + y_resampled : ndarray, shape (n_samples_new,) The corresponding label of `X_resampled` idx_under : ndarray, shape (n_samples, ) diff --git a/imblearn/under_sampling/prototype_selection/neighbourhood_cleaning_rule.py b/imblearn/under_sampling/prototype_selection/neighbourhood_cleaning_rule.py index 7bd31bb71..e9f16e6a8 100644 --- a/imblearn/under_sampling/prototype_selection/neighbourhood_cleaning_rule.py +++ b/imblearn/under_sampling/prototype_selection/neighbourhood_cleaning_rule.py @@ -148,18 +148,19 @@ def _sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. 
- y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. - y_resampled : ndarray, shape (n_samples_new) + y_resampled : ndarray, shape (n_samples_new,) The corresponding label of `X_resampled` idx_under : ndarray, shape (n_samples, ) diff --git a/imblearn/under_sampling/prototype_selection/random_under_sampler.py b/imblearn/under_sampling/prototype_selection/random_under_sampler.py index faeb0c9f2..e7a209fdd 100644 --- a/imblearn/under_sampling/prototype_selection/random_under_sampler.py +++ b/imblearn/under_sampling/prototype_selection/random_under_sampler.py @@ -94,18 +94,19 @@ def _sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) - Matrix containing the data to be sampled. + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. - y_resampled : ndarray, shape (n_samples_new) + y_resampled : ndarray, shape (n_samples_new,) The corresponding label of `X_resampled` idx_under : ndarray, shape (n_samples, ) diff --git a/imblearn/under_sampling/prototype_selection/tomek_links.py b/imblearn/under_sampling/prototype_selection/tomek_links.py index 8d8a50067..91b99f03b 100644 --- a/imblearn/under_sampling/prototype_selection/tomek_links.py +++ b/imblearn/under_sampling/prototype_selection/tomek_links.py @@ -144,18 +144,19 @@ def _sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. 
- y_resampled : ndarray, shape (n_samples_new) + y_resampled : ndarray, shape (n_samples_new,) The corresponding label of `X_resampled` idx_under : ndarray, shape (n_samples, ) From 425928f3c0052b8a25f0d2049396a4959aac1728 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 14 Aug 2017 02:57:46 +0200 Subject: [PATCH 14/28] DOC fix example topic classification --- examples/applications/plot_topic_classication.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/examples/applications/plot_topic_classication.py b/examples/applications/plot_topic_classication.py index 90e48f0c3..e0af19ccf 100644 --- a/examples/applications/plot_topic_classication.py +++ b/examples/applications/plot_topic_classication.py @@ -16,7 +16,6 @@ from collections import Counter from sklearn.datasets import fetch_20newsgroups -from sklearn.preprocessing import FunctionTransformer from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.naive_bayes import MultinomialNB from sklearn.pipeline import make_pipeline @@ -82,22 +81,10 @@ # use a ``RandomUnderSampler`` to equalize the number of samples in all the # classes before the training. # -# Currently, imbalanced-learn does not handle sparse matrices --- we are -# currently working on bringing this feature --- and an additional transformer -# to convert the sparse to dense matrices is required in the pipeline. -# # It is also important to note that we are using the ``make_pipeline`` function # implemented in imbalanced-learn to properly handle the samplers. - -def densify(X): - """Function to densify an array.""" - return X.toarray() - - pipe = make_pipeline_imb(TfidfVectorizer(), - FunctionTransformer(func=densify, - accept_sparse=True), RandomUnderSampler(), MultinomialNB()) From 4ba8c4e93b476ac41164d2b0164bc0523cefe295 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 14 Aug 2017 12:31:17 +0200 Subject: [PATCH 15/28] FIX fix test and class clustercentroids --- .../prototype_generation/cluster_centroids.py | 2 +- .../tests/test_cluster_centroids.py | 38 ++++++++++++------- 2 files changed, 25 insertions(+), 15 deletions(-) diff --git a/imblearn/under_sampling/prototype_generation/cluster_centroids.py b/imblearn/under_sampling/prototype_generation/cluster_centroids.py index 50eb14181..5404ea892 100644 --- a/imblearn/under_sampling/prototype_generation/cluster_centroids.py +++ b/imblearn/under_sampling/prototype_generation/cluster_centroids.py @@ -144,7 +144,6 @@ def _sample(self, X, y): target_class_indices = np.flatnonzero(y == target_class) idx_under = np.concatenate( (idx_under, target_class_indices), axis=0) - y_resampled += [target_class] * target_class_indices.size X_resampled = np.concatenate((centroids)) @@ -153,5 +152,6 @@ def _sample(self, X, y): safe_indexing(X, idx_under)]) else: X_resampled = np.vstack((X_resampled, safe_indexing(X, idx_under))) + y_resampled = np.hstack((y_resampled, safe_indexing(y, idx_under))) return X_resampled, np.array(y_resampled) diff --git a/imblearn/under_sampling/prototype_generation/tests/test_cluster_centroids.py b/imblearn/under_sampling/prototype_generation/tests/test_cluster_centroids.py index f3d73e67a..d94a8070b 100644 --- a/imblearn/under_sampling/prototype_generation/tests/test_cluster_centroids.py +++ b/imblearn/under_sampling/prototype_generation/tests/test_cluster_centroids.py @@ -24,10 +24,13 @@ def test_fit_sample_auto(): ratio = 'auto' cc = ClusterCentroids(ratio=ratio, random_state=RND_SEED) X_resampled, y_resampled = cc.fit_sample(X, Y) - X_gt = 
np.array([[0.92923648, 0.76103773], [0.47104475, 0.44386323], - [0.13347175, 0.12167502], [0.06738818, -0.529627], - [0.17901516, 0.69860992], [0.094035, -2.55298982]]) - y_gt = np.array([0, 0, 0, 1, 1, 1]) + X_gt = np.array([[0.06738818, -0.529627], + [0.17901516, 0.69860992], + [0.094035, -2.55298982], + [0.92923648, 0.76103773], + [0.47104475, 0.44386323], + [0.13347175, 0.12167502]]) + y_gt = np.array([1, 1, 1, 0, 0, 0]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) @@ -36,12 +39,16 @@ def test_fit_sample_half(): ratio = .5 cc = ClusterCentroids(ratio=ratio, random_state=RND_SEED) X_resampled, y_resampled = cc.fit_sample(X, Y) - X_gt = np.array([[0.92923648, 0.76103773], [0.47104475, 0.44386323], - [0.13347175, 0.12167502], [0.09125309, -0.85409574], - [0.19220316, 0.32337101], [0.094035, -2.55298982], - [0.20792588, 1.49407907], [0.04352327, -0.20515826], - [0.12372842, 0.6536186]]) - y_gt = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1]) + X_gt = np.array([[0.09125309, -0.85409574], + [0.19220316, 0.32337101], + [0.094035, -2.55298982], + [0.20792588, 1.49407907], + [0.04352327, -0.20515826], + [0.12372842, 0.6536186], + [0.92923648, 0.76103773], + [0.47104475, 0.44386323], + [0.13347175, 0.12167502]]) + y_gt = np.array([1, 1, 1, 1, 1, 1, 0, 0, 0]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) @@ -65,10 +72,13 @@ def test_fit_sample_object(): ratio=ratio, random_state=RND_SEED, estimator=cluster) X_resampled, y_resampled = cc.fit_sample(X, Y) - X_gt = np.array([[0.92923648, 0.76103773], [0.47104475, 0.44386323], - [0.13347175, 0.12167502], [0.06738818, -0.529627], - [0.17901516, 0.69860992], [0.094035, -2.55298982]]) - y_gt = np.array([0, 0, 0, 1, 1, 1]) + X_gt = np.array([[0.06738818, -0.529627], + [0.17901516, 0.69860992], + [0.094035, -2.55298982], + [0.92923648, 0.76103773], + [0.47104475, 0.44386323], + [0.13347175, 0.12167502]]) + y_gt = np.array([1, 1, 1, 0, 0, 0]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) From 8298fdce2c725aaaf7b9466d6ece9b49e73c1b80 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 14 Aug 2017 13:02:21 +0200 Subject: [PATCH 16/28] TST add common test --- .../prototype_generation/cluster_centroids.py | 3 +- imblearn/utils/estimator_checks.py | 42 ++++++++++++++++++- 2 files changed, 43 insertions(+), 2 deletions(-) diff --git a/imblearn/under_sampling/prototype_generation/cluster_centroids.py b/imblearn/under_sampling/prototype_generation/cluster_centroids.py index 5404ea892..0cfebb193 100644 --- a/imblearn/under_sampling/prototype_generation/cluster_centroids.py +++ b/imblearn/under_sampling/prototype_generation/cluster_centroids.py @@ -82,7 +82,8 @@ class ClusterCentroids(BaseUnderSampler): >>> cc = ClusterCentroids(random_state=42) >>> X_res, y_res = cc.fit_sample(X, y) >>> print('Resampled dataset shape {}'.format(Counter(y_res))) - Resampled dataset shape Counter({0: 100, 1: 100}) + ... 
# doctest: +ELLIPSIS + Resampled dataset shape Counter({...}) """ diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index cbc223f13..1bfb86701 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -12,6 +12,7 @@ from collections import Counter import numpy as np +from scipy import sparse from sklearn.datasets import make_classification from sklearn.utils.estimator_checks import _yield_all_checks \ @@ -20,7 +21,8 @@ from sklearn.exceptions import NotFittedError from sklearn.utils.testing import (assert_warns, assert_raises_regex, assert_true, set_random_state, - assert_equal) + assert_equal, assert_allclose_dense_sparse, + SkipTest) from imblearn.base import SamplerMixin from imblearn.over_sampling.base import BaseOverSampler @@ -36,6 +38,8 @@ def _yield_sampler_checks(name, Estimator): yield check_samplers_fit yield check_samplers_fit_sample yield check_samplers_ratio_fit_sample + yield check_samplers_sparse + yield check_samplers_pandas def _yield_all_checks(name, Estimator): @@ -253,3 +257,39 @@ def check_samplers_ratio_fit_sample(name, Sampler): X_res, y_res = sampler.fit_sample(X, y) y_ensemble = y_res[0] assert_equal(target_stats[1], Counter(y_ensemble)[1]) + + +def check_samplers_sparse(name, Sampler): + # check that sparse matrices can be passed through the sampler leading to + # the same results than dense + X, y = make_classification(n_samples=1000, n_classes=3, + n_informative=4, weights=[0.2, 0.3, 0.5], + random_state=0) + X_sparse = sparse.csr_matrix(X) + sampler = Sampler(random_state=0) + if not isinstance(sampler, BaseEnsembleSampler): + X_res_sparse, y_res_sparse = sampler.fit_sample(X_sparse, y) + assert_true(sparse.issparse(X_res_sparse)) + X_res, y_res = sampler.fit_sample(X, y) + assert_allclose_dense_sparse(X_res_sparse.A, X_res) + assert_allclose_dense_sparse(y_res_sparse, y_res) + + +def check_samplers_pandas(name, Sampler): + # Check that the samplers handle pandas dataframe and pandas series + X, y = make_classification(n_samples=1000, n_classes=3, + n_informative=4, weights=[0.2, 0.3, 0.5], + random_state=0) + try: + import pandas as pd + X_pd, y_pd = pd.DataFrame(X), pd.Series(y) + sampler = Sampler(random_state=0) + X_res_pd, y_res_pd = sampler.fit_sample(X_pd, y_pd) + X_res, y_res = sampler.fit_sample(X, y) + assert_allclose_dense_sparse(X_res_pd, X_res) + assert_allclose_dense_sparse(y_res_pd, y_res) + + except ImportError: + raise SkipTest("pandas is not installed: not testing for " + "input of type pandas.DataFrame / pandas.Series as" + " input.") From e4c6ebbb0cc9103e0bbde326a45a12772ab9fda6 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 14 Aug 2017 13:33:32 +0200 Subject: [PATCH 17/28] TST add ensemble --- appveyor.yml | 2 +- build_tools/travis/install.sh | 2 +- imblearn/utils/estimator_checks.py | 9 +++++++-- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index 5616316e5..b1c62ffe8 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -36,7 +36,7 @@ install: - "python -c \"import struct; print(struct.calcsize('P') * 8)\"" # Installed prebuilt dependencies from conda - - "conda install pip numpy scipy scikit-learn=0.19.0 nose wheel matplotlib -y -q" + - "conda install pip numpy scipy scikit-learn=0.19.0 pandas nose wheel matplotlib -y -q" # Install other nilearn dependencies - "pip install coverage nose-timer" diff --git a/build_tools/travis/install.sh b/build_tools/travis/install.sh index 843aa5088..1179ddaf9 100755 --- 
a/build_tools/travis/install.sh +++ b/build_tools/travis/install.sh @@ -38,7 +38,7 @@ if [[ "$DISTRIB" == "conda" ]]; then # provided versions conda create -n testenv --yes python=$PYTHON_VERSION pip source activate testenv - conda install --yes numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION + conda install --yes numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION pandas if [[ "$SKLEARN_VERSION" == "master" ]]; then conda install --yes cython diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index 1bfb86701..91b78670e 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -267,12 +267,17 @@ def check_samplers_sparse(name, Sampler): random_state=0) X_sparse = sparse.csr_matrix(X) sampler = Sampler(random_state=0) + X_res_sparse, y_res_sparse = sampler.fit_sample(X_sparse, y) + X_res, y_res = sampler.fit_sample(X, y) if not isinstance(sampler, BaseEnsembleSampler): - X_res_sparse, y_res_sparse = sampler.fit_sample(X_sparse, y) assert_true(sparse.issparse(X_res_sparse)) - X_res, y_res = sampler.fit_sample(X, y) assert_allclose_dense_sparse(X_res_sparse.A, X_res) assert_allclose_dense_sparse(y_res_sparse, y_res) + else: + for x_sp, x, y_sp, y in zip(X_res_sparse, X_res, y_res_sparse, y_res): + assert_true(sparse.issparse(x_sp)) + assert_allclose_dense_sparse(x_sp.A, x, rtol=1e-06, atol=1e-06) + assert_allclose_dense_sparse(y_sp, y) def check_samplers_pandas(name, Sampler): From 1226a91c5951a3e63d603a4f6d2ae77f0de1ad76 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 14 Aug 2017 13:46:29 +0200 Subject: [PATCH 18/28] TST use allclose --- imblearn/utils/estimator_checks.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index 91b78670e..9d218e5a0 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -21,7 +21,7 @@ from sklearn.exceptions import NotFittedError from sklearn.utils.testing import (assert_warns, assert_raises_regex, assert_true, set_random_state, - assert_equal, assert_allclose_dense_sparse, + assert_equal, assert_allclose, SkipTest) from imblearn.base import SamplerMixin @@ -271,13 +271,13 @@ def check_samplers_sparse(name, Sampler): X_res, y_res = sampler.fit_sample(X, y) if not isinstance(sampler, BaseEnsembleSampler): assert_true(sparse.issparse(X_res_sparse)) - assert_allclose_dense_sparse(X_res_sparse.A, X_res) - assert_allclose_dense_sparse(y_res_sparse, y_res) + assert_allclose(X_res_sparse.A, X_res) + assert_allclose(y_res_sparse, y_res) else: for x_sp, x, y_sp, y in zip(X_res_sparse, X_res, y_res_sparse, y_res): assert_true(sparse.issparse(x_sp)) - assert_allclose_dense_sparse(x_sp.A, x, rtol=1e-06, atol=1e-06) - assert_allclose_dense_sparse(y_sp, y) + assert_allclose(x_sp.A, x, rtol=1e-06, atol=1e-06) + assert_allclose(y_sp, y) def check_samplers_pandas(name, Sampler): @@ -291,8 +291,8 @@ def check_samplers_pandas(name, Sampler): sampler = Sampler(random_state=0) X_res_pd, y_res_pd = sampler.fit_sample(X_pd, y_pd) X_res, y_res = sampler.fit_sample(X, y) - assert_allclose_dense_sparse(X_res_pd, X_res) - assert_allclose_dense_sparse(y_res_pd, y_res) + assert_allclose(X_res_pd, X_res) + assert_allclose(y_res_pd, y_res) except ImportError: raise SkipTest("pandas is not installed: not testing for " From 68b16b5a35f643d061e25041768a852df9828749 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 14 Aug 2017 13:48:27 +0200 Subject: [PATCH 19/28] TST install conda 
with ubuntu container --- build_tools/travis/install.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build_tools/travis/install.sh b/build_tools/travis/install.sh index 1179ddaf9..2b590e860 100755 --- a/build_tools/travis/install.sh +++ b/build_tools/travis/install.sh @@ -59,7 +59,7 @@ elif [[ "$DISTRIB" == "ubuntu" ]]; then # Create a new virtualenv using system site packages for python, numpy virtualenv --system-site-packages testvenv source testvenv/bin/activate - pip install scikit-learn nose nose-timer pytest pytest-cov codecov + pip install scikit-learn pandas nose nose-timer pytest pytest-cov codecov fi From 35c638bd3a2a0b11d169bd5d9980d2b6c239fac6 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 14 Aug 2017 15:22:25 +0200 Subject: [PATCH 20/28] TST increase tolerance --- imblearn/utils/estimator_checks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index 9d218e5a0..ede06c658 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -271,7 +271,7 @@ def check_samplers_sparse(name, Sampler): X_res, y_res = sampler.fit_sample(X, y) if not isinstance(sampler, BaseEnsembleSampler): assert_true(sparse.issparse(X_res_sparse)) - assert_allclose(X_res_sparse.A, X_res) + assert_allclose(X_res_sparse.A, X_res, rtol=1e-06, atol=1e-06) assert_allclose(y_res_sparse, y_res) else: for x_sp, x, y_sp, y in zip(X_res_sparse, X_res, y_res_sparse, y_res): From 004f9203111848481912203d0c504009680ebf96 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 14 Aug 2017 15:31:21 +0200 Subject: [PATCH 21/28] TST increase tolerance --- imblearn/utils/estimator_checks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index ede06c658..ce17b6d8d 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -271,12 +271,12 @@ def check_samplers_sparse(name, Sampler): X_res, y_res = sampler.fit_sample(X, y) if not isinstance(sampler, BaseEnsembleSampler): assert_true(sparse.issparse(X_res_sparse)) - assert_allclose(X_res_sparse.A, X_res, rtol=1e-06, atol=1e-06) + assert_allclose(X_res_sparse.A, X_res, rtol=1e-05, atol=1e-05) assert_allclose(y_res_sparse, y_res) else: for x_sp, x, y_sp, y in zip(X_res_sparse, X_res, y_res_sparse, y_res): assert_true(sparse.issparse(x_sp)) - assert_allclose(x_sp.A, x, rtol=1e-06, atol=1e-06) + assert_allclose(x_sp.A, x, rtol=1e-05, atol=1e-05) assert_allclose(y_sp, y) From d3ceb5a2c946f0b6063af70ddce94e2a2368e6bf Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 14 Aug 2017 16:00:10 +0200 Subject: [PATCH 22/28] TST test all versions NearMiss and SMOTE --- imblearn/utils/estimator_checks.py | 57 ++++++++++++++++++++++-------- 1 file changed, 42 insertions(+), 15 deletions(-) diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index ce17b6d8d..2385f91bc 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -28,6 +28,8 @@ from imblearn.over_sampling.base import BaseOverSampler from imblearn.under_sampling.base import BaseCleaningSampler, BaseUnderSampler from imblearn.ensemble.base import BaseEnsembleSampler +from imblearn.over_sampling import SMOTE +from imblearn.under_sampling import NearMiss, ClusterCentroids def _yield_sampler_checks(name, Estimator): @@ -266,18 +268,33 @@ def check_samplers_sparse(name, Sampler): 
n_informative=4, weights=[0.2, 0.3, 0.5], random_state=0) X_sparse = sparse.csr_matrix(X) - sampler = Sampler(random_state=0) - X_res_sparse, y_res_sparse = sampler.fit_sample(X_sparse, y) - X_res, y_res = sampler.fit_sample(X, y) - if not isinstance(sampler, BaseEnsembleSampler): - assert_true(sparse.issparse(X_res_sparse)) - assert_allclose(X_res_sparse.A, X_res, rtol=1e-05, atol=1e-05) - assert_allclose(y_res_sparse, y_res) + if isinstance(Sampler(), SMOTE): + samplers = [Sampler(random_state=0, kind=kind) + for kind in ('regular', 'borderline1', + 'borderline2', 'svm')] + elif isinstance(Sampler(), NearMiss): + samplers = [Sampler(random_state=0, version=version) + for version in (1, 2, 3)] else: - for x_sp, x, y_sp, y in zip(X_res_sparse, X_res, y_res_sparse, y_res): - assert_true(sparse.issparse(x_sp)) - assert_allclose(x_sp.A, x, rtol=1e-05, atol=1e-05) - assert_allclose(y_sp, y) + samplers = [Sampler(random_state=0)] + for sampler in samplers: + X_res_sparse, y_res_sparse = sampler.fit_sample(X_sparse, y) + X_res, y_res = sampler.fit_sample(X, y) + if not isinstance(sampler, BaseEnsembleSampler): + if not isinstance(sampler, ClusterCentroids): + assert_true(sparse.issparse(X_res_sparse)) + assert_allclose(X_res_sparse.A, X_res) + assert_allclose(y_res_sparse, y_res) + else: + assert_true(sparse.issparse(X_res_sparse)) + assert_allclose(X_res_sparse.A, X_res, rtol=1e-4, atol=1e-4) + assert_allclose(y_res_sparse, y_res) + else: + for x_sp, x, y_sp, y in zip(X_res_sparse, X_res, + y_res_sparse, y_res): + assert_true(sparse.issparse(x_sp)) + assert_allclose(x_sp.A, x) + assert_allclose(y_sp, y) def check_samplers_pandas(name, Sampler): @@ -289,10 +306,20 @@ def check_samplers_pandas(name, Sampler): import pandas as pd X_pd, y_pd = pd.DataFrame(X), pd.Series(y) sampler = Sampler(random_state=0) - X_res_pd, y_res_pd = sampler.fit_sample(X_pd, y_pd) - X_res, y_res = sampler.fit_sample(X, y) - assert_allclose(X_res_pd, X_res) - assert_allclose(y_res_pd, y_res) + if isinstance(Sampler(), SMOTE): + samplers = [Sampler(random_state=0, kind=kind) + for kind in ('regular', 'borderline1', + 'borderline2', 'svm')] + elif isinstance(Sampler(), NearMiss): + samplers = [Sampler(random_state=0, version=version) + for version in (1, 2, 3)] + else: + samplers = [Sampler(random_state=0)] + for sampler in samplers: + X_res_pd, y_res_pd = sampler.fit_sample(X_pd, y_pd) + X_res, y_res = sampler.fit_sample(X, y) + assert_allclose(X_res_pd, X_res) + assert_allclose(y_res_pd, y_res) except ImportError: raise SkipTest("pandas is not installed: not testing for " From d9c4e555939a778ae7fb8d8c5ee937ae53b3a139 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 14 Aug 2017 17:48:30 +0200 Subject: [PATCH 23/28] TST set the algorithm of KMeans --- imblearn/utils/estimator_checks.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index 2385f91bc..9fe958bf2 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -15,6 +15,7 @@ from scipy import sparse from sklearn.datasets import make_classification +from sklearn.cluster import KMeans from sklearn.utils.estimator_checks import _yield_all_checks \ as sklearn_yield_all_checks, check_estimator \ as sklearn_check_estimator, check_parameters_default_constructible @@ -275,6 +276,11 @@ def check_samplers_sparse(name, Sampler): elif isinstance(Sampler(), NearMiss): samplers = [Sampler(random_state=0, version=version) for version in (1, 2, 3)] + elif 
isinstance(Sampler(), ClusterCentroids):
+        # set KMeans to full since it supports sparse and dense
+        samplers = [Sampler(random_state=0,
+                            estimator=KMeans(random_state=1,
+                                             algorithm='full'))]
     else:
         samplers = [Sampler(random_state=0)]
     for sampler in samplers:

From b4697472dedb77188f661619b994d93f76f3ccc2 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Mon, 14 Aug 2017 18:14:37 +0200
Subject: [PATCH 24/28] DOC add entry in user guide

---
 doc/introduction.rst      | 51 +++++++++++++++++++++++++++++++++++++++
 doc/problem_statement.rst | 20 ---------------
 doc/user_guide.rst        |  2 +-
 3 files changed, 52 insertions(+), 21 deletions(-)
 create mode 100644 doc/introduction.rst
 delete mode 100644 doc/problem_statement.rst

diff --git a/doc/introduction.rst b/doc/introduction.rst
new file mode 100644
index 000000000..3261c6321
--- /dev/null
+++ b/doc/introduction.rst
@@ -0,0 +1,51 @@
+.. _introduction:
+
+============
+Introduction
+============
+
+.. _api_imblearn:
+
+APIs of imbalanced-learn samplers
+----------------------------------
+
+The samplers available follow the scikit-learn API, using the estimator base
+object with an additional sample method:
+
+:Estimator:
+
+    The base object implements a ``fit`` method to learn from data::
+
+      estimator = obj.fit(data, targets)
+
+:Sampler:
+
+    To resample a data set, each sampler implements::
+
+      data_resampled, targets_resampled = obj.sample(data, targets)
+
+Imbalanced-learn samplers accept the same inputs as scikit-learn estimators:
+
+* ``data``: array-like (2-D list, pandas.DataFrame, numpy.array) or sparse
+  matrices;
+* ``targets``: array-like (1-D list, pandas.Series, numpy.array).
+
+.. _problem_statement:
+
+Problem statement regarding imbalanced data sets
+------------------------------------------------
+
+The learning phase and the subsequent prediction of machine learning algorithms
+can be affected by the problem of imbalanced data sets. The balancing issue
+corresponds to the difference of the number of samples in the different
+classes. We illustrate the effect of training a linear SVM classifier with
+different levels of class balancing.
+
+.. image:: ./auto_examples/over-sampling/images/sphx_glr_plot_comparison_over_sampling_001.png
+   :target: ./auto_examples/over-sampling/plot_comparison_over_sampling.html
+   :scale: 60
+   :align: center
+
+As expected, the decision function of the linear SVM is highly impacted. With a
+greater imbalanced ratio, the decision function favors the class with the larger
+number of samples, usually referred to as the majority class.
diff --git a/doc/problem_statement.rst b/doc/problem_statement.rst
deleted file mode 100644
index 7b1a87e88..000000000
--- a/doc/problem_statement.rst
+++ /dev/null
@@ -1,20 +0,0 @@
-.. _problem_statement:
-
-=================
-Problem statement
-=================
-
-The learning phase and the subsequent prediction of machine learning algorithms
-can be affected by the problem of imbalanced data set. The balancing issue
-corresponds to the difference of the number of samples in the different
-classes. We illustrate the effect of training a linear SVM classifier with
-different level of class balancing.
-
-.. image:: ./auto_examples/over-sampling/images/sphx_glr_plot_comparison_over_sampling_001.png
-   :target: ./auto_examples/over-sampling/plot_comparison_over_sampling.html
-   :scale: 60
-   :align: center
-
-As expected, the decision function of the linear SVM is highly impacted. With a
-greater imbalanced ratio, the decision function favor the class with the larger
-number of samples, usually referred as the majority class.
diff --git a/doc/user_guide.rst b/doc/user_guide.rst
index 88c5f8f92..7077f313f 100644
--- a/doc/user_guide.rst
+++ b/doc/user_guide.rst
@@ -9,7 +9,7 @@ User Guide
 .. toctree::
    :numbered:

-   problem_statement.rst
+   introduction.rst
    over_sampling.rst
    under_sampling.rst
    combine.rst
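As a concrete illustration of the fit/sample API documented in the patch above, here is a minimal sketch; the data set is a hypothetical toy problem, and the class and method names are the ones used throughout this series:

    from collections import Counter
    from sklearn.datasets import make_classification
    from imblearn.under_sampling import RandomUnderSampler

    # a toy imbalanced two-class problem
    X, y = make_classification(n_samples=1000, weights=[0.1, 0.9],
                               random_state=0)
    rus = RandomUnderSampler(random_state=0)
    rus.fit(X, y)                    # learn the class statistics
    X_res, y_res = rus.sample(X, y)  # perform the resampling
    # or, equivalently, chain both steps in one call
    X_res, y_res = rus.fit_sample(X, y)
    print(Counter(y_res))            # both classes reduced to the minority count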
From c05d0ba102c0ce0d52e8a4aefd28d48eff7d359f Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Mon, 14 Aug 2017 18:30:41 +0200
Subject: [PATCH 25/28] DOC add entry sparse for CC

---
 doc/introduction.rst   | 7 +++++++
 doc/under_sampling.rst | 6 ++++++
 2 files changed, 13 insertions(+)

diff --git a/doc/introduction.rst b/doc/introduction.rst
index 3261c6321..9bf1a6e0b 100644
--- a/doc/introduction.rst
+++ b/doc/introduction.rst
@@ -30,6 +30,13 @@ Imbalanced-learn samplers accept the same inputs as scikit-learn estimators:
   matrices;
 * ``targets``: array-like (1-D list, pandas.Series, numpy.array).

+.. topic:: Sparse input
+
+  For sparse input the data is **converted to the Compressed Sparse Row
+  representation** (see ``scipy.sparse.csr_matrix``) before being fed to the
+  sampler. To avoid unnecessary memory copies, it is recommended to choose the
+  CSR representation upstream.
+
 .. _problem_statement:

 Problem statement regarding imbalanced data sets
 ------------------------------------------------
diff --git a/doc/under_sampling.rst b/doc/under_sampling.rst
index cc292471c..78847122f 100644
--- a/doc/under_sampling.rst
+++ b/doc/under_sampling.rst
@@ -49,6 +49,12 @@ your data are grouped into clusters. In addition, the number of centroids
 should be set such that the under-sampled clusters are representative of the
 original one.

+.. warning::
+
+   :class:`ClusterCentroids` supports sparse matrices. However, the new samples
+   generated are not necessarily sparse. Therefore, even if the resulting
+   matrix is sparse, the algorithm will be inefficient in this regard.
+
 See :ref:`sphx_glr_auto_examples_under-sampling_plot_cluster_centroids.py` and
 :ref:`sphx_glr_auto_examples_under-sampling_plot_comparison_under_sampling.py`.

From 1625879d6dc3ded60ed2f02c17f1f0f9a4519bf6 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Mon, 14 Aug 2017 18:33:04 +0200
Subject: [PATCH 26/28] DOC whatsnew entry

---
 doc/whats_new.rst | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index b5f61c85b..54f6d441f 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -44,6 +44,9 @@ New features
 Enhancement
 ~~~~~~~~~~~

+- All samplers accept sparse matrices, defaulting to CSR type. By
+  `Guillaume Lemaitre`_.
+
 - :func:`datasets.make_imbalance` take a ratio similarly to other samplers.
   It supports multiclass. By `Guillaume Lemaitre`_.
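A hedged sketch of the sparse support recorded in the whatsnew entry above; the data is synthetic, and the behaviour shown is the one exercised by the common test ``check_samplers_sparse`` introduced earlier in this series:

    from scipy import sparse
    from sklearn.datasets import make_classification
    from imblearn.under_sampling import RandomUnderSampler

    X, y = make_classification(n_samples=1000, weights=[0.1, 0.9],
                               random_state=0)
    X_sparse = sparse.csr_matrix(X)  # CSR is the preferred input format
    rus = RandomUnderSampler(random_state=0)
    X_res, y_res = rus.fit_sample(X_sparse, y)
    print(sparse.issparse(X_res))    # True: the sampler does not densify X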
From 72a605d2b902bc8a085a751412b95c3af76e4e06 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Mon, 14 Aug 2017 20:03:25 +0200
Subject: [PATCH 27/28] EHN add voting parameter for ClusterCentroids

---
 doc/under_sampling.rst                         | 20 +++--
 doc/whats_new.rst                              |  4 +
 .../under-sampling/plot_cluster_centroids.py   | 40 +++++++---
 .../prototype_generation/cluster_centroids.py  | 70 +++++++++++++----
 .../tests/test_cluster_centroids.py            | 76 ++++++++++++++----
 5 files changed, 163 insertions(+), 47 deletions(-)

diff --git a/doc/under_sampling.rst b/doc/under_sampling.rst
index 78847122f..7e1d69e7f 100644
--- a/doc/under_sampling.rst
+++ b/doc/under_sampling.rst
@@ -49,11 +49,21 @@ your data are grouped into clusters. In addition, the number of centroids
 should be set such that the under-sampled clusters are representative of the
 original one.

-.. warning::
-
-   :class:`ClusterCentroids` supports sparse matrices. However, the new samples
-   generated are not necessarily sparse. Therefore, even if the resulting
-   matrix is sparse, the algorithm will be inefficient in this regard.
+:class:`ClusterCentroids` accepts sparse matrices. However, it is recommended
+not to set ``voting`` to ``'soft'`` since the centroids found by the
+clustering method will be used. Those centroids are not enforced to be sparse
+and thus the output will not be memory efficient. Note that by default
+``voting`` is set to ``'auto'``, which will automatically choose ``'hard'``
+voting instead of ``'soft'`` voting in the case of a sparse input.
+
+The effect of the ``voting`` parameter is illustrated in the figure below. When
+``voting`` is set to ``'hard'``, the nearest neighbors of the centroids are used
+instead of the centroids themselves, which are used with ``'soft'`` voting.
+
+.. image:: ./auto_examples/under-sampling/images/sphx_glr_plot_cluster_centroids_001.png
+   :target: ./auto_examples/under-sampling/plot_cluster_centroids.html
+   :scale: 60
+   :align: center

 See :ref:`sphx_glr_auto_examples_under-sampling_plot_cluster_centroids.py` and
 :ref:`sphx_glr_auto_examples_under-sampling_plot_comparison_under_sampling.py`.
diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index 54f6d441f..e24cf9fdf 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -35,6 +35,10 @@ Bug fixes
 New features
 ~~~~~~~~~~~~

+- :class:`under_sampling.ClusterCentroids` accepts a parameter ``voting``
+  allowing the use of nearest neighbors of the centroids instead of the
+  centroids themselves. It is more efficient for sparse input. By `Guillaume Lemaitre`_.
+
 - Turn off steps in :class:`pipeline.Pipeline` using the `None` object. By
   `Christos Aridas`_.
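For reference while reading the example and implementation diffs below, a sketch of the new ``voting`` parameter in use; the data set is hypothetical, and the ``voting_`` attribute is set during ``fit_sample``, as asserted by the tests at the end of this patch:

    from scipy import sparse
    from sklearn.datasets import make_classification
    from imblearn.under_sampling import ClusterCentroids

    X, y = make_classification(n_samples=1000, weights=[0.1, 0.9],
                               random_state=0)
    # explicit 'hard' voting: centroids are replaced by their nearest
    # real samples, so a sparse input yields a sparse output
    cc = ClusterCentroids(voting='hard', random_state=0)
    X_res, y_res = cc.fit_sample(sparse.csr_matrix(X), y)

    # with the default voting='auto', sparse input selects 'hard' voting
    cc_auto = ClusterCentroids(random_state=0)
    cc_auto.fit_sample(sparse.csr_matrix(X), y)
    print(cc_auto.voting_)           # 'hard'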
diff --git a/examples/under-sampling/plot_cluster_centroids.py b/examples/under-sampling/plot_cluster_centroids.py index d6f7eaf25..d13b669e3 100644 --- a/examples/under-sampling/plot_cluster_centroids.py +++ b/examples/under-sampling/plot_cluster_centroids.py @@ -24,7 +24,7 @@ X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, - n_samples=200, random_state=10) + n_samples=50, random_state=10) # Instanciate a PCA object for the sake of easy visualisation pca = PCA(n_components=2) @@ -34,10 +34,15 @@ # Apply Cluster Centroids cc = ClusterCentroids() X_resampled, y_resampled = cc.fit_sample(X, y) -X_res_vis = pca.transform(X_resampled) +X_res_vis_soft = pca.transform(X_resampled) + +# Use hard voting instead of soft voting +cc = ClusterCentroids(voting='hard') +X_resampled, y_resampled = cc.fit_sample(X, y) +X_res_vis_hard = pca.transform(X_resampled) # Two subplots, unpack the axes array immediately -f, (ax1, ax2) = plt.subplots(1, 2) +f, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5)) c0 = ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5) @@ -45,14 +50,30 @@ alpha=0.5) ax1.set_title('Original set') -ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1], +ax2.scatter(X_res_vis_soft[y_resampled == 0, 0], + X_res_vis_soft[y_resampled == 0, 1], + label="Class #0", alpha=.5) +ax2.scatter(X_res_vis_soft[y_resampled == 1, 0], + X_res_vis_soft[y_resampled == 1, 1], + label="Class #1", alpha=.5) +c2 = ax2.scatter(X_vis[y == 1, 0], + X_vis[y == 1, 1], label="Original #1", + alpha=0.2) +ax2.set_title('Cluster centroids with soft voting') + +ax3.scatter(X_res_vis_hard[y_resampled == 0, 0], + X_res_vis_hard[y_resampled == 0, 1], label="Class #0", alpha=.5) -ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1], +ax3.scatter(X_res_vis_hard[y_resampled == 1, 0], + X_res_vis_hard[y_resampled == 1, 1], label="Class #1", alpha=.5) -ax2.set_title('Cluster centroids') +ax3.scatter(X_vis[y == 1, 0], + X_vis[y == 1, 1], + alpha=0.2) +ax3.set_title('Cluster centroids with hard voting') # make nice plotting -for ax in (ax1, ax2): +for ax in (ax1, ax2, ax3): ax.spines['top'].set_visible(False) ax.spines['right'].set_visible(False) ax.get_xaxis().tick_bottom() @@ -62,7 +83,8 @@ ax.set_xlim([-6, 8]) ax.set_ylim([-6, 6]) -plt.figlegend((c0, c1), ('Class #0', 'Class #1'), loc='lower center', - ncol=2, labelspacing=0.) +plt.figlegend((c0, c1), ('Class #0', 'Class #1', 'Original Class #1'), + loc='lower center', + ncol=3, labelspacing=0.) plt.tight_layout(pad=3) plt.show() diff --git a/imblearn/under_sampling/prototype_generation/cluster_centroids.py b/imblearn/under_sampling/prototype_generation/cluster_centroids.py index 0cfebb193..aef3fdbe1 100644 --- a/imblearn/under_sampling/prototype_generation/cluster_centroids.py +++ b/imblearn/under_sampling/prototype_generation/cluster_centroids.py @@ -12,10 +12,13 @@ from scipy import sparse from sklearn.cluster import KMeans +from sklearn.neighbors import NearestNeighbors from sklearn.utils import safe_indexing from ..base import BaseUnderSampler +VOTING_KIND = ('auto', 'hard', 'soft') + class ClusterCentroids(BaseUnderSampler): """Perform under-sampling by generating centroids based on @@ -58,6 +61,18 @@ class ClusterCentroids(BaseUnderSampler): estimator : object, optional(default=KMeans()) Pass a :class:`sklearn.cluster.KMeans` estimator. 
+ voting : str, optional (default='auto') + Voting strategy to generate the new samples: + + - If ``'hard'``, the nearest-neighbors of the centroids found using the + clustering algorithm will be used. + - If ``'soft'``, the centroids found by the clustering algorithm will + be used. + - If ``'auto'``, if the input is sparse, it will default on ``'hard'`` + otherwise, ``'soft'`` will be used. + + .. versionadded:: 0.3.0 + n_jobs : int, optional (default=1) The number of threads to open if possible. @@ -91,10 +106,12 @@ def __init__(self, ratio='auto', random_state=None, estimator=None, + voting='auto', n_jobs=1): super(ClusterCentroids, self).__init__( ratio=ratio, random_state=random_state) self.estimator = estimator + self.voting = voting self.n_jobs = n_jobs def _validate_estimator(self): @@ -108,6 +125,22 @@ def _validate_estimator(self): raise ValueError('`estimator` has to be a KMeans clustering.' ' Got {} instead.'.format(type(self.estimator))) + def _generate_sample(self, X, y, centroids, target_class): + if self.voting_ == 'hard': + nearest_neighbors = NearestNeighbors(n_neighbors=1) + nearest_neighbors.fit(X, y) + indices = nearest_neighbors.kneighbors(centroids, + return_distance=False) + X_new = safe_indexing(X, np.squeeze(indices)) + else: + if sparse.issparse(X): + X_new = sparse.csr_matrix(centroids) + else: + X_new = centroids + y_new = np.array([target_class] * centroids.shape[0]) + + return X_new, y_new + def _sample(self, X, y): """Resample the dataset. @@ -131,28 +164,37 @@ def _sample(self, X, y): """ self._validate_estimator() - idx_under = np.empty((0, ), dtype=int) - centroids, y_resampled = [], [] + if self.voting == 'auto': + if sparse.issparse(X): + self.voting_ = 'hard' + else: + self.voting_ = 'soft' + else: + if self.voting in VOTING_KIND: + self.voting_ = self.voting + else: + raise ValueError("'voting' needs to be one of {}. 
diff --git a/imblearn/under_sampling/prototype_generation/tests/test_cluster_centroids.py b/imblearn/under_sampling/prototype_generation/tests/test_cluster_centroids.py
index d94a8070b..7a501e003 100644
--- a/imblearn/under_sampling/prototype_generation/tests/test_cluster_centroids.py
+++ b/imblearn/under_sampling/prototype_generation/tests/test_cluster_centroids.py
@@ -4,8 +4,10 @@
 from collections import Counter
 
 import numpy as np
+from scipy import sparse
 from sklearn.utils.testing import (assert_allclose, assert_array_equal,
-                                   assert_equal, assert_raises_regex)
+                                   assert_equal, assert_raises_regex,
+                                   assert_true)
 from sklearn.cluster import KMeans
 
 from imblearn.under_sampling import ClusterCentroids
@@ -20,17 +22,26 @@
 R_TOL = 1e-4
 
 
+def test_fit_sample_check_voting():
+    cc = ClusterCentroids(random_state=RND_SEED)
+    cc.fit_sample(X, Y)
+    assert_equal(cc.voting_, 'soft')
+    cc = ClusterCentroids(random_state=RND_SEED)
+    cc.fit_sample(sparse.csr_matrix(X), Y)
+    assert_equal(cc.voting_, 'hard')
+
+
 def test_fit_sample_auto():
     ratio = 'auto'
     cc = ClusterCentroids(ratio=ratio, random_state=RND_SEED)
     X_resampled, y_resampled = cc.fit_sample(X, Y)
-    X_gt = np.array([[0.06738818, -0.529627],
-                     [0.17901516, 0.69860992],
-                     [0.094035, -2.55298982],
-                     [0.92923648, 0.76103773],
+    X_gt = np.array([[0.92923648, 0.76103773],
                      [0.47104475, 0.44386323],
-                     [0.13347175, 0.12167502]])
-    y_gt = np.array([1, 1, 1, 0, 0, 0])
+                     [0.13347175, 0.12167502],
+                     [0.06738818, -0.529627],
+                     [0.17901516, 0.69860992],
+                     [0.094035, -2.55298982]])
+    y_gt = np.array([0, 0, 0, 1, 1, 1])
     assert_allclose(X_resampled, X_gt, rtol=R_TOL)
     assert_array_equal(y_resampled, y_gt)
 
@@ -39,16 +50,16 @@ def test_fit_sample_half():
     ratio = .5
     cc = ClusterCentroids(ratio=ratio, random_state=RND_SEED)
     X_resampled, y_resampled = cc.fit_sample(X, Y)
-    X_gt = np.array([[0.09125309, -0.85409574],
+    X_gt = np.array([[0.92923648, 0.76103773],
+                     [0.47104475, 0.44386323],
+                     [0.13347175, 0.12167502],
+                     [0.09125309, -0.85409574],
                      [0.19220316, 0.32337101],
                      [0.094035, -2.55298982],
                      [0.20792588, 1.49407907],
                      [0.04352327, -0.20515826],
-                     [0.12372842, 0.6536186],
-                     [0.92923648, 0.76103773],
-                     [0.47104475, 0.44386323],
-                     [0.13347175, 0.12167502]])
-    y_gt = np.array([1, 1, 1, 1, 1, 1, 0, 0, 0])
+                     [0.12372842, 0.6536186]])
+    y_gt = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1])
     assert_allclose(X_resampled, X_gt, rtol=R_TOL)
     assert_array_equal(y_resampled, y_gt)
 
@@ -72,21 +83,48 @@ def test_fit_sample_object():
         ratio=ratio, random_state=RND_SEED, estimator=cluster)
     X_resampled, y_resampled = cc.fit_sample(X, Y)
-    X_gt = np.array([[0.06738818, -0.529627],
+    X_gt = np.array([[0.92923648, 0.76103773],
+                     [0.47104475, 0.44386323],
+                     [0.13347175, 0.12167502],
+                     [0.06738818, -0.529627],
                      [0.17901516, 0.69860992],
-                     [0.094035, -2.55298982],
-                     [0.92923648, 0.76103773],
+                     [0.094035, -2.55298982]])
+    y_gt = np.array([0, 0, 0, 1, 1, 1])
+    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
+    assert_array_equal(y_resampled, y_gt)
+
+
+def test_fit_hard_voting():
+    ratio = 'auto'
+    voting = 'hard'
+    cluster = KMeans(random_state=RND_SEED)
+    cc = ClusterCentroids(
+        ratio=ratio, random_state=RND_SEED, estimator=cluster,
+        voting=voting)
+
+    X_resampled, y_resampled = cc.fit_sample(X, Y)
+    X_gt = np.array([[0.92923648, 0.76103773],
                      [0.47104475, 0.44386323],
-                     [0.13347175, 0.12167502]])
-    y_gt = np.array([1, 1, 1, 0, 0, 0])
+                     [0.13347175, 0.12167502],
+                     [0.09125309, -0.85409574],
+                     [0.12372842, 0.6536186],
+                     [0.094035, -2.55298982]])
+    y_gt = np.array([0, 0, 0, 1, 1, 1])
     assert_allclose(X_resampled, X_gt, rtol=R_TOL)
     assert_array_equal(y_resampled, y_gt)
+    for x in X_resampled:
+        assert_true(np.any(np.all(x == X, axis=1)))
 
 
-def test_fit_sample_wrong_object():
+def test_fit_sample_error():
     ratio = 'auto'
     cluster = 'rnd'
     cc = ClusterCentroids(
         ratio=ratio, random_state=RND_SEED, estimator=cluster)
     assert_raises_regex(ValueError, "has to be a KMeans clustering",
                         cc.fit_sample, X, Y)
+
+    voting = 'unknown'
+    cc = ClusterCentroids(ratio=ratio, voting=voting, random_state=RND_SEED)
+    assert_raises_regex(ValueError, "needs to be one of",
+                        cc.fit_sample, X, Y)
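The first new test above, `test_fit_sample_check_voting`, pins down the 'auto' resolution rule. Restated in isolation (a sketch under the same assumptions as the tests; the toy dataset here is illustrative, not the fixture the tests use):

from scipy import sparse
from sklearn.datasets import make_classification

from imblearn.under_sampling import ClusterCentroids

X, y = make_classification(weights=[0.2, 0.8], n_samples=100, random_state=0)

cc = ClusterCentroids(random_state=0)
cc.fit_sample(X, y)                      # dense input ...
assert cc.voting_ == 'soft'              # ... resolves 'auto' to soft voting

cc = ClusterCentroids(random_state=0)
cc.fit_sample(sparse.csr_matrix(X), y)   # sparse input ...
assert cc.voting_ == 'hard'              # ... resolves 'auto' to hard voting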
From e1ffb13a5ec6232dcbe743a7e1be985cf943aff9 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Mon, 14 Aug 2017 20:45:02 +0200
Subject: [PATCH 28/28] TST fix common test fixing voting

---
 imblearn/utils/estimator_checks.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py
index 9fe958bf2..7a851cc06 100644
--- a/imblearn/utils/estimator_checks.py
+++ b/imblearn/utils/estimator_checks.py
@@ -279,6 +279,7 @@ def check_samplers_sparse(name, Sampler):
     elif isinstance(Sampler(), ClusterCentroids):
         # set KMeans to full since it supports sparse and dense
         samplers = [Sampler(random_state=0,
+                            voting='soft',
                             estimator=KMeans(random_state=1,
                                              algorithm='full'))]
     else:
@@ -287,14 +288,9 @@ def check_samplers_sparse(name, Sampler):
         X_res_sparse, y_res_sparse = sampler.fit_sample(X_sparse, y)
         X_res, y_res = sampler.fit_sample(X, y)
         if not isinstance(sampler, BaseEnsembleSampler):
-            if not isinstance(sampler, ClusterCentroids):
                 assert_true(sparse.issparse(X_res_sparse))
                 assert_allclose(X_res_sparse.A, X_res)
                 assert_allclose(y_res_sparse, y_res)
-            else:
-                assert_true(sparse.issparse(X_res_sparse))
-                assert_allclose(X_res_sparse.A, X_res, rtol=1e-4, atol=1e-4)
-                assert_allclose(y_res_sparse, y_res)
         else:
             for x_sp, x, y_sp, y in zip(X_res_sparse, X_res,
                                         y_res_sparse, y_res):
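The contract this last fix restores, restated as a standalone check (a sketch of the common test's intent, not the test itself; the toy dataset and exact equality on it are assumptions): with `voting='soft'` and a KMeans that handles both input kinds, the sparse and dense paths should produce identical resampled data, so the ClusterCentroids special case with loosened tolerances can be dropped.

import numpy as np
from scipy import sparse
from sklearn.cluster import KMeans
from sklearn.datasets import make_classification

from imblearn.under_sampling import ClusterCentroids

X, y = make_classification(weights=[0.2, 0.8], n_samples=100, random_state=0)

sampler = ClusterCentroids(random_state=0, voting='soft',
                           estimator=KMeans(random_state=1,
                                            algorithm='full'))
X_sp, y_sp = sampler.fit_sample(sparse.csr_matrix(X), y)
X_dense, y_dense = sampler.fit_sample(X, y)

assert sparse.issparse(X_sp)                 # sparse in, sparse out
np.testing.assert_allclose(X_sp.A, X_dense)  # same values on both paths
np.testing.assert_allclose(y_sp, y_dense)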