From 46f2cb58a56fa0ad51d08ec7df011fab51924581 Mon Sep 17 00:00:00 2001
From: Matt Eding <matteding@gmail.com>
Date: Mon, 18 Nov 2019 10:35:55 -0800
Subject: [PATCH 1/5] vectorized adasyn; fixed adasyn module docstring; todo:
 update unit tests due to random state changes

---
 imblearn/over_sampling/_adasyn.py | 93 +++++++++----------------------
 imblearn/over_sampling/_smote.py  |  5 +-
 2 files changed, 29 insertions(+), 69 deletions(-)

diff --git a/imblearn/over_sampling/_adasyn.py b/imblearn/over_sampling/_adasyn.py
index 85f314c78..fbbda867f 100644
--- a/imblearn/over_sampling/_adasyn.py
+++ b/imblearn/over_sampling/_adasyn.py
@@ -1,4 +1,4 @@
-﻿"""Class to perform random over-sampling."""
+﻿"""Class to perform over-sampling using ADASYN."""
 
 # Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
 #          Christos Aridas
@@ -100,12 +100,12 @@ def _validate_estimator(self):
         )
         self.nn_.set_params(**{"n_jobs": self.n_jobs})
 
-    def _fit_resample(self, X, y):
+def _fit_resample(self, X, y):
         self._validate_estimator()
         random_state = check_random_state(self.random_state)
 
-        X_resampled = X.copy()
-        y_resampled = y.copy()
+        X_resampled = [X.copy()]
+        y_resampled = [y.copy()]
 
         for class_sample, n_samples in self.sampling_strategy_.items():
             if n_samples == 0:
@@ -114,13 +114,11 @@ def _fit_resample(self, X, y):
             X_class = _safe_indexing(X, target_class_indices)
 
             self.nn_.fit(X)
-            _, nn_index = self.nn_.kneighbors(X_class)
+            nns = self.nn_.kneighbors(X_class, return_distance=False)[:, 1:]
             # The ratio is computed using a one-vs-rest manner. Using majority
             # in multi-class would lead to slightly different results at the
             # cost of introducing a new parameter.
-            ratio_nn = np.sum(y[nn_index[:, 1:]] != class_sample, axis=1) / (
-                self.nn_.n_neighbors - 1
-            )
+            ratio_nn = np.sum(y[nns] != class_sample, axis=1) / self.n_neighbors
             if not np.sum(ratio_nn):
                 raise RuntimeError(
                     "Not any neigbours belong to the majority"
@@ -140,66 +138,29 @@ def _fit_resample(self, X, y):
             # the nearest neighbors need to be fitted only on the current class
             # to find the class NN to generate new samples
             self.nn_.fit(X_class)
-            _, nn_index = self.nn_.kneighbors(X_class)
+            nns = self.nn_.kneighbors(X_class, return_distance=False)[:, 1:]
 
-            if sparse.issparse(X):
-                row_indices, col_indices, samples = [], [], []
-                n_samples_generated = 0
-                for x_i, x_i_nn, num_sample_i in zip(
-                    X_class, nn_index, n_samples_generate
-                ):
-                    if num_sample_i == 0:
-                        continue
-                    nn_zs = random_state.randint(
-                        1, high=self.nn_.n_neighbors, size=num_sample_i
-                    )
-                    steps = random_state.uniform(size=len(nn_zs))
-                    if x_i.nnz:
-                        for step, nn_z in zip(steps, nn_zs):
-                            sample = x_i + step * (
-                                X_class[x_i_nn[nn_z], :] - x_i
-                            )
-                            row_indices += [n_samples_generated] * len(
-                                sample.indices
-                            )
-                            col_indices += sample.indices.tolist()
-                            samples += sample.data.tolist()
-                            n_samples_generated += 1
-                X_new = sparse.csr_matrix(
-                    (samples, (row_indices, col_indices)),
-                    [np.sum(n_samples_generate), X.shape[1]],
-                    dtype=X.dtype,
-                )
-                y_new = np.array(
-                    [class_sample] * np.sum(n_samples_generate), dtype=y.dtype
-                )
-            else:
-                x_class_gen = []
-                for x_i, x_i_nn, num_sample_i in zip(
-                    X_class, nn_index, n_samples_generate
-                ):
-                    if num_sample_i == 0:
-                        continue
-                    nn_zs = random_state.randint(
-                        1, high=self.nn_.n_neighbors, size=num_sample_i
-                    )
-                    steps = random_state.uniform(size=len(nn_zs))
-                    x_class_gen.append(
-                        [
-                            x_i + step * (X_class[x_i_nn[nn_z], :] - x_i)
-                            for step, nn_z in zip(steps, nn_zs)
-                        ]
-                    )
-
-                X_new = np.concatenate(x_class_gen).astype(X.dtype)
-                y_new = np.array(
-                    [class_sample] * np.sum(n_samples_generate), dtype=y.dtype
-                )
+            rows = np.repeat(target_class_indices, n_samples_generate)
+            cols = random_state.choice(self.n_neighbors, size=n_samples)
+            diffs = X_class[nns[rows, cols]] - X_class[rows]
+            steps = random_state.uniform(size=(n_samples, 1))
 
-            if sparse.issparse(X_new):
-                X_resampled = sparse.vstack([X_resampled, X_new])
+            if sparse.issparse(X):
+                sparse_func = type(X).__name__
+                steps = getattr(sparse, sparse_func)(steps)
+                X_new = X_class[rows] + steps.multiply(diffs)
             else:
-                X_resampled = np.vstack((X_resampled, X_new))
-            y_resampled = np.hstack((y_resampled, y_new))
+                X_new = X_class[rows] + steps * diffs
+
+            X_new = X_new.astype(X.dtype)
+            y_new = np.full(n_samples, fill_value=class_sample, dtype=y.dtype)
+            X_resampled.append(X_new)
+            y_resampled.append(y_new)
+
+        if sparse.issparse(X):
+            X_resampled = sparse.vstack(X_resampled, format=X.format)
+        else:
+            X_resampled = np.vstack(X_resampled)
+        y_resampled = np.hstack(y_resampled)
 
         return X_resampled, y_resampled
diff --git a/imblearn/over_sampling/_smote.py b/imblearn/over_sampling/_smote.py
index b764da6b6..9ff9d2771 100644
--- a/imblearn/over_sampling/_smote.py
+++ b/imblearn/over_sampling/_smote.py
@@ -98,7 +98,7 @@ def _make_samples(
         """
         random_state = check_random_state(self.random_state)
         samples_indices = random_state.randint(
-            low=0, high=len(nn_num.flatten()), size=n_samples
+            low=0, high=len(nn_num.ravel()), size=n_samples
         )
 
         # np.newaxis for backwards compatability with random_state
@@ -731,13 +731,12 @@ def _fit_resample(self, X, y):
             X_resampled.append(X_new)
             y_resampled.append(y_new)
 
-        if sparse.issparse(X_new):
+        if sparse.issparse(X):
             X_resampled = sparse.vstack(X_resampled, format=X.format)
         else:
             X_resampled = np.vstack(X_resampled)
         y_resampled = np.hstack(y_resampled)
 
-
         return X_resampled, y_resampled
 
 

From e0b5c94fab9e6f781a67f328ebe401ea3b938f76 Mon Sep 17 00:00:00 2001
From: Matt Eding <matteding@gmail.com>
Date: Mon, 18 Nov 2019 10:44:47 -0800
Subject: [PATCH 2/5] fix indentation error

---
 imblearn/over_sampling/_adasyn.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/imblearn/over_sampling/_adasyn.py b/imblearn/over_sampling/_adasyn.py
index fbbda867f..c8f517791 100644
--- a/imblearn/over_sampling/_adasyn.py
+++ b/imblearn/over_sampling/_adasyn.py
@@ -100,7 +100,7 @@ def _validate_estimator(self):
         )
         self.nn_.set_params(**{"n_jobs": self.n_jobs})
 
-def _fit_resample(self, X, y):
+    def _fit_resample(self, X, y):
         self._validate_estimator()
         random_state = check_random_state(self.random_state)
 

From 07175eff03a557d706d893617c0ee0a7aa562eb2 Mon Sep 17 00:00:00 2001
From: Matt Eding <matteding@gmail.com>
Date: Mon, 18 Nov 2019 15:44:14 -0800
Subject: [PATCH 3/5] fixed row selection indices; fixed n_samples to work with
 non-ints

---
 imblearn/over_sampling/_adasyn.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/imblearn/over_sampling/_adasyn.py b/imblearn/over_sampling/_adasyn.py
index c8f517791..08b8bf377 100644
--- a/imblearn/over_sampling/_adasyn.py
+++ b/imblearn/over_sampling/_adasyn.py
@@ -118,7 +118,8 @@ def _fit_resample(self, X, y):
             # The ratio is computed using a one-vs-rest manner. Using majority
             # in multi-class would lead to slightly different results at the
             # cost of introducing a new parameter.
-            ratio_nn = np.sum(y[nns] != class_sample, axis=1) / self.n_neighbors
+            n_neighbors = self.nn_.n_neighbors - 1
+            ratio_nn = np.sum(y[nns] != class_sample, axis=1) / n_neighbors
             if not np.sum(ratio_nn):
                 raise RuntimeError(
                     "Not any neigbours belong to the majority"
@@ -140,8 +141,9 @@ def _fit_resample(self, X, y):
             self.nn_.fit(X_class)
             nns = self.nn_.kneighbors(X_class, return_distance=False)[:, 1:]
 
-            rows = np.repeat(target_class_indices, n_samples_generate)
-            cols = random_state.choice(self.n_neighbors, size=n_samples)
+            enumerated_class_indices = np.arange(len(target_class_indices))
+            rows = np.repeat(enumerated_class_indices, n_samples_generate)
+            cols = random_state.choice(n_neighbors, size=n_samples)
             diffs = X_class[nns[rows, cols]] - X_class[rows]
             steps = random_state.uniform(size=(n_samples, 1))
 

From 85c173d5e8c65cff1522838186b27389406fa491 Mon Sep 17 00:00:00 2001
From: Matt Eding <matteding@gmail.com>
Date: Mon, 18 Nov 2019 16:48:02 -0800
Subject: [PATCH 4/5] fixed row & col shape occasional mismatch due to
 rounding in algorithm

---
 imblearn/over_sampling/_adasyn.py | 4 +++-
 imblearn/over_sampling/_smote.py  | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/imblearn/over_sampling/_adasyn.py b/imblearn/over_sampling/_adasyn.py
index 08b8bf377..f014243e7 100644
--- a/imblearn/over_sampling/_adasyn.py
+++ b/imblearn/over_sampling/_adasyn.py
@@ -130,7 +130,9 @@ def _fit_resample(self, X, y):
                 )
             ratio_nn /= np.sum(ratio_nn)
             n_samples_generate = np.rint(ratio_nn * n_samples).astype(int)
-            if not np.sum(n_samples_generate):
+            # rounding may change the total, so recompute n_samples
+            n_samples = np.sum(n_samples_generate)
+            if not n_samples:
                 raise ValueError(
                     "No samples will be generated with the"
                     " provided ratio settings."
diff --git a/imblearn/over_sampling/_smote.py b/imblearn/over_sampling/_smote.py
index 9ff9d2771..cea14cfd2 100644
--- a/imblearn/over_sampling/_smote.py
+++ b/imblearn/over_sampling/_smote.py
@@ -98,7 +98,7 @@ def _make_samples(
         """
         random_state = check_random_state(self.random_state)
         samples_indices = random_state.randint(
-            low=0, high=len(nn_num.ravel()), size=n_samples
+            low=0, high=nn_num.size, size=n_samples
         )
 
         # np.newaxis for backwards compatability with random_state

From ad574cea9ea0542ea25e91ba5ffd5314297df540 Mon Sep 17 00:00:00 2001
From: Matt Eding <matteding@gmail.com>
Date: Thu, 21 Nov 2019 23:01:42 -0800
Subject: [PATCH 5/5] update unit tests to reflect random state changes

---
 imblearn/over_sampling/tests/test_adasyn.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/imblearn/over_sampling/tests/test_adasyn.py b/imblearn/over_sampling/tests/test_adasyn.py
index f7fcb07c7..87769f08e 100644
--- a/imblearn/over_sampling/tests/test_adasyn.py
+++ b/imblearn/over_sampling/tests/test_adasyn.py
@@ -72,10 +72,10 @@ def test_ada_fit_resample():
             [-0.41635887, -0.38299653],
             [0.08711622, 0.93259929],
             [1.70580611, -0.11219234],
-            [0.94899098, -0.30508981],
-            [0.28204936, -0.13953426],
-            [1.58028868, -0.04089947],
-            [0.66117333, -0.28009063],
+            [0.88161986, -0.2829741],
+            [0.35681689, -0.18814597],
+            [1.4148276, 0.05308106],
+            [0.3136591, -0.31327875],
         ]
     )
     y_gt = np.array(
@@ -136,10 +136,10 @@ def test_ada_fit_resample_nn_obj():
             [-0.41635887, -0.38299653],
             [0.08711622, 0.93259929],
             [1.70580611, -0.11219234],
-            [0.94899098, -0.30508981],
-            [0.28204936, -0.13953426],
-            [1.58028868, -0.04089947],
-            [0.66117333, -0.28009063],
+            [0.88161986, -0.2829741],
+            [0.35681689, -0.18814597],
+            [1.4148276, 0.05308106],
+            [0.3136591, -0.31327875],
         ]
     )
     y_gt = np.array(