From 46f2cb58a56fa0ad51d08ec7df011fab51924581 Mon Sep 17 00:00:00 2001 From: Matt Eding Date: Mon, 18 Nov 2019 10:35:55 -0800 Subject: [PATCH 1/5] vectorized adasyn; fixed adasyn module docstring; todo: update unit tests due to random state changes --- imblearn/over_sampling/_adasyn.py | 93 +++++++++---------------------- imblearn/over_sampling/_smote.py | 5 +- 2 files changed, 29 insertions(+), 69 deletions(-) diff --git a/imblearn/over_sampling/_adasyn.py b/imblearn/over_sampling/_adasyn.py index 85f314c78..fbbda867f 100644 --- a/imblearn/over_sampling/_adasyn.py +++ b/imblearn/over_sampling/_adasyn.py @@ -1,4 +1,4 @@ -"""Class to perform random over-sampling.""" +"""Class to perform over-sampling using ADASYN.""" # Authors: Guillaume Lemaitre # Christos Aridas @@ -100,12 +100,12 @@ def _validate_estimator(self): ) self.nn_.set_params(**{"n_jobs": self.n_jobs}) - def _fit_resample(self, X, y): +def _fit_resample(self, X, y): self._validate_estimator() random_state = check_random_state(self.random_state) - X_resampled = X.copy() - y_resampled = y.copy() + X_resampled = [X.copy()] + y_resampled = [y.copy()] for class_sample, n_samples in self.sampling_strategy_.items(): if n_samples == 0: @@ -114,13 +114,11 @@ def _fit_resample(self, X, y): X_class = _safe_indexing(X, target_class_indices) self.nn_.fit(X) - _, nn_index = self.nn_.kneighbors(X_class) + nns = self.nn_.kneighbors(X_class, return_distance=False)[:, 1:] # The ratio is computed using a one-vs-rest manner. Using majority # in multi-class would lead to slightly different results at the # cost of introducing a new parameter. 
- ratio_nn = np.sum(y[nn_index[:, 1:]] != class_sample, axis=1) / ( - self.nn_.n_neighbors - 1 - ) + ratio_nn = np.sum(y[nns] != class_sample, axis=1) / self.n_neighbors if not np.sum(ratio_nn): raise RuntimeError( "Not any neigbours belong to the majority" @@ -140,66 +138,29 @@ def _fit_resample(self, X, y): # the nearest neighbors need to be fitted only on the current class # to find the class NN to generate new samples self.nn_.fit(X_class) - _, nn_index = self.nn_.kneighbors(X_class) + nns = self.nn_.kneighbors(X_class, return_distance=False)[:, 1:] - if sparse.issparse(X): - row_indices, col_indices, samples = [], [], [] - n_samples_generated = 0 - for x_i, x_i_nn, num_sample_i in zip( - X_class, nn_index, n_samples_generate - ): - if num_sample_i == 0: - continue - nn_zs = random_state.randint( - 1, high=self.nn_.n_neighbors, size=num_sample_i - ) - steps = random_state.uniform(size=len(nn_zs)) - if x_i.nnz: - for step, nn_z in zip(steps, nn_zs): - sample = x_i + step * ( - X_class[x_i_nn[nn_z], :] - x_i - ) - row_indices += [n_samples_generated] * len( - sample.indices - ) - col_indices += sample.indices.tolist() - samples += sample.data.tolist() - n_samples_generated += 1 - X_new = sparse.csr_matrix( - (samples, (row_indices, col_indices)), - [np.sum(n_samples_generate), X.shape[1]], - dtype=X.dtype, - ) - y_new = np.array( - [class_sample] * np.sum(n_samples_generate), dtype=y.dtype - ) - else: - x_class_gen = [] - for x_i, x_i_nn, num_sample_i in zip( - X_class, nn_index, n_samples_generate - ): - if num_sample_i == 0: - continue - nn_zs = random_state.randint( - 1, high=self.nn_.n_neighbors, size=num_sample_i - ) - steps = random_state.uniform(size=len(nn_zs)) - x_class_gen.append( - [ - x_i + step * (X_class[x_i_nn[nn_z], :] - x_i) - for step, nn_z in zip(steps, nn_zs) - ] - ) - - X_new = np.concatenate(x_class_gen).astype(X.dtype) - y_new = np.array( - [class_sample] * np.sum(n_samples_generate), dtype=y.dtype - ) + rows = 
np.repeat(target_class_indices, n_samples_generate) + cols = random_state.choice(self.n_neighbors, size=n_samples) + diffs = X_class[nns[rows, cols]] - X_class[rows] + steps = random_state.uniform(size=(n_samples, 1)) - if sparse.issparse(X_new): - X_resampled = sparse.vstack([X_resampled, X_new]) + if sparse.issparse(X): + sparse_func = type(X).__name__ + steps = getattr(sparse, sparse_func)(steps) + X_new = X_class[rows] + steps.multiply(diffs) else: - X_resampled = np.vstack((X_resampled, X_new)) - y_resampled = np.hstack((y_resampled, y_new)) + X_new = X_class[rows] + steps * diffs + + X_new = X_new.astype(X.dtype) + y_new = np.full(n_samples, fill_value=class_sample, dtype=y.dtype) + X_resampled.append(X_new) + y_resampled.append(y_new) + + if sparse.issparse(X): + X_resampled = sparse.vstack(X_resampled, format=X.format) + else: + X_resampled = np.vstack(X_resampled) + y_resampled = np.hstack(y_resampled) return X_resampled, y_resampled diff --git a/imblearn/over_sampling/_smote.py b/imblearn/over_sampling/_smote.py index b764da6b6..9ff9d2771 100644 --- a/imblearn/over_sampling/_smote.py +++ b/imblearn/over_sampling/_smote.py @@ -98,7 +98,7 @@ def _make_samples( """ random_state = check_random_state(self.random_state) samples_indices = random_state.randint( - low=0, high=len(nn_num.flatten()), size=n_samples + low=0, high=len(nn_num.ravel()), size=n_samples ) # np.newaxis for backwards compatability with random_state @@ -731,13 +731,12 @@ def _fit_resample(self, X, y): X_resampled.append(X_new) y_resampled.append(y_new) - if sparse.issparse(X_new): + if sparse.issparse(X): X_resampled = sparse.vstack(X_resampled, format=X.format) else: X_resampled = np.vstack(X_resampled) y_resampled = np.hstack(y_resampled) - return X_resampled, y_resampled From e0b5c94fab9e6f781a67f328ebe401ea3b938f76 Mon Sep 17 00:00:00 2001 From: Matt Eding Date: Mon, 18 Nov 2019 10:44:47 -0800 Subject: [PATCH 2/5] fix indentation error --- imblearn/over_sampling/_adasyn.py | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/imblearn/over_sampling/_adasyn.py b/imblearn/over_sampling/_adasyn.py index fbbda867f..c8f517791 100644 --- a/imblearn/over_sampling/_adasyn.py +++ b/imblearn/over_sampling/_adasyn.py @@ -100,7 +100,7 @@ def _validate_estimator(self): ) self.nn_.set_params(**{"n_jobs": self.n_jobs}) -def _fit_resample(self, X, y): + def _fit_resample(self, X, y): self._validate_estimator() random_state = check_random_state(self.random_state) From 07175eff03a557d706d893617c0ee0a7aa562eb2 Mon Sep 17 00:00:00 2001 From: Matt Eding Date: Mon, 18 Nov 2019 15:44:14 -0800 Subject: [PATCH 3/5] fixed row selection indices; fixed n_samples to work with non-ints --- imblearn/over_sampling/_adasyn.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/imblearn/over_sampling/_adasyn.py b/imblearn/over_sampling/_adasyn.py index c8f517791..08b8bf377 100644 --- a/imblearn/over_sampling/_adasyn.py +++ b/imblearn/over_sampling/_adasyn.py @@ -118,7 +118,8 @@ def _fit_resample(self, X, y): # The ratio is computed using a one-vs-rest manner. Using majority # in multi-class would lead to slightly different results at the # cost of introducing a new parameter. 
- ratio_nn = np.sum(y[nns] != class_sample, axis=1) / self.n_neighbors + n_neighbors = self.nn_.n_neighbors - 1 + ratio_nn = np.sum(y[nns] != class_sample, axis=1) / n_neighbors if not np.sum(ratio_nn): raise RuntimeError( "Not any neigbours belong to the majority" @@ -140,8 +141,9 @@ def _fit_resample(self, X, y): self.nn_.fit(X_class) nns = self.nn_.kneighbors(X_class, return_distance=False)[:, 1:] - rows = np.repeat(target_class_indices, n_samples_generate) - cols = random_state.choice(self.n_neighbors, size=n_samples) + enumerated_class_indices = np.arange(len(target_class_indices)) + rows = np.repeat(enumerated_class_indices, n_samples_generate) + cols = random_state.choice(n_neighbors, size=n_samples) diffs = X_class[nns[rows, cols]] - X_class[rows] steps = random_state.uniform(size=(n_samples, 1)) From 85c173d5e8c65cff1522838186b27389406fa491 Mon Sep 17 00:00:00 2001 From: Matt Eding Date: Mon, 18 Nov 2019 16:48:02 -0800 Subject: [PATCH 4/5] fixed row & col shape occasional mismatch due to rounding in algorithm --- imblearn/over_sampling/_adasyn.py | 4 +++- imblearn/over_sampling/_smote.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/imblearn/over_sampling/_adasyn.py b/imblearn/over_sampling/_adasyn.py index 08b8bf377..f014243e7 100644 --- a/imblearn/over_sampling/_adasyn.py +++ b/imblearn/over_sampling/_adasyn.py @@ -130,7 +130,9 @@ def _fit_resample(self, X, y): ) ratio_nn /= np.sum(ratio_nn) n_samples_generate = np.rint(ratio_nn * n_samples).astype(int) - if not np.sum(n_samples_generate): + # rounding may cause new amount for n_samples + n_samples = np.sum(n_samples_generate) + if not n_samples: raise ValueError( "No samples will be generated with the" " provided ratio settings." 
diff --git a/imblearn/over_sampling/_smote.py b/imblearn/over_sampling/_smote.py index 9ff9d2771..cea14cfd2 100644 --- a/imblearn/over_sampling/_smote.py +++ b/imblearn/over_sampling/_smote.py @@ -98,7 +98,7 @@ def _make_samples( """ random_state = check_random_state(self.random_state) samples_indices = random_state.randint( - low=0, high=len(nn_num.ravel()), size=n_samples + low=0, high=nn_num.size, size=n_samples ) # np.newaxis for backwards compatability with random_state From ad574cea9ea0542ea25e91ba5ffd5314297df540 Mon Sep 17 00:00:00 2001 From: Matt Eding Date: Thu, 21 Nov 2019 23:01:42 -0800 Subject: [PATCH 5/5] update unit tests to reflect random state changes --- imblearn/over_sampling/tests/test_adasyn.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/imblearn/over_sampling/tests/test_adasyn.py b/imblearn/over_sampling/tests/test_adasyn.py index f7fcb07c7..87769f08e 100644 --- a/imblearn/over_sampling/tests/test_adasyn.py +++ b/imblearn/over_sampling/tests/test_adasyn.py @@ -72,10 +72,10 @@ def test_ada_fit_resample(): [-0.41635887, -0.38299653], [0.08711622, 0.93259929], [1.70580611, -0.11219234], - [0.94899098, -0.30508981], - [0.28204936, -0.13953426], - [1.58028868, -0.04089947], - [0.66117333, -0.28009063], + [0.88161986, -0.2829741], + [0.35681689, -0.18814597], + [1.4148276, 0.05308106], + [0.3136591, -0.31327875], ] ) y_gt = np.array( @@ -136,10 +136,10 @@ def test_ada_fit_resample_nn_obj(): [-0.41635887, -0.38299653], [0.08711622, 0.93259929], [1.70580611, -0.11219234], - [0.94899098, -0.30508981], - [0.28204936, -0.13953426], - [1.58028868, -0.04089947], - [0.66117333, -0.28009063], + [0.88161986, -0.2829741], + [0.35681689, -0.18814597], + [1.4148276, 0.05308106], + [0.3136591, -0.31327875], ] ) y_gt = np.array(