From dba11acfea92d8298a91e0312e459e90f81864da Mon Sep 17 00:00:00 2001 From: Stephan Heijl Date: Thu, 21 Jun 2018 16:27:28 +0200 Subject: [PATCH 01/35] Initial K-Means SMOTE commit. --- imblearn/over_sampling/smote.py | 144 ++++++++++++++++++++- imblearn/over_sampling/tests/test_smote.py | 27 ++++ 2 files changed, 169 insertions(+), 2 deletions(-) diff --git a/imblearn/over_sampling/smote.py b/imblearn/over_sampling/smote.py index 8e8b7965e..a8997c416 100644 --- a/imblearn/over_sampling/smote.py +++ b/imblearn/over_sampling/smote.py @@ -12,7 +12,9 @@ from scipy import sparse from sklearn.svm import SVC +from sklearn.cluster import KMeans from sklearn.utils import check_random_state, safe_indexing +from sklearn.metrics.pairwise import pairwise_distances from .base import BaseOverSampler from ..exceptions import raise_isinstance_error @@ -20,7 +22,7 @@ from ..utils import Substitution from ..utils._docstring import _random_state_docstring -SMOTE_KIND = ('regular', 'borderline1', 'borderline2', 'svm') +SMOTE_KIND = ('regular', 'borderline1', 'borderline2', 'svm', 'kmeans') @Substitution( @@ -59,12 +61,16 @@ class SMOTE(BaseOverSampler): kind : str, optional (default='regular') The type of SMOTE algorithm to use one of the following options: - ``'regular'``, ``'borderline1'``, ``'borderline2'``, ``'svm'``. + ``'regular'``, ``'borderline1'``, ``'borderline2'``, ``'svm'``, ``'kmeans'``. svm_estimator : object, optional (default=SVC()) If ``kind='svm'``, a parametrized :class:`sklearn.svm.SVC` classifier can be passed. + n_kmeans_clusters: int, optional (default=10) + If ``kind='kmeans'``, the number of clusters that is the be used by the + k-means algorithm for sample identification. + n_jobs : int, optional (default=1) The number of threads to open if possible. 
@@ -133,6 +139,7 @@ def __init__(self, out_step=0.5, kind='regular', svm_estimator=None, + n_kmeans_clusters=10, n_jobs=1, ratio=None): super(SMOTE, self).__init__( @@ -143,6 +150,7 @@ def __init__(self, self.m_neighbors = m_neighbors self.out_step = out_step self.svm_estimator = svm_estimator + self.n_kmeans_clusters = n_kmeans_clusters self.n_jobs = n_jobs def _in_danger_noise(self, samples, target_class, y, kind='danger'): @@ -537,6 +545,136 @@ def _sample_svm(self, X, y): return X_resampled, y_resampled + def _find_cluster_sparsity(self, X): + """ Finds the sparsity of a cluster of samples. The sparsity is + calculated according to the method described in [4]_. """ + + euclidean_distances = pairwise_distances( + X, metric="euclidean", n_jobs=self.n_jobs + ) + + # Negate diagonal elements. + for ind in range(X.shape[0]): + euclidean_distances[ind, ind] = 0 + + non_diag_elements = (len(X) ** 2) - len(X) + mean_distance = euclidean_distances.sum() / non_diag_elements + + density = len(X) / (mean_distance ** 2) + sparsity = 1 / density + return sparsity + + def _sample_kmeans(self, X, y): + """Resample the dataset using the SMOTE K-Means implementation. + + Use the SMOTE K-Means algorithm proposed in [4]_. K-Means clustering + is used to select samples for over sampling. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Matrix containing the data which have to be sampled. + + y : array-like, shape (n_samples,) + Corresponding label for each sample in X. + + Returns + ------- + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) + The array containing the resampled data. + + y_resampled : ndarray, shape (n_samples_new,) + The corresponding label of `X_resampled` + + References + ---------- + .. [4] H. M. Nguyen, E. W. Cooper, K. Kamei, "Borderline over-sampling + for imbalanced data classification," International Journal of + Knowledge Engineering and Soft Data Paradigms, 3(1), pp.4-21, 2001. 
+ + """ + random_state = check_random_state(self.random_state) + X_resampled = X.copy() + y_resampled = y.copy() + + for class_sample, n_samples in self.sampling_strategy_.items(): + if n_samples == 0: + continue + target_class_indices = np.flatnonzero(y == class_sample) + X_class = safe_indexing(X, target_class_indices) + + km = KMeans( + self.n_kmeans_clusters, + random_state=self.random_state, + n_jobs=self.n_jobs + ) + X_clusters = km.fit_predict(X) + + valid_clusters = [] + cluster_sparsities = [] + + # Identify clusters where class_sample is the majority + for cluster_n in range(self.n_kmeans_clusters): + cluster_index = np.flatnonzero(X_clusters == cluster_n) + + X_cluster = safe_indexing(X, cluster_index) + y_cluster = safe_indexing(y, cluster_index) + + cluster_class_mean = (y_cluster == class_sample).mean() + + X_cluster_class = safe_indexing( + X_cluster, + np.flatnonzero(y_cluster == class_sample) + ) + + if len(X_cluster_class) < self.k_neighbors + 1: + continue + + if cluster_class_mean < 0.5: + continue + + valid_clusters.append(cluster_index) + cluster_sparsities.append( + self._find_cluster_sparsity(X_cluster_class) + ) + + cluster_weights = [ + cs / sum(cluster_sparsities) for cs in cluster_sparsities + ] + + for cluster_n in range(len(valid_clusters)): + X_cluster = safe_indexing(X, valid_clusters[cluster_n]) + y_cluster = safe_indexing(y, valid_clusters[cluster_n]) + + X_cluster_class = safe_indexing( + X_cluster, np.flatnonzero(y_cluster == class_sample) + ) + + self.nn_k_.fit(X_cluster_class) + + nns = self.nn_k_.kneighbors( + X_cluster_class, return_distance=False + )[:, 1:] + + c_n_samples = int(n_samples * cluster_weights[cluster_n]) + X_new, y_new = self._make_samples( + X_cluster_class, + class_sample, + X_class, + nns, + c_n_samples, + 1.0 + ) + + if sparse.issparse(X_new): + X_resampled = sparse.vstack([X_resampled, X_new]) + else: + X_resampled = np.vstack((X_resampled, X_new)) + y_resampled = np.hstack((y_resampled, y_new)) + + return 
X_resampled, y_resampled + def _sample(self, X, y): """Resample the dataset. @@ -566,3 +704,5 @@ def _sample(self, X, y): return self._sample_borderline(X, y) elif self.kind == 'svm': return self._sample_svm(X, y) + elif self.kind == 'kmeans': + return self._sample_kmeans(X, y) diff --git a/imblearn/over_sampling/tests/test_smote.py b/imblearn/over_sampling/tests/test_smote.py index 5346c39fd..55a5c5793 100644 --- a/imblearn/over_sampling/tests/test_smote.py +++ b/imblearn/over_sampling/tests/test_smote.py @@ -277,3 +277,30 @@ def test_sample_regular_wrong_svm(): with raises(ValueError, match="has to be one of"): smote.fit_sample(X, Y) + + +def test_sample_kmeans(): + kind = 'kmeans' + smote = SMOTE(random_state=RND_SEED, kind=kind, n_kmeans_clusters=3, k_neighbors=2) + X_resampled, y_resampled = smote.fit_sample(X, Y) + X_gt = np.array([ + [0.11622591, -0.0317206], [0.77481731, 0.60935141], + [1.25192108, -0.22367336], [0.53366841, -0.30312976], + [1.52091956, -0.49283504], [-0.28162401, -2.10400981], + [0.83680821, 1.72827342], [0.3084254, 0.33299982], + [0.70472253, -0.73309052], [0.28893132, -0.38761769], + [1.15514042, 0.0129463], [0.88407872, 0.35454207], + [1.31301027, -0.92648734], [-1.11515198, -0.93689695], + [-0.18410027, -0.45194484], [0.9281014, 0.53085498], + [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], + [0.08711622, 0.93259929], [1.70580611, -0.11219234], + [1.25192108, - 0.22367336], [1.45849179, - 0.17293647], + [0.92581435, - 0.29748169], [1.25192108, - 0.22367336] + ]) + + y_gt = np.array([ + 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0 + ]) + + assert_allclose(X_resampled, X_gt, rtol=R_TOL) + assert_array_equal(y_resampled, y_gt) \ No newline at end of file From d54ffc2c905c287ef446545c1724ddd47f93cab5 Mon Sep 17 00:00:00 2001 From: Stephan Heijl Date: Thu, 21 Jun 2018 16:54:38 +0200 Subject: [PATCH 02/35] PEP8, PyFlakes fixes, corrected paper reference. 
--- imblearn/over_sampling/smote.py | 11 ++++++----- imblearn/over_sampling/tests/test_smote.py | 13 +++++++++---- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/imblearn/over_sampling/smote.py b/imblearn/over_sampling/smote.py index a8997c416..585987afd 100644 --- a/imblearn/over_sampling/smote.py +++ b/imblearn/over_sampling/smote.py @@ -61,7 +61,8 @@ class SMOTE(BaseOverSampler): kind : str, optional (default='regular') The type of SMOTE algorithm to use one of the following options: - ``'regular'``, ``'borderline1'``, ``'borderline2'``, ``'svm'``, ``'kmeans'``. + ``'regular'``, ``'borderline1'``, ``'borderline2'``, ``'svm'``, + ``'kmeans'``. svm_estimator : object, optional (default=SVC()) If ``kind='svm'``, a parametrized :class:`sklearn.svm.SVC` @@ -589,9 +590,9 @@ def _sample_kmeans(self, X, y): References ---------- - .. [4] H. M. Nguyen, E. W. Cooper, K. Kamei, "Borderline over-sampling - for imbalanced data classification," International Journal of - Knowledge Engineering and Soft Data Paradigms, 3(1), pp.4-21, 2001. + .. 
[4] Felix Last, Georgios Douzas, and Fernando Bacao, "Oversampling + for Imbalanced Learning based on K-Means and SMOTE", + https://arxiv.org/pdf/1711.00837.pdf """ random_state = check_random_state(self.random_state) @@ -606,7 +607,7 @@ def _sample_kmeans(self, X, y): km = KMeans( self.n_kmeans_clusters, - random_state=self.random_state, + random_state=random_state, n_jobs=self.n_jobs ) X_clusters = km.fit_predict(X) diff --git a/imblearn/over_sampling/tests/test_smote.py b/imblearn/over_sampling/tests/test_smote.py index 55a5c5793..e86aae2fd 100644 --- a/imblearn/over_sampling/tests/test_smote.py +++ b/imblearn/over_sampling/tests/test_smote.py @@ -281,7 +281,12 @@ def test_sample_regular_wrong_svm(): def test_sample_kmeans(): kind = 'kmeans' - smote = SMOTE(random_state=RND_SEED, kind=kind, n_kmeans_clusters=3, k_neighbors=2) + smote = SMOTE( + random_state=RND_SEED, + kind=kind, + n_kmeans_clusters=3, + k_neighbors=2 + ) X_resampled, y_resampled = smote.fit_sample(X, Y) X_gt = np.array([ [0.11622591, -0.0317206], [0.77481731, 0.60935141], @@ -294,8 +299,8 @@ def test_sample_kmeans(): [-0.18410027, -0.45194484], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], [0.08711622, 0.93259929], [1.70580611, -0.11219234], - [1.25192108, - 0.22367336], [1.45849179, - 0.17293647], - [0.92581435, - 0.29748169], [1.25192108, - 0.22367336] + [0.98135505, 0.22510669], [0.80404479, -0.27321949], + [0.91314969, -0.376049], [0.82740979, -0.35957365] ]) y_gt = np.array([ @@ -303,4 +308,4 @@ def test_sample_kmeans(): ]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) - assert_array_equal(y_resampled, y_gt) \ No newline at end of file + assert_array_equal(y_resampled, y_gt) From 4a9b990f5ca67d13f19a419342d75b1c006bb172 Mon Sep 17 00:00:00 2001 From: Stephan Heijl Date: Thu, 21 Jun 2018 17:20:01 +0200 Subject: [PATCH 03/35] Added examples. 
--- .../over-sampling/plot_comparison_over_sampling.py | 10 ++++++---- examples/over-sampling/plot_smote.py | 13 ++++++++----- imblearn/over_sampling/smote.py | 4 ++-- 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/examples/over-sampling/plot_comparison_over_sampling.py b/examples/over-sampling/plot_comparison_over_sampling.py index ba22f7bbd..e9f6b75fe 100644 --- a/examples/over-sampling/plot_comparison_over_sampling.py +++ b/examples/over-sampling/plot_comparison_over_sampling.py @@ -215,18 +215,20 @@ def fit_sample(self, X, y): # the support vectors found using an SVM algorithm to create new samples. fig, ((ax1, ax2), (ax3, ax4), - (ax5, ax6), (ax7, ax8)) = plt.subplots(4, 2, figsize=(15, 30)) + (ax5, ax6), (ax7, ax8), + (ax9, ax10)) = plt.subplots(5, 2, figsize=(15, 30)) X, y = create_dataset(n_samples=5000, weights=(0.01, 0.05, 0.94), class_sep=0.8) -ax_arr = ((ax1, ax2), (ax3, ax4), (ax5, ax6), (ax7, ax8)) -string_add = ['regular', 'borderline-1', 'borderline-2', 'SVM'] +ax_arr = ((ax1, ax2), (ax3, ax4), (ax5, ax6), (ax7, ax8), (ax9, ax10)) +string_add = ['regular', 'borderline-1', 'borderline-2', 'SVM', 'K-Means'] for str_add, ax, sampler in zip(string_add, ax_arr, (SMOTE(random_state=0), SMOTE(random_state=0, kind='borderline1'), SMOTE(random_state=0, kind='borderline2'), - SMOTE(random_state=0, kind='svm'))): + SMOTE(random_state=0, kind='svm'), + SMOTE(random_state=0, kind='kmeans'))): clf = make_pipeline(sampler, LinearSVC()) clf.fit(X, y) plot_decision_function(X, y, clf, ax[0]) diff --git a/examples/over-sampling/plot_smote.py b/examples/over-sampling/plot_smote.py index b4fe22d3e..96d3053be 100644 --- a/examples/over-sampling/plot_smote.py +++ b/examples/over-sampling/plot_smote.py @@ -49,7 +49,7 @@ def plot_resampling(ax, X, y, title): X_vis = pca.fit_transform(X) # Apply regular SMOTE -kind = ['regular', 'borderline1', 'borderline2', 'svm'] +kind = ['regular', 'borderline1', 'borderline2', 'svm', 'kmeans'] sm = [SMOTE(kind=k) for k 
in kind] X_resampled = [] y_resampled = [] @@ -61,12 +61,15 @@ def plot_resampling(ax, X, y, title): X_res_vis.append(pca.transform(X_res)) # Two subplots, unpack the axes array immediately -f, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(3, 2) -# Remove axis for second plot +f, ((ax1, ax2, ax3), (ax4, ax5, ax6), (ax7, ax8, ax9)) = plt.subplots(3, 3) +# Remove axis for first and second plot +ax1.axis('off') ax2.axis('off') -ax_res = [ax3, ax4, ax5, ax6] +ax3.axis('off') -c0, c1 = plot_resampling(ax1, X_vis, y, 'Original set') +ax_res = [ax5, ax6, ax7, ax8, ax9] + +c0, c1 = plot_resampling(ax4, X_vis, y, 'Original set') for i in range(len(kind)): plot_resampling(ax_res[i], X_res_vis[i], y_resampled[i], 'SMOTE {}'.format(kind[i])) diff --git a/imblearn/over_sampling/smote.py b/imblearn/over_sampling/smote.py index 585987afd..aeb53b09f 100644 --- a/imblearn/over_sampling/smote.py +++ b/imblearn/over_sampling/smote.py @@ -140,7 +140,7 @@ def __init__(self, out_step=0.5, kind='regular', svm_estimator=None, - n_kmeans_clusters=10, + n_kmeans_clusters=25, n_jobs=1, ratio=None): super(SMOTE, self).__init__( @@ -561,7 +561,7 @@ def _find_cluster_sparsity(self, X): non_diag_elements = (len(X) ** 2) - len(X) mean_distance = euclidean_distances.sum() / non_diag_elements - density = len(X) / (mean_distance ** 2) + density = len(X) / (mean_distance ** np.log10(len(X))) sparsity = 1 / density return sparsity From 5dd052686c089527802101449382d781433ceee4 Mon Sep 17 00:00:00 2001 From: Stephan Heijl Date: Thu, 21 Jun 2018 17:45:16 +0200 Subject: [PATCH 04/35] Added error when clustering fails to find a cluster with sufficient samples. 
--- imblearn/over_sampling/smote.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/imblearn/over_sampling/smote.py b/imblearn/over_sampling/smote.py index aeb53b09f..9ab603046 100644 --- a/imblearn/over_sampling/smote.py +++ b/imblearn/over_sampling/smote.py @@ -644,6 +644,12 @@ def _sample_kmeans(self, X, y): cs / sum(cluster_sparsities) for cs in cluster_sparsities ] + if not valid_clusters: + raise RuntimeError( + "No clusters found with sufficient samples of" + "class {}.".format(class_sample) + ) + for cluster_n in range(len(valid_clusters)): X_cluster = safe_indexing(X, valid_clusters[cluster_n]) y_cluster = safe_indexing(y, valid_clusters[cluster_n]) From 642e62e5ebc98c6223691c9f6dbc015ce49393d0 Mon Sep 17 00:00:00 2001 From: Stephan Heijl Date: Fri, 22 Jun 2018 11:41:12 +0200 Subject: [PATCH 05/35] Added test for wrong hyperparameters --- imblearn/over_sampling/tests/test_smote.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/imblearn/over_sampling/tests/test_smote.py b/imblearn/over_sampling/tests/test_smote.py index e86aae2fd..4175632c6 100644 --- a/imblearn/over_sampling/tests/test_smote.py +++ b/imblearn/over_sampling/tests/test_smote.py @@ -309,3 +309,15 @@ def test_sample_kmeans(): assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) + + +def test_sample_kmeans_wrong_hyperparams(): + kind = 'kmeans' + smote = SMOTE( + random_state=RND_SEED, + kind=kind, + n_kmeans_clusters=10, + k_neighbors=4 + ) + with raises(RuntimeError, match="No clusters found"): + smote.fit_sample(X, Y) From fd663f1606bc128d3801862140fef4cdd21ed2ff Mon Sep 17 00:00:00 2001 From: Stephan Heijl Date: Fri, 22 Jun 2018 13:48:10 +0200 Subject: [PATCH 06/35] Save an indexing operation if cluster_class_mean is insufficient. 
--- imblearn/over_sampling/smote.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/imblearn/over_sampling/smote.py b/imblearn/over_sampling/smote.py index 9ab603046..102208b26 100644 --- a/imblearn/over_sampling/smote.py +++ b/imblearn/over_sampling/smote.py @@ -624,6 +624,9 @@ def _sample_kmeans(self, X, y): cluster_class_mean = (y_cluster == class_sample).mean() + if cluster_class_mean < 0.5: + continue + X_cluster_class = safe_indexing( X_cluster, np.flatnonzero(y_cluster == class_sample) @@ -632,9 +635,6 @@ def _sample_kmeans(self, X, y): if len(X_cluster_class) < self.k_neighbors + 1: continue - if cluster_class_mean < 0.5: - continue - valid_clusters.append(cluster_index) cluster_sparsities.append( self._find_cluster_sparsity(X_cluster_class) From 0ef982b450305cd7659a098bd163e6921b6efb02 Mon Sep 17 00:00:00 2001 From: Stephan Heijl Date: Fri, 22 Jun 2018 15:08:22 +0200 Subject: [PATCH 07/35] Simplified vstack function call. --- imblearn/over_sampling/smote.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/imblearn/over_sampling/smote.py b/imblearn/over_sampling/smote.py index 102208b26..5f4b506f0 100644 --- a/imblearn/over_sampling/smote.py +++ b/imblearn/over_sampling/smote.py @@ -674,10 +674,8 @@ def _sample_kmeans(self, X, y): 1.0 ) - if sparse.issparse(X_new): - X_resampled = sparse.vstack([X_resampled, X_new]) - else: - X_resampled = np.vstack((X_resampled, X_new)) + stack = [sparse.vstack, np.vstack][int(sparse.issparse(X_new))] + X_resampled = stack([X_resampled, X_new]) y_resampled = np.hstack((y_resampled, y_new)) return X_resampled, y_resampled From 4c375932cb1e061663fd2e9d9327b299eae3ed06 Mon Sep 17 00:00:00 2001 From: Stephan Heijl Date: Fri, 22 Jun 2018 15:11:03 +0200 Subject: [PATCH 08/35] Resolved stacking error --- imblearn/over_sampling/smote.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/imblearn/over_sampling/smote.py b/imblearn/over_sampling/smote.py index 
5f4b506f0..929edafca 100644 --- a/imblearn/over_sampling/smote.py +++ b/imblearn/over_sampling/smote.py @@ -674,8 +674,8 @@ def _sample_kmeans(self, X, y): 1.0 ) - stack = [sparse.vstack, np.vstack][int(sparse.issparse(X_new))] - X_resampled = stack([X_resampled, X_new]) + stack = [np.vstack, sparse.vstack][int(sparse.issparse(X_new))] + X_resampled = stack((X_resampled, X_new)) y_resampled = np.hstack((y_resampled, y_new)) return X_resampled, y_resampled From efb6a7539f0c4fc5066ebce8753509f1084e58c3 Mon Sep 17 00:00:00 2001 From: Stephan Heijl Date: Sun, 1 Jul 2018 15:19:58 +0200 Subject: [PATCH 09/35] Added extra arguments for kmeans sampling, addressed suggestions by Felix Last. --- imblearn/over_sampling/smote.py | 54 +++++++++++++++++++++++++++------ 1 file changed, 45 insertions(+), 9 deletions(-) diff --git a/imblearn/over_sampling/smote.py b/imblearn/over_sampling/smote.py index 929edafca..c3c965f04 100644 --- a/imblearn/over_sampling/smote.py +++ b/imblearn/over_sampling/smote.py @@ -7,6 +7,8 @@ from __future__ import division +import math + import numpy as np from scipy import sparse @@ -72,6 +74,16 @@ class SMOTE(BaseOverSampler): If ``kind='kmeans'``, the number of clusters that is the be used by the k-means algorithm for sample identification. + cluster_balance_threshold: float, optional (default=0.5) + If ``kind='kmeans'``, the threshold at which a cluster is called + balanced and where samples of the class selected for SMOTE will be + oversampled. + + density_estimation_exponent: str or float, optional (default="auto") + If ``kind='kmeans'``, this exponent is used to determine the density + of a cluster. Leaving it to 'auto' will use a feature-length based + exponent, but any floating point number will be accepted. + n_jobs : int, optional (default=1) The number of threads to open if possible. 
@@ -141,6 +153,8 @@ def __init__(self, kind='regular', svm_estimator=None, n_kmeans_clusters=25, + cluster_balance_threshold=0.5, + density_estimation_exponent="auto", n_jobs=1, ratio=None): super(SMOTE, self).__init__( @@ -152,6 +166,8 @@ def __init__(self, self.out_step = out_step self.svm_estimator = svm_estimator self.n_kmeans_clusters = n_kmeans_clusters + self.cluster_balance_threshold = cluster_balance_threshold + self.density_estimation_exponent = density_estimation_exponent self.n_jobs = n_jobs def _in_danger_noise(self, samples, target_class, y, kind='danger'): @@ -548,7 +564,12 @@ def _sample_svm(self, X, y): def _find_cluster_sparsity(self, X): """ Finds the sparsity of a cluster of samples. The sparsity is - calculated according to the method described in [4]_. """ + calculated according to the method described in [4]_. `de` is + specified with the `densitity_estimation_exponent`, which defaults + to 'auto'. With automatic exponent selection, a value is chosen + that closely fits the magnitude in all directions of a unit vector + in that feature space according the formula `log(n, 1.6) ** 1.8 * 0.16` + where n indicates the number of dimensions. 
""" euclidean_distances = pairwise_distances( X, metric="euclidean", n_jobs=self.n_jobs @@ -561,7 +582,11 @@ def _find_cluster_sparsity(self, X): non_diag_elements = (len(X) ** 2) - len(X) mean_distance = euclidean_distances.sum() / non_diag_elements - density = len(X) / (mean_distance ** np.log10(len(X))) + de = self.density_estimation_exponent + if self.density_estimation_exponent == "auto": + de = math.log(len(X), 1.6) ** 1.8 * 0.16 + + density = len(X) / (mean_distance ** de) sparsity = 1 / density return sparsity @@ -615,7 +640,12 @@ def _sample_kmeans(self, X, y): valid_clusters = [] cluster_sparsities = [] - # Identify clusters where class_sample is the majority + cluster_balance_threshold = self.cluster_balance_threshold + # A specifying a single cluster will act as normal SMOTE + if self.n_kmeans_clusters == 1: + cluster_balance_threshold = float("-inf") + + # Identify valid clusters for cluster_n in range(self.n_kmeans_clusters): cluster_index = np.flatnonzero(X_clusters == cluster_n) @@ -624,7 +654,10 @@ def _sample_kmeans(self, X, y): cluster_class_mean = (y_cluster == class_sample).mean() - if cluster_class_mean < 0.5: + if cluster_class_mean < cluster_balance_threshold: + continue + + if len(X_cluster) < 2: continue X_cluster_class = safe_indexing( @@ -632,9 +665,6 @@ def _sample_kmeans(self, X, y): np.flatnonzero(y_cluster == class_sample) ) - if len(X_cluster_class) < self.k_neighbors + 1: - continue - valid_clusters.append(cluster_index) cluster_sparsities.append( self._find_cluster_sparsity(X_cluster_class) @@ -658,9 +688,15 @@ def _sample_kmeans(self, X, y): X_cluster, np.flatnonzero(y_cluster == class_sample) ) - self.nn_k_.fit(X_cluster_class) + nn_k = check_neighbors_object( + 'k_neighbors', + min(self.k_neighbors, len(X_cluster_class)) - 1, + additional_neighbor=1 + ) - nns = self.nn_k_.kneighbors( + nn_k.fit(X_cluster_class) + + nns = nn_k.kneighbors( X_cluster_class, return_distance=False )[:, 1:] From 
131e3b3bafd2a3ae8b56588a7ac921225aed5044 Mon Sep 17 00:00:00 2001 From: Stephan Heijl Date: Sun, 1 Jul 2018 15:33:59 +0200 Subject: [PATCH 10/35] Resolved errors and warnings --- imblearn/over_sampling/smote.py | 18 ++++++++++---- imblearn/over_sampling/tests/test_smote.py | 28 ++++++++++++---------- 2 files changed, 28 insertions(+), 18 deletions(-) diff --git a/imblearn/over_sampling/smote.py b/imblearn/over_sampling/smote.py index c3c965f04..cff75ad66 100644 --- a/imblearn/over_sampling/smote.py +++ b/imblearn/over_sampling/smote.py @@ -564,11 +564,12 @@ def _sample_svm(self, X, y): def _find_cluster_sparsity(self, X): """ Finds the sparsity of a cluster of samples. The sparsity is - calculated according to the method described in [4]_. `de` is - specified with the `densitity_estimation_exponent`, which defaults + calculated according to the method described in [4]_. ``'de'`` is + specified with the ``'densitity_estimation_exponent'``, which defaults to 'auto'. With automatic exponent selection, a value is chosen that closely fits the magnitude in all directions of a unit vector - in that feature space according the formula `log(n, 1.6) ** 1.8 * 0.16` + in that feature space according the formula + ``'log(n, 1.6) ** 1.8 * 0.16'`` where n indicates the number of dimensions. 
""" euclidean_distances = pairwise_distances( @@ -657,7 +658,7 @@ def _sample_kmeans(self, X, y): if cluster_class_mean < cluster_balance_threshold: continue - if len(X_cluster) < 2: + if len(X_cluster) < 3: continue X_cluster_class = safe_indexing( @@ -688,12 +689,19 @@ def _sample_kmeans(self, X, y): X_cluster, np.flatnonzero(y_cluster == class_sample) ) + cluster_k_neighbours = min( + self.k_neighbors, + len(X_cluster_class) + ) + nn_k = check_neighbors_object( 'k_neighbors', - min(self.k_neighbors, len(X_cluster_class)) - 1, + cluster_k_neighbours - 1, additional_neighbor=1 ) + print(X_cluster_class, cluster_k_neighbours) + nn_k.fit(X_cluster_class) nns = nn_k.kneighbors( diff --git a/imblearn/over_sampling/tests/test_smote.py b/imblearn/over_sampling/tests/test_smote.py index 4175632c6..ffe33a05c 100644 --- a/imblearn/over_sampling/tests/test_smote.py +++ b/imblearn/over_sampling/tests/test_smote.py @@ -289,24 +289,26 @@ def test_sample_kmeans(): ) X_resampled, y_resampled = smote.fit_sample(X, Y) X_gt = np.array([ - [0.11622591, -0.0317206], [0.77481731, 0.60935141], - [1.25192108, -0.22367336], [0.53366841, -0.30312976], - [1.52091956, -0.49283504], [-0.28162401, -2.10400981], - [0.83680821, 1.72827342], [0.3084254, 0.33299982], - [0.70472253, -0.73309052], [0.28893132, -0.38761769], - [1.15514042, 0.0129463], [0.88407872, 0.35454207], - [1.31301027, -0.92648734], [-1.11515198, -0.93689695], - [-0.18410027, -0.45194484], [0.9281014, 0.53085498], - [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], - [0.08711622, 0.93259929], [1.70580611, -0.11219234], - [0.98135505, 0.22510669], [0.80404479, -0.27321949], - [0.91314969, -0.376049], [0.82740979, -0.35957365] + [ 0.11622591, -0.0317206 ], [ 0.77481731, 0.60935141], + [ 1.25192108, -0.22367336], [ 0.53366841, -0.30312976], + [ 1.52091956, -0.49283504], [-0.28162401, -2.10400981], + [ 0.83680821, 1.72827342], [ 0.3084254, 0.33299982], + [ 0.70472253, -0.73309052], [ 0.28893132, -0.38761769], + [ 1.15514042, 
0.0129463 ], [ 0.88407872, 0.35454207], + [ 1.31301027, -0.92648734], [-1.11515198, -0.93689695], + [-0.18410027, -0.45194484], [ 0.9281014, 0.53085498], + [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], + [ 0.08711622, 0.93259929], [ 1.70580611, -0.11219234], + [ 1.06712799, -0.21623093], [ 0.94762989, -0.25733545], + [ 0.80963795, -0.52386435], [ 0.97198024, -0.65371403] ]) y_gt = np.array([ 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0 ]) + print(X_resampled) + assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) @@ -316,7 +318,7 @@ def test_sample_kmeans_wrong_hyperparams(): smote = SMOTE( random_state=RND_SEED, kind=kind, - n_kmeans_clusters=10, + n_kmeans_clusters=2, k_neighbors=4 ) with raises(RuntimeError, match="No clusters found"): From 7de59514becfc476d4b641c78746edc829ce382e Mon Sep 17 00:00:00 2001 From: Stephan Heijl Date: Sun, 1 Jul 2018 21:39:26 +0200 Subject: [PATCH 11/35] Resolve PEP8 style issues --- imblearn/over_sampling/tests/test_smote.py | 24 +++++++++++----------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/imblearn/over_sampling/tests/test_smote.py b/imblearn/over_sampling/tests/test_smote.py index ffe33a05c..b198d5c95 100644 --- a/imblearn/over_sampling/tests/test_smote.py +++ b/imblearn/over_sampling/tests/test_smote.py @@ -289,18 +289,18 @@ def test_sample_kmeans(): ) X_resampled, y_resampled = smote.fit_sample(X, Y) X_gt = np.array([ - [ 0.11622591, -0.0317206 ], [ 0.77481731, 0.60935141], - [ 1.25192108, -0.22367336], [ 0.53366841, -0.30312976], - [ 1.52091956, -0.49283504], [-0.28162401, -2.10400981], - [ 0.83680821, 1.72827342], [ 0.3084254, 0.33299982], - [ 0.70472253, -0.73309052], [ 0.28893132, -0.38761769], - [ 1.15514042, 0.0129463 ], [ 0.88407872, 0.35454207], - [ 1.31301027, -0.92648734], [-1.11515198, -0.93689695], - [-0.18410027, -0.45194484], [ 0.9281014, 0.53085498], - [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], - [ 0.08711622, 
0.93259929], [ 1.70580611, -0.11219234], - [ 1.06712799, -0.21623093], [ 0.94762989, -0.25733545], - [ 0.80963795, -0.52386435], [ 0.97198024, -0.65371403] + [0.11622591, -0.0317206], [0.77481731, 0.60935141], + [1.25192108, -0.22367336], [0.53366841, -0.30312976], + [1.52091956, -0.49283504], [-0.28162401, -2.10400981], + [0.83680821, 1.72827342], [0.3084254, 0.33299982], + [0.70472253, -0.73309052], [0.28893132, -0.38761769], + [1.15514042, 0.0129463], [0.88407872, 0.35454207], + [1.31301027, -0.92648734], [-1.11515198, -0.93689695], + [-0.18410027, -0.45194484], [0.9281014, 0.53085498], + [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], + [0.08711622, 0.93259929], [1.70580611, -0.11219234], + [1.06712799, -0.21623093], [0.94762989, -0.25733545], + [0.80963795, -0.52386435], [0.97198024, -0.65371403] ]) y_gt = np.array([ From 7029266166b266de5cc0a1a4c422627b7f5d9b61 Mon Sep 17 00:00:00 2001 From: Stephan Heijl Date: Thu, 12 Jul 2018 15:20:01 +0200 Subject: [PATCH 12/35] Added special k-means cases and tests. 
--- imblearn/over_sampling/smote.py | 11 +--- imblearn/over_sampling/tests/test_smote.py | 69 +++++++++++++++++++++- 2 files changed, 69 insertions(+), 11 deletions(-) diff --git a/imblearn/over_sampling/smote.py b/imblearn/over_sampling/smote.py index cff75ad66..9b01d3db1 100644 --- a/imblearn/over_sampling/smote.py +++ b/imblearn/over_sampling/smote.py @@ -658,9 +658,6 @@ def _sample_kmeans(self, X, y): if cluster_class_mean < cluster_balance_threshold: continue - if len(X_cluster) < 3: - continue - X_cluster_class = safe_indexing( X_cluster, np.flatnonzero(y_cluster == class_sample) @@ -677,7 +674,7 @@ def _sample_kmeans(self, X, y): if not valid_clusters: raise RuntimeError( - "No clusters found with sufficient samples of" + "No clusters found with sufficient samples of " "class {}.".format(class_sample) ) @@ -696,19 +693,17 @@ def _sample_kmeans(self, X, y): nn_k = check_neighbors_object( 'k_neighbors', - cluster_k_neighbours - 1, + cluster_k_neighbours, additional_neighbor=1 ) - print(X_cluster_class, cluster_k_neighbours) - nn_k.fit(X_cluster_class) - nns = nn_k.kneighbors( X_cluster_class, return_distance=False )[:, 1:] c_n_samples = int(n_samples * cluster_weights[cluster_n]) + X_new, y_new = self._make_samples( X_cluster_class, class_sample, diff --git a/imblearn/over_sampling/tests/test_smote.py b/imblearn/over_sampling/tests/test_smote.py index b198d5c95..dda04dbef 100644 --- a/imblearn/over_sampling/tests/test_smote.py +++ b/imblearn/over_sampling/tests/test_smote.py @@ -299,20 +299,83 @@ def test_sample_kmeans(): [-0.18410027, -0.45194484], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], [0.08711622, 0.93259929], [1.70580611, -0.11219234], - [1.06712799, -0.21623093], [0.94762989, -0.25733545], - [0.80963795, -0.52386435], [0.97198024, -0.65371403] + [0.98135505, 0.22510668], [0.80404478, -0.2732194], + [0.91314969, -0.37604899], [0.82740979, -0.35957364] ]) y_gt = np.array([ 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 
1, 1, 0, 1, 0, 0, 0, 0, 0 ]) - print(X_resampled) + assert_allclose(X_resampled, X_gt, rtol=R_TOL) + assert_array_equal(y_resampled, y_gt) + + +def test_sample_kmeans_density_estimation(): + kind = 'kmeans' + smote = SMOTE( + random_state=RND_SEED, + kind=kind, + n_kmeans_clusters=3, + k_neighbors=2, + density_estimation_exponent=2 + ) + X_resampled, y_resampled = smote.fit_sample(X, Y) + X_gt = np.array([ + [0.11622591, -0.0317206], [0.77481731, 0.60935141], + [1.25192108, -0.22367336], [0.53366841, -0.30312976], + [1.52091956, -0.49283504], [-0.28162401, -2.10400981], + [0.83680821, 1.72827342], [0.3084254, 0.33299982], + [0.70472253, -0.73309052], [0.28893132, -0.38761769], + [1.15514042, 0.0129463], [0.88407872, 0.35454207], + [1.31301027, -0.92648734], [-1.11515198, -0.93689695], + [-0.18410027, -0.45194484], [0.9281014, 0.53085498], + [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], + [0.08711622, 0.93259929], [1.70580611, -0.11219234], + [0.98135505, 0.22510668], [0.80404478, -0.2732194], + [0.91314969, -0.37604899], [0.82740979, -0.35957364] + ]) + + y_gt = np.array([ + 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0 + ]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) +def test_sample_kmeans_regular(): + """ KMeans sampling with a cluster size of 1 should result in regular + SMOTE.""" + kind = 'kmeans' + smote = SMOTE( + random_state=RND_SEED, + kind=kind, + n_kmeans_clusters=1, + ) + X_resampled, y_resampled = smote.fit_sample(X, Y) + X_gt = np.array([ + [0.11622591, -0.0317206], [0.77481731, 0.60935141], + [1.25192108, -0.22367336], [0.53366841, -0.30312976], + [1.52091956, -0.49283504], [-0.28162401, -2.10400981], + [0.83680821, 1.72827342], [0.3084254, 0.33299982], + [0.70472253, -0.73309052], [0.28893132, -0.38761769], + [1.15514042, 0.0129463], [0.88407872, 0.35454207], + [1.31301027, -0.92648734], [-1.11515198, -0.93689695], + [-0.18410027, -0.45194484], [0.9281014, 0.53085498], + 
[-0.14374509, 0.27370049], [-0.41635887, -0.38299653], + [0.08711622, 0.93259929], [1.70580611, -0.11219234], + [0.29307743, -0.14670439], [0.84976473, -0.15570176], + [0.61319159, -0.11571668], [0.66052536, -0.28246517] + ]) + + y_gt = np.array([ + 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0 + ]) + assert_allclose(X_resampled, X_gt, rtol=R_TOL) + assert_array_equal(y_resampled, y_gt) + + def test_sample_kmeans_wrong_hyperparams(): kind = 'kmeans' smote = SMOTE( From c5ab59c50e746f5924547a1f8bd7979fd2d680ec Mon Sep 17 00:00:00 2001 From: Stephan Heijl Date: Sun, 3 Mar 2019 11:41:25 +0100 Subject: [PATCH 13/35] Removed KMeans specific code --- .../plot_comparison_over_sampling.py | 4 +- imblearn/over_sampling/_smote.py | 157 ------------------ imblearn/over_sampling/tests/test_smote.py | 68 -------- 3 files changed, 1 insertion(+), 228 deletions(-) diff --git a/examples/over-sampling/plot_comparison_over_sampling.py b/examples/over-sampling/plot_comparison_over_sampling.py index e5d70cf92..3df5979da 100644 --- a/examples/over-sampling/plot_comparison_over_sampling.py +++ b/examples/over-sampling/plot_comparison_over_sampling.py @@ -21,8 +21,7 @@ from imblearn.pipeline import make_pipeline from imblearn.over_sampling import ADASYN -from imblearn.over_sampling import (SMOTE, BorderlineSMOTE, KMeansSMOTE, - SVMSMOTE) +from imblearn.over_sampling import (SMOTE, BorderlineSMOTE, SVMSMOTE) from imblearn.over_sampling import RandomOverSampler from imblearn.base import BaseSampler @@ -219,7 +218,6 @@ def _fit_resample(self, X, y): (SMOTE(random_state=0), BorderlineSMOTE(random_state=0, kind='borderline-1'), BorderlineSMOTE(random_state=0, kind='borderline-2'), - KMeansSMOTE(kmeans_estimator=3, random_state=0), SVMSMOTE(random_state=0))): clf = make_pipeline(sampler, LinearSVC()) clf.fit(X, y) diff --git a/imblearn/over_sampling/_smote.py b/imblearn/over_sampling/_smote.py index 118fec0df..70dfbdf2f 100644 --- a/imblearn/over_sampling/_smote.py +++ 
b/imblearn/over_sampling/_smote.py @@ -772,160 +772,3 @@ def _sample(self, X, y): return X_resampled, y_resampled -@Substitution( - sampling_strategy=BaseOverSampler._sampling_strategy_docstring, - random_state=_random_state_docstring) -class KMeansSMOTE(BaseSMOTE): - """Apply a KMeans clustering before to over-sample using SMOTE. - - This is an implementation of the algorithm described in [1]_. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - {sampling_strategy} - - {random_state} - - k_neighbors : int or object, optional (default=5) - If ``int``, number of nearest neighbours to used to construct synthetic - samples. If object, an estimator that inherits from - :class:`sklearn.neighbors.base.KNeighborsMixin` that will be used to - find the k_neighbors. - - n_jobs : int, optional (default=1) - The number of threads to open if possible. - - kmeans_estimator : int or object, optional (default=KMeans()) - A KMeans instance or the number of clusters. - - cluster_balance_threshold: float, optional (default=0.5) - The threshold at which a cluster is called balanced and where samples - of the class selected for SMOTE will be oversampled. - - density_exponent: str or float, optional (default="auto") - This exponent is used to determine the density of a cluster. Leaving it - to 'auto' will use a feature-length based exponent. 
- - """ - def __init__(self, - sampling_strategy='auto', - random_state=None, - k_neighbors=5, - n_jobs=1, - kmeans_estimator=None, - cluster_balance_threshold=0.5, - density_exponent="auto"): - super(KMeansSMOTE, self).__init__( - sampling_strategy=sampling_strategy, random_state=random_state, - k_neighbors=k_neighbors, n_jobs=n_jobs, ratio=None) - self.kmeans_estimator = kmeans_estimator - self.cluster_balance_threshold = cluster_balance_threshold - self.density_exponent = density_exponent - - def _validate_estimator(self): - super(KMeansSMOTE, self)._validate_estimator() - if self.kmeans_estimator is None: - self.kmeans_estimator_ = KMeans( - random_state=self.random_state) - elif isinstance(self.kmeans_estimator, Integral): - self.kmeans_estimator_ = KMeans( - n_clusters=self.kmeans_estimator, - random_state=self.random_state) - else: - self.kmeans_estimator_ = clone(self.kmeans_estimator) - - self.cluster_balance_threshold_ = ( - self.cluster_balance_threshold - if self.kmeans_estimator_.n_clusters != 1 else -np.inf) - - def _find_cluster_sparsity(self, X): - euclidean_distances = pairwise_distances(X, metric="euclidean", - n_jobs=self.n_jobs) - - # negate diagonal elements - for ind in range(X.shape[0]): - euclidean_distances[ind, ind] = 0 - - non_diag_elements = (len(X) ** 2) - len(X) - mean_distance = euclidean_distances.sum() / non_diag_elements - exponent = (math.log(len(X), 1.6) ** 1.8 * 0.16 - if self.density_exponent == 'auto' - else self.density_exponent) - return (mean_distance ** exponent) / X.shape[0] - - def _sample(self, X, y): - self._validate_estimator() - random_state = check_random_state(self.random_state) - X_resampled = X.copy() - y_resampled = y.copy() - - for class_sample, n_samples in self.sampling_strategy_.items(): - if n_samples == 0: - continue - target_class_indices = np.flatnonzero(y == class_sample) - X_class = safe_indexing(X, target_class_indices) - - X_clusters = self.kmeans_estimator_.fit_predict(X) - - valid_clusters = [] - 
cluster_sparsities = [] - - # Identify valid clusters - for cluster_idx in range(self.kmeans_estimator_.n_clusters): - cluster_mask = np.flatnonzero(X_clusters == cluster_idx) - - X_cluster = safe_indexing(X, cluster_mask) - y_cluster = safe_indexing(y, cluster_mask) - - cluster_class_mean = (y_cluster == class_sample).mean() - - if cluster_class_mean < self.cluster_balance_threshold_: - continue - - X_cluster_class = safe_indexing( - X_cluster, np.flatnonzero(y_cluster == class_sample)) - - valid_clusters.append(cluster_mask) - cluster_sparsities.append( - self._find_cluster_sparsity(X_cluster_class)) - - cluster_sparsities = np.array(cluster_sparsities) - cluster_weights = cluster_sparsities / cluster_sparsities.sum() - - if not valid_clusters: - raise RuntimeError( - "No clusters found with sufficient samples of " - "class {}.".format(class_sample)) - - for valid_cluster_idx, valid_cluster in enumerate(valid_clusters): - X_cluster = safe_indexing(X, valid_cluster) - y_cluster = safe_indexing(y, valid_cluster) - - X_cluster_class = safe_indexing( - X_cluster, np.flatnonzero(y_cluster == class_sample)) - - cluster_k_neighbours = min(self.nn_k_.n_neighbors, - len(X_cluster_class) + 1) - self.nn_k_.set_params(n_neighbors=cluster_k_neighbours) - - self.nn_k_.fit(X_cluster_class) - nns = self.nn_k_.kneighbors(X_cluster_class, - return_distance=False)[:, 1:] - - cluster_n_samples = int( - n_samples * cluster_weights[valid_cluster_idx]) - - X_new, y_new = self._make_samples(X_cluster_class, - class_sample, - X_class, - nns, - cluster_n_samples, - 1.0) - - stack = [np.vstack, sparse.vstack][int(sparse.issparse(X_new))] - X_resampled = stack((X_resampled, X_new)) - y_resampled = np.hstack((y_resampled, y_new)) - - return X_resampled, y_resampled diff --git a/imblearn/over_sampling/tests/test_smote.py b/imblearn/over_sampling/tests/test_smote.py index 7c253f556..c103fdacd 100644 --- a/imblearn/over_sampling/tests/test_smote.py +++ 
b/imblearn/over_sampling/tests/test_smote.py @@ -322,71 +322,3 @@ def test_svm_smote(): assert_allclose(X_res_1, X_res_2) assert_array_equal(y_res_1, y_res_2) - - -def test_kmeans_smote(): - kmeans_smote = KMeansSMOTE(kmeans_estimator=1, random_state=42) - smote = SMOTE(random_state=42) - - X_res_1, y_res_1 = kmeans_smote.fit_sample(X, Y) - X_res_2, y_res_2 = smote.fit_sample(X, Y) - - assert_allclose(X_res_1, X_res_2) - assert_array_equal(y_res_1, y_res_2) - - -def test_sample_kmeans(): - smote = KMeansSMOTE(random_state=RND_SEED, - kmeans_estimator=3, - k_neighbors=2) - X_resampled, y_resampled = smote.fit_sample(X, Y) - X_gt = np.array([ - [0.11622591, -0.0317206], [0.77481731, 0.60935141], - [1.25192108, -0.22367336], [0.53366841, -0.30312976], - [1.52091956, -0.49283504], [-0.28162401, -2.10400981], - [0.83680821, 1.72827342], [0.3084254, 0.33299982], - [0.70472253, -0.73309052], [0.28893132, -0.38761769], - [1.15514042, 0.0129463], [0.88407872, 0.35454207], - [1.31301027, -0.92648734], [-1.11515198, -0.93689695], - [-0.18410027, -0.45194484], [0.9281014, 0.53085498], - [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], - [0.08711622, 0.93259929], [1.70580611, -0.11219234], - [0.98135505, 0.22510668], [0.80404478, -0.2732194], - [0.91314969, -0.37604899], [0.82740979, -0.35957364] - ]) - - y_gt = np.array([ - 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0 - ]) - - assert_allclose(X_resampled, X_gt, rtol=R_TOL) - assert_array_equal(y_resampled, y_gt) - - -def test_sample_kmeans_density_estimation(): - smote = KMeansSMOTE(random_state=RND_SEED, - kmeans_estimator=3, - k_neighbors=2, - density_exponent=2) - X_resampled, y_resampled = smote.fit_sample(X, Y) - X_gt = np.array([ - [0.11622591, -0.0317206], [0.77481731, 0.60935141], - [1.25192108, -0.22367336], [0.53366841, -0.30312976], - [1.52091956, -0.49283504], [-0.28162401, -2.10400981], - [0.83680821, 1.72827342], [0.3084254, 0.33299982], - [0.70472253, -0.73309052], [0.28893132, 
-0.38761769], - [1.15514042, 0.0129463], [0.88407872, 0.35454207], - [1.31301027, -0.92648734], [-1.11515198, -0.93689695], - [-0.18410027, -0.45194484], [0.9281014, 0.53085498], - [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], - [0.08711622, 0.93259929], [1.70580611, -0.11219234], - [0.98135505, 0.22510668], [0.80404478, -0.2732194], - [0.91314969, -0.37604899], [0.82740979, -0.35957364] - ]) - - y_gt = np.array([ - 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0 - ]) - - assert_allclose(X_resampled, X_gt, rtol=R_TOL) - assert_array_equal(y_resampled, y_gt) From 950df34da634f8d6bc86ab9b365cbc9338232440 Mon Sep 17 00:00:00 2001 From: Stephan Heijl Date: Sun, 3 Mar 2019 12:57:00 +0100 Subject: [PATCH 14/35] Restored KMeansSMOTE --- .../plot_comparison_over_sampling.py | 4 +- imblearn/over_sampling/__init__.py | 2 +- imblearn/over_sampling/_smote.py | 165 ++++++++++++++++++ .../over_sampling/tests/test_kmeans_smote.py | 93 ++++++++++ 4 files changed, 262 insertions(+), 2 deletions(-) create mode 100644 imblearn/over_sampling/tests/test_kmeans_smote.py diff --git a/examples/over-sampling/plot_comparison_over_sampling.py b/examples/over-sampling/plot_comparison_over_sampling.py index 50e3f22d1..8159ccd26 100644 --- a/examples/over-sampling/plot_comparison_over_sampling.py +++ b/examples/over-sampling/plot_comparison_over_sampling.py @@ -21,7 +21,8 @@ from imblearn.pipeline import make_pipeline from imblearn.over_sampling import ADASYN -from imblearn.over_sampling import SMOTE, BorderlineSMOTE, SVMSMOTE, SMOTENC +from imblearn.over_sampling import (SMOTE, BorderlineSMOTE, SVMSMOTE, SMOTENC, + KMeansSMOTE) from imblearn.over_sampling import RandomOverSampler from imblearn.base import BaseSampler @@ -218,6 +219,7 @@ def _fit_resample(self, X, y): (SMOTE(random_state=0), BorderlineSMOTE(random_state=0, kind='borderline-1'), BorderlineSMOTE(random_state=0, kind='borderline-2'), + KMeansSMOTE(random_state=0, kmeans_estimator=25), 
SVMSMOTE(random_state=0))): clf = make_pipeline(sampler, LinearSVC()) clf.fit(X, y) diff --git a/imblearn/over_sampling/__init__.py b/imblearn/over_sampling/__init__.py index bb834b0b5..63abf3dc0 100644 --- a/imblearn/over_sampling/__init__.py +++ b/imblearn/over_sampling/__init__.py @@ -11,5 +11,5 @@ from ._smote import SVMSMOTE from ._smote import SMOTENC -__all__ = ['ADASYN', 'RandomOverSampler', +__all__ = ['ADASYN', 'RandomOverSampler', 'KMeansSMOTE', 'SMOTE', 'BorderlineSMOTE', 'SVMSMOTE', 'SMOTENC'] diff --git a/imblearn/over_sampling/_smote.py b/imblearn/over_sampling/_smote.py index 8bef6f8ca..e45ebe45c 100644 --- a/imblearn/over_sampling/_smote.py +++ b/imblearn/over_sampling/_smote.py @@ -17,6 +17,8 @@ from scipy import sparse from sklearn.base import clone +from sklearn.cluster import MiniBatchKMeans as KMeans +from sklearn.metrics import pairwise_distances from sklearn.preprocessing import OneHotEncoder from sklearn.svm import SVC from sklearn.utils import check_random_state @@ -1091,3 +1093,166 @@ def _generate_sample(self, X, nn_data, nn_num, row, col, step): sample[start_idx + col_sel] = 1 return sparse.csr_matrix(sample) if sparse.issparse(X) else sample + + +# @Substitution( +# sampling_strategy=BaseOverSampler._sampling_strategy_docstring, +# random_state=_random_state_docstring) +class KMeansSMOTE(BaseSMOTE): + """Apply a KMeans clustering before to over-sample using SMOTE. + + This is an implementation of the algorithm described in [1]_. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + {sampling_strategy} + + {random_state} + + k_neighbors : int or object, optional (default=5) + If ``int``, number of nearest neighbours to used to construct synthetic + samples. If object, an estimator that inherits from + :class:`sklearn.neighbors.base.KNeighborsMixin` that will be used to + find the k_neighbors. + + n_jobs : int, optional (default=1) + The number of threads to open if possible. 
+ + kmeans_estimator : int or object, optional (default=KMeans()) + A KMeans instance or the number of clusters. + + cluster_balance_threshold: float, optional (default=0.5) + The threshold at which a cluster is called balanced and where samples + of the class selected for SMOTE will be oversampled. + + density_exponent: str or float, optional (default="auto") + This exponent is used to determine the density of a cluster. Leaving it + to 'auto' will use a feature-length based exponent. + + """ + def __init__(self, + sampling_strategy='auto', + random_state=None, + k_neighbors=5, + n_jobs=1, + kmeans_estimator=None, + cluster_balance_threshold=0.5, + density_exponent="auto"): + super(KMeansSMOTE, self).__init__( + sampling_strategy=sampling_strategy, random_state=random_state, + k_neighbors=k_neighbors, n_jobs=n_jobs, ratio=None) + self.kmeans_estimator = kmeans_estimator + self.cluster_balance_threshold = cluster_balance_threshold + self.density_exponent = density_exponent + + def _validate_estimator(self): + super(KMeansSMOTE, self)._validate_estimator() + if self.kmeans_estimator is None: + self.kmeans_estimator_ = KMeans( + random_state=self.random_state) + elif isinstance(self.kmeans_estimator, int): + self.kmeans_estimator_ = KMeans( + n_clusters=self.kmeans_estimator, + random_state=self.random_state) + else: + self.kmeans_estimator_ = clone(self.kmeans_estimator) + + self.cluster_balance_threshold_ = ( + self.cluster_balance_threshold + if self.kmeans_estimator_.n_clusters != 1 else -np.inf) + + def _find_cluster_sparsity(self, X): + euclidean_distances = pairwise_distances(X, metric="euclidean", + n_jobs=self.n_jobs) + + # negate diagonal elements + for ind in range(X.shape[0]): + euclidean_distances[ind, ind] = 0 + + non_diag_elements = (len(X) ** 2) - len(X) + mean_distance = euclidean_distances.sum() / non_diag_elements + exponent = (math.log(len(X), 1.6) ** 1.8 * 0.16 + if self.density_exponent == 'auto' + else self.density_exponent) + return 
(mean_distance ** exponent) / X.shape[0] + + def _sample(self, X, y): + return self._fit_resample(X, y) + + def _fit_resample(self, X, y): + self._validate_estimator() + random_state = check_random_state(self.random_state) + X_resampled = X.copy() + y_resampled = y.copy() + + for class_sample, n_samples in self.sampling_strategy_.items(): + if n_samples == 0: + continue + target_class_indices = np.flatnonzero(y == class_sample) + X_class = safe_indexing(X, target_class_indices) + + X_clusters = self.kmeans_estimator_.fit_predict(X) + + valid_clusters = [] + cluster_sparsities = [] + + # Identify valid clusters + for cluster_idx in range(self.kmeans_estimator_.n_clusters): + cluster_mask = np.flatnonzero(X_clusters == cluster_idx) + + X_cluster = safe_indexing(X, cluster_mask) + y_cluster = safe_indexing(y, cluster_mask) + + cluster_class_mean = (y_cluster == class_sample).mean() + + if cluster_class_mean < self.cluster_balance_threshold_: + continue + + X_cluster_class = safe_indexing( + X_cluster, np.flatnonzero(y_cluster == class_sample)) + + valid_clusters.append(cluster_mask) + cluster_sparsities.append( + self._find_cluster_sparsity(X_cluster_class)) + + cluster_sparsities = np.array(cluster_sparsities) + cluster_weights = cluster_sparsities / cluster_sparsities.sum() + + if not valid_clusters: + raise RuntimeError( + "No clusters found with sufficient samples of " + "class {}.".format(class_sample)) + + for valid_cluster_idx, valid_cluster in enumerate(valid_clusters): + X_cluster = safe_indexing(X, valid_cluster) + y_cluster = safe_indexing(y, valid_cluster) + + X_cluster_class = safe_indexing( + X_cluster, np.flatnonzero(y_cluster == class_sample)) + + cluster_k_neighbours = min(self.nn_k_.n_neighbors, + len(X_cluster_class) + 1) + self.nn_k_.set_params(n_neighbors=cluster_k_neighbours) + + self.nn_k_.fit(X_cluster_class) + nns = self.nn_k_.kneighbors(X_cluster_class, + return_distance=False)[:, 1:] + + cluster_n_samples = int( + n_samples * 
cluster_weights[valid_cluster_idx]) + + X_new, y_new = self._make_samples(X_cluster_class, + y.dtype, + class_sample, + X_class, + nns, + cluster_n_samples, + 1.0) + + stack = [np.vstack, sparse.vstack][int(sparse.issparse(X_new))] + X_resampled = stack((X_resampled, X_new)) + y_resampled = np.hstack((y_resampled, y_new)) + + return X_resampled, y_resampled diff --git a/imblearn/over_sampling/tests/test_kmeans_smote.py b/imblearn/over_sampling/tests/test_kmeans_smote.py new file mode 100644 index 000000000..0d2cb9f63 --- /dev/null +++ b/imblearn/over_sampling/tests/test_kmeans_smote.py @@ -0,0 +1,93 @@ +import pytest +import numpy as np + +from sklearn.utils.testing import assert_allclose +from sklearn.utils.testing import assert_array_equal + +from imblearn.over_sampling import (KMeansSMOTE, SMOTE) + + +@pytest.fixture +def data(): + X = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], + [1.25192108, -0.22367336], [0.53366841, -0.30312976], + [1.52091956, -0.49283504], [-0.28162401, -2.10400981], + [0.83680821, 1.72827342], [0.3084254, 0.33299982], + [0.70472253, -0.73309052], [0.28893132, -0.38761769], + [1.15514042, 0.0129463], [0.88407872, 0.35454207], + [1.31301027, -0.92648734], [-1.11515198, -0.93689695], + [-0.18410027, -0.45194484], [0.9281014, 0.53085498], + [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], + [0.08711622, 0.93259929], [1.70580611, -0.11219234]]) + y = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0]) + return X, y + +def test_kmeans_smote(data): + X, y = data + kmeans_smote = KMeansSMOTE(kmeans_estimator=1, random_state=42) + smote = SMOTE(random_state=42) + + X_res_1, y_res_1 = kmeans_smote.fit_sample(X, y) + X_res_2, y_res_2 = smote.fit_sample(X, y) + + assert_allclose(X_res_1, X_res_2) + assert_array_equal(y_res_1, y_res_2) + + +def test_sample_kmeans(data): + X, y = data + smote = KMeansSMOTE(random_state=42, + kmeans_estimator=3, + k_neighbors=2) + X_resampled, y_resampled = smote.fit_sample(X, 
y) + X_gt = np.array([ + [0.11622591, -0.0317206], [0.77481731, 0.60935141], + [1.25192108, -0.22367336], [0.53366841, -0.30312976], + [1.52091956, -0.49283504], [-0.28162401, -2.10400981], + [0.83680821, 1.72827342], [0.3084254, 0.33299982], + [0.70472253, -0.73309052], [0.28893132, -0.38761769], + [1.15514042, 0.0129463], [0.88407872, 0.35454207], + [1.31301027, -0.92648734], [-1.11515198, -0.93689695], + [-0.18410027, -0.45194484], [0.9281014, 0.53085498], + [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], + [0.08711622, 0.93259929], [1.70580611, -0.11219234], + [1.19141841, -0.82923193], [0.687674179, -0.3327227441], + [1.24349671, -0.87451605], [0.3042074282, -0.093428711] + ]) + + y_gt = np.array([ + 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0 + ]) + + assert_allclose(X_resampled, X_gt) + assert_array_equal(y_resampled, y_gt) + + +def test_sample_kmeans_density_estimation(data): + X, y = data + smote = KMeansSMOTE(random_state=42, + kmeans_estimator=3, + k_neighbors=2, + density_exponent=2) + X_resampled, y_resampled = smote.fit_sample(X, y) + X_gt = np.array([ + [0.11622591, -0.0317206], [0.77481731, 0.60935141], + [1.25192108, -0.22367336], [0.53366841, -0.30312976], + [1.52091956, -0.49283504], [-0.28162401, -2.10400981], + [0.83680821, 1.72827342], [0.3084254, 0.33299982], + [0.70472253, -0.73309052], [0.28893132, -0.38761769], + [1.15514042, 0.0129463], [0.88407872, 0.35454207], + [1.31301027, -0.92648734], [-1.11515198, -0.93689695], + [-0.18410027, -0.45194484], [0.9281014, 0.53085498], + [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], + [0.08711622, 0.93259929], [1.70580611, -0.11219234], + [1.19141841, -0.82923193], [0.687674179, -0.3327227441], + [1.24349671, -0.87451605], [0.3042074282, -0.093428711] + ]) + + y_gt = np.array([ + 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0 + ]) + + assert_allclose(X_resampled, X_gt) + assert_array_equal(y_resampled, y_gt) From 
2851b7e62a523c3ddc3124a5d0a846b76212da7b Mon Sep 17 00:00:00 2001 From: Stephan Heijl Date: Mon, 4 Mar 2019 21:08:13 +0100 Subject: [PATCH 15/35] Resolved KMeansSmote errors --- .../plot_comparison_over_sampling.py | 2 +- imblearn/over_sampling/_smote.py | 64 +++++++++++++------ .../over_sampling/tests/test_kmeans_smote.py | 7 +- imblearn/utils/estimator_checks.py | 1 + 4 files changed, 53 insertions(+), 21 deletions(-) diff --git a/examples/over-sampling/plot_comparison_over_sampling.py b/examples/over-sampling/plot_comparison_over_sampling.py index 8159ccd26..2d0632641 100644 --- a/examples/over-sampling/plot_comparison_over_sampling.py +++ b/examples/over-sampling/plot_comparison_over_sampling.py @@ -219,7 +219,7 @@ def _fit_resample(self, X, y): (SMOTE(random_state=0), BorderlineSMOTE(random_state=0, kind='borderline-1'), BorderlineSMOTE(random_state=0, kind='borderline-2'), - KMeansSMOTE(random_state=0, kmeans_estimator=25), + KMeansSMOTE(random_state=0), SVMSMOTE(random_state=0))): clf = make_pipeline(sampler, LinearSVC()) clf.fit(X, y) diff --git a/imblearn/over_sampling/_smote.py b/imblearn/over_sampling/_smote.py index e45ebe45c..6fe8b2b88 100644 --- a/imblearn/over_sampling/_smote.py +++ b/imblearn/over_sampling/_smote.py @@ -1095,9 +1095,9 @@ def _generate_sample(self, X, nn_data, nn_num, row, col, step): return sparse.csr_matrix(sample) if sparse.issparse(X) else sample -# @Substitution( -# sampling_strategy=BaseOverSampler._sampling_strategy_docstring, -# random_state=_random_state_docstring) +@Substitution( + sampling_strategy=BaseOverSampler._sampling_strategy_docstring, + random_state=_random_state_docstring) class KMeansSMOTE(BaseSMOTE): """Apply a KMeans clustering before to over-sample using SMOTE. @@ -1111,7 +1111,7 @@ class KMeansSMOTE(BaseSMOTE): {random_state} - k_neighbors : int or object, optional (default=5) + k_neighbors : int or object, optional (default=2) If ``int``, number of nearest neighbours to used to construct synthetic samples. 
If object, an estimator that inherits from :class:`sklearn.neighbors.base.KNeighborsMixin` that will be used to @@ -1121,28 +1121,38 @@ class KMeansSMOTE(BaseSMOTE): The number of threads to open if possible. kmeans_estimator : int or object, optional (default=KMeans()) - A KMeans instance or the number of clusters. + A KMeans instance or the number of clusters to be used. - cluster_balance_threshold: float, optional (default=0.5) + cluster_balance_threshold: str or float, optional (default="auto") The threshold at which a cluster is called balanced and where samples - of the class selected for SMOTE will be oversampled. + of the class selected for SMOTE will be oversampled. If "auto", this + will be determined by the ratio for each class, or it can be set + manually. density_exponent: str or float, optional (default="auto") This exponent is used to determine the density of a cluster. Leaving it to 'auto' will use a feature-length based exponent. + ratio : str, dict, or callable + .. deprecated:: 0.4 + Use the parameter ``sampling_strategy`` instead. It will be removed + in 0.6. 
+ """ def __init__(self, sampling_strategy='auto', random_state=None, - k_neighbors=5, + k_neighbors=2, n_jobs=1, kmeans_estimator=None, - cluster_balance_threshold=0.5, + ratio=None, + cluster_balance_threshold="auto", density_exponent="auto"): + super(KMeansSMOTE, self).__init__( sampling_strategy=sampling_strategy, random_state=random_state, - k_neighbors=k_neighbors, n_jobs=n_jobs, ratio=None) + k_neighbors=k_neighbors, n_jobs=n_jobs) + self.kmeans_estimator = kmeans_estimator self.cluster_balance_threshold = cluster_balance_threshold self.density_exponent = density_exponent @@ -1171,9 +1181,9 @@ def _find_cluster_sparsity(self, X): for ind in range(X.shape[0]): euclidean_distances[ind, ind] = 0 - non_diag_elements = (len(X) ** 2) - len(X) + non_diag_elements = (X.shape[0] ** 2) - X.shape[0] mean_distance = euclidean_distances.sum() / non_diag_elements - exponent = (math.log(len(X), 1.6) ** 1.8 * 0.16 + exponent = (math.log(X.shape[0], 1.6) ** 1.8 * 0.16 if self.density_exponent == 'auto' else self.density_exponent) return (mean_distance ** exponent) / X.shape[0] @@ -1183,13 +1193,14 @@ def _sample(self, X, y): def _fit_resample(self, X, y): self._validate_estimator() - random_state = check_random_state(self.random_state) + check_random_state(self.random_state) X_resampled = X.copy() y_resampled = y.copy() for class_sample, n_samples in self.sampling_strategy_.items(): if n_samples == 0: continue + target_class_indices = np.flatnonzero(y == class_sample) X_class = safe_indexing(X, target_class_indices) @@ -1207,7 +1218,17 @@ def _fit_resample(self, X, y): cluster_class_mean = (y_cluster == class_sample).mean() - if cluster_class_mean < self.cluster_balance_threshold_: + if self.cluster_balance_threshold == "auto": + balance_threshold = n_samples / sum(self.sampling_strategy_.values()) / 2 + else: + balance_threshold = self.cluster_balance_threshold + + print(cluster_class_mean, balance_threshold, cluster_class_mean < balance_threshold, X_cluster.shape) + + if 
cluster_class_mean < balance_threshold: + continue + + if cluster_class_mean * X_cluster.shape[0] < self.nn_k_.n_neighbors: continue X_cluster_class = safe_indexing( @@ -1232,17 +1253,17 @@ def _fit_resample(self, X, y): X_cluster_class = safe_indexing( X_cluster, np.flatnonzero(y_cluster == class_sample)) - cluster_k_neighbours = min(self.nn_k_.n_neighbors, - len(X_cluster_class) + 1) - self.nn_k_.set_params(n_neighbors=cluster_k_neighbours) - self.nn_k_.fit(X_cluster_class) + nns = self.nn_k_.kneighbors(X_cluster_class, return_distance=False)[:, 1:] - cluster_n_samples = int( + cluster_n_samples = math.ceil( n_samples * cluster_weights[valid_cluster_idx]) + if cluster_n_samples < 1: + continue # Weight of this cluster is not high enough + X_new, y_new = self._make_samples(X_cluster_class, y.dtype, class_sample, @@ -1255,4 +1276,9 @@ def _fit_resample(self, X, y): X_resampled = stack((X_resampled, X_new)) y_resampled = np.hstack((y_resampled, y_new)) + + print(self.sampling_strategy_) + print(Counter(y_resampled)) + print(X_resampled.shape, y_resampled.shape) + return X_resampled, y_resampled diff --git a/imblearn/over_sampling/tests/test_kmeans_smote.py b/imblearn/over_sampling/tests/test_kmeans_smote.py index 0d2cb9f63..a0f076e1a 100644 --- a/imblearn/over_sampling/tests/test_kmeans_smote.py +++ b/imblearn/over_sampling/tests/test_kmeans_smote.py @@ -4,6 +4,8 @@ from sklearn.utils.testing import assert_allclose from sklearn.utils.testing import assert_array_equal +from sklearn.datasets import make_classification + from imblearn.over_sampling import (KMeansSMOTE, SMOTE) @@ -24,7 +26,10 @@ def data(): def test_kmeans_smote(data): X, y = data - kmeans_smote = KMeansSMOTE(kmeans_estimator=1, random_state=42) + kmeans_smote = KMeansSMOTE(kmeans_estimator=1, + random_state=42, + cluster_balance_threshold=0.0, + k_neighbors=5) smote = SMOTE(random_state=42) X_res_1, y_res_1 = kmeans_smote.fit_sample(X, y) diff --git a/imblearn/utils/estimator_checks.py 
b/imblearn/utils/estimator_checks.py index 7d08f3313..13b14affd 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -158,6 +158,7 @@ def check_samplers_one_label(name, Sampler): def check_samplers_fit(name, Sampler): sampler = Sampler() + np.random.seed(42) # Make this test reproducible X = np.random.random((30, 2)) y = np.array([1] * 20 + [0] * 10) sampler.fit_resample(X, y) From 25cd90bbd246aab81e3d62ff087efb692b46e716 Mon Sep 17 00:00:00 2001 From: Stephan Heijl Date: Tue, 5 Mar 2019 08:21:10 +0100 Subject: [PATCH 16/35] Resolved python2.7 errors --- imblearn/over_sampling/_smote.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/imblearn/over_sampling/_smote.py b/imblearn/over_sampling/_smote.py index 6fe8b2b88..d97adf103 100644 --- a/imblearn/over_sampling/_smote.py +++ b/imblearn/over_sampling/_smote.py @@ -1223,8 +1223,6 @@ def _fit_resample(self, X, y): else: balance_threshold = self.cluster_balance_threshold - print(cluster_class_mean, balance_threshold, cluster_class_mean < balance_threshold, X_cluster.shape) - if cluster_class_mean < balance_threshold: continue @@ -1258,8 +1256,8 @@ def _fit_resample(self, X, y): nns = self.nn_k_.kneighbors(X_cluster_class, return_distance=False)[:, 1:] - cluster_n_samples = math.ceil( - n_samples * cluster_weights[valid_cluster_idx]) + cluster_n_samples = int(math.ceil( + n_samples * cluster_weights[valid_cluster_idx])) if cluster_n_samples < 1: continue # Weight of this cluster is not high enough @@ -1277,8 +1275,4 @@ def _fit_resample(self, X, y): y_resampled = np.hstack((y_resampled, y_new)) - print(self.sampling_strategy_) - print(Counter(y_resampled)) - print(X_resampled.shape, y_resampled.shape) - return X_resampled, y_resampled From 750decc94fc0bce7cda9d03f6dfc2113acd56c7d Mon Sep 17 00:00:00 2001 From: Stephan Heijl Date: Tue, 5 Mar 2019 08:36:27 +0100 Subject: [PATCH 17/35] improved code coverage --- imblearn/over_sampling/_smote.py | 3 -- 
.../over_sampling/tests/test_kmeans_smote.py | 40 +++++++++++++++++++ 2 files changed, 40 insertions(+), 3 deletions(-) diff --git a/imblearn/over_sampling/_smote.py b/imblearn/over_sampling/_smote.py index d97adf103..05f849ab2 100644 --- a/imblearn/over_sampling/_smote.py +++ b/imblearn/over_sampling/_smote.py @@ -1259,9 +1259,6 @@ def _fit_resample(self, X, y): cluster_n_samples = int(math.ceil( n_samples * cluster_weights[valid_cluster_idx])) - if cluster_n_samples < 1: - continue # Weight of this cluster is not high enough - X_new, y_new = self._make_samples(X_cluster_class, y.dtype, class_sample, diff --git a/imblearn/over_sampling/tests/test_kmeans_smote.py b/imblearn/over_sampling/tests/test_kmeans_smote.py index a0f076e1a..601a01474 100644 --- a/imblearn/over_sampling/tests/test_kmeans_smote.py +++ b/imblearn/over_sampling/tests/test_kmeans_smote.py @@ -3,6 +3,7 @@ from sklearn.utils.testing import assert_allclose from sklearn.utils.testing import assert_array_equal +from sklearn.cluster import MiniBatchKMeans from sklearn.datasets import make_classification @@ -38,6 +39,45 @@ def test_kmeans_smote(data): assert_allclose(X_res_1, X_res_2) assert_array_equal(y_res_1, y_res_2) +def test_sample_kmeans_custom(data): + X, y = data + smote = KMeansSMOTE(random_state=42, + kmeans_estimator=MiniBatchKMeans(n_clusters=3), + k_neighbors=2) + X_resampled, y_resampled = smote.fit_sample(X, y) + X_gt = np.array([ + [0.11622591, -0.0317206], [0.77481731, 0.60935141], + [1.25192108, -0.22367336], [0.53366841, -0.30312976], + [1.52091956, -0.49283504], [-0.28162401, -2.10400981], + [0.83680821, 1.72827342], [0.3084254, 0.33299982], + [0.70472253, -0.73309052], [0.28893132, -0.38761769], + [1.15514042, 0.0129463], [0.88407872, 0.35454207], + [1.31301027, -0.92648734], [-1.11515198, -0.93689695], + [-0.18410027, -0.45194484], [0.9281014, 0.53085498], + [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], + [0.08711622, 0.93259929], [1.70580611, -0.11219234], + [1.19141841, 
-0.82923193], [0.687674179, -0.3327227441], + [1.24349671, -0.87451605], [0.3042074282, -0.093428711] + ]) + + y_gt = np.array([ + 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0 + ]) + + assert_allclose(X_resampled, X_gt) + assert_array_equal(y_resampled, y_gt) + +def test_sample_kmeans_not_enough_clusters(data): + np.random.seed(42) # Make this test reproducible + X = np.random.random((30, 2)) + y = np.array([1] * 20 + [0] * 10) + + smote = KMeansSMOTE(random_state=42, + kmeans_estimator=30, + k_neighbors=2) + with pytest.raises(RuntimeError): + smote.fit_sample(X, y) + def test_sample_kmeans(data): X, y = data From b2b766d798c77f9c65af0932958135c466986814 Mon Sep 17 00:00:00 2001 From: Stephan Heijl Date: Tue, 5 Mar 2019 08:51:39 +0100 Subject: [PATCH 18/35] Resolved test error resulting from coverage improvement --- imblearn/over_sampling/_smote.py | 3 +++ imblearn/over_sampling/tests/test_kmeans_smote.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/imblearn/over_sampling/_smote.py b/imblearn/over_sampling/_smote.py index 05f849ab2..d97adf103 100644 --- a/imblearn/over_sampling/_smote.py +++ b/imblearn/over_sampling/_smote.py @@ -1259,6 +1259,9 @@ def _fit_resample(self, X, y): cluster_n_samples = int(math.ceil( n_samples * cluster_weights[valid_cluster_idx])) + if cluster_n_samples < 1: + continue # Weight of this cluster is not high enough + X_new, y_new = self._make_samples(X_cluster_class, y.dtype, class_sample, diff --git a/imblearn/over_sampling/tests/test_kmeans_smote.py b/imblearn/over_sampling/tests/test_kmeans_smote.py index 601a01474..6000220ff 100644 --- a/imblearn/over_sampling/tests/test_kmeans_smote.py +++ b/imblearn/over_sampling/tests/test_kmeans_smote.py @@ -68,7 +68,7 @@ def test_sample_kmeans_custom(data): assert_array_equal(y_resampled, y_gt) def test_sample_kmeans_not_enough_clusters(data): - np.random.seed(42) # Make this test reproducible + np.random.seed(42) X = np.random.random((30, 2)) y = 
np.array([1] * 20 + [0] * 10) From 0358d0f2ae3393a5c0c906c6096c25a11ff9b5a9 Mon Sep 17 00:00:00 2001 From: Stephan Heijl Date: Tue, 5 Mar 2019 09:03:37 +0100 Subject: [PATCH 19/35] Made custom kmeans test deterministic --- imblearn/over_sampling/_smote.py | 7 ++++--- imblearn/over_sampling/tests/test_kmeans_smote.py | 4 ++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/imblearn/over_sampling/_smote.py b/imblearn/over_sampling/_smote.py index d97adf103..5948af1e4 100644 --- a/imblearn/over_sampling/_smote.py +++ b/imblearn/over_sampling/_smote.py @@ -1188,10 +1188,11 @@ def _find_cluster_sparsity(self, X): else self.density_exponent) return (mean_distance ** exponent) / X.shape[0] - def _sample(self, X, y): - return self._fit_resample(X, y) - + # FIXME: rename _sample -> _fit_resample in 0.6 def _fit_resample(self, X, y): + return self._sample(X, y) + + def _sample(self, X, y): self._validate_estimator() check_random_state(self.random_state) X_resampled = X.copy() diff --git a/imblearn/over_sampling/tests/test_kmeans_smote.py b/imblearn/over_sampling/tests/test_kmeans_smote.py index 6000220ff..42ffb1afb 100644 --- a/imblearn/over_sampling/tests/test_kmeans_smote.py +++ b/imblearn/over_sampling/tests/test_kmeans_smote.py @@ -42,7 +42,7 @@ def test_kmeans_smote(data): def test_sample_kmeans_custom(data): X, y = data smote = KMeansSMOTE(random_state=42, - kmeans_estimator=MiniBatchKMeans(n_clusters=3), + kmeans_estimator=MiniBatchKMeans(n_clusters=3, random_state=42), k_neighbors=2) X_resampled, y_resampled = smote.fit_sample(X, y) X_gt = np.array([ @@ -68,7 +68,7 @@ def test_sample_kmeans_custom(data): assert_array_equal(y_resampled, y_gt) def test_sample_kmeans_not_enough_clusters(data): - np.random.seed(42) + np.random.seed(42) X = np.random.random((30, 2)) y = np.array([1] * 20 + [0] * 10) From 01f31d04322b5cb2cd9749703f3b29e734929079 Mon Sep 17 00:00:00 2001 From: Stephan Heijl Date: Tue, 5 Mar 2019 09:12:13 +0100 Subject: [PATCH 20/35] Removed 
superfluous check --- imblearn/over_sampling/_smote.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/imblearn/over_sampling/_smote.py b/imblearn/over_sampling/_smote.py index 5948af1e4..da9341380 100644 --- a/imblearn/over_sampling/_smote.py +++ b/imblearn/over_sampling/_smote.py @@ -1260,8 +1260,6 @@ def _sample(self, X, y): cluster_n_samples = int(math.ceil( n_samples * cluster_weights[valid_cluster_idx])) - if cluster_n_samples < 1: - continue # Weight of this cluster is not high enough X_new, y_new = self._make_samples(X_cluster_class, y.dtype, From 7aa6f864e0c7b94dc095cbde6d95b5fef1c6a03d Mon Sep 17 00:00:00 2001 From: Stephan Heijl Date: Thu, 7 Mar 2019 12:19:18 +0100 Subject: [PATCH 21/35] Change test to use custom KMeans instance (MiniBatchKmeans was default) --- imblearn/over_sampling/tests/test_kmeans_smote.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/imblearn/over_sampling/tests/test_kmeans_smote.py b/imblearn/over_sampling/tests/test_kmeans_smote.py index 42ffb1afb..6ecc8f6fb 100644 --- a/imblearn/over_sampling/tests/test_kmeans_smote.py +++ b/imblearn/over_sampling/tests/test_kmeans_smote.py @@ -3,7 +3,7 @@ from sklearn.utils.testing import assert_allclose from sklearn.utils.testing import assert_array_equal -from sklearn.cluster import MiniBatchKMeans +from sklearn.cluster import KMeans from sklearn.datasets import make_classification @@ -42,7 +42,7 @@ def test_kmeans_smote(data): def test_sample_kmeans_custom(data): X, y = data smote = KMeansSMOTE(random_state=42, - kmeans_estimator=MiniBatchKMeans(n_clusters=3, random_state=42), + kmeans_estimator=KMeans(n_clusters=3, random_state=42), k_neighbors=2) X_resampled, y_resampled = smote.fit_sample(X, y) X_gt = np.array([ From 1f34912ae7e0206e146534ea96d1677d00995011 Mon Sep 17 00:00:00 2001 From: Stephan Heijl Date: Sun, 10 Mar 2019 13:45:59 +0100 Subject: [PATCH 22/35] Resolved PEP8 issues --- examples/over-sampling/plot_comparison_over_sampling.py | 2 +- 
imblearn/over_sampling/_smote.py | 8 ++++---- imblearn/over_sampling/tests/test_kmeans_smote.py | 5 +++-- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/examples/over-sampling/plot_comparison_over_sampling.py b/examples/over-sampling/plot_comparison_over_sampling.py index 2d0632641..ced532ec7 100644 --- a/examples/over-sampling/plot_comparison_over_sampling.py +++ b/examples/over-sampling/plot_comparison_over_sampling.py @@ -22,7 +22,7 @@ from imblearn.pipeline import make_pipeline from imblearn.over_sampling import ADASYN from imblearn.over_sampling import (SMOTE, BorderlineSMOTE, SVMSMOTE, SMOTENC, - KMeansSMOTE) + KMeansSMOTE) from imblearn.over_sampling import RandomOverSampler from imblearn.base import BaseSampler diff --git a/imblearn/over_sampling/_smote.py b/imblearn/over_sampling/_smote.py index da9341380..a364405a1 100644 --- a/imblearn/over_sampling/_smote.py +++ b/imblearn/over_sampling/_smote.py @@ -1197,6 +1197,7 @@ def _sample(self, X, y): check_random_state(self.random_state) X_resampled = X.copy() y_resampled = y.copy() + total_inp_samples = sum(self.sampling_strategy_.values()) for class_sample, n_samples in self.sampling_strategy_.items(): if n_samples == 0: @@ -1220,14 +1221,15 @@ def _sample(self, X, y): cluster_class_mean = (y_cluster == class_sample).mean() if self.cluster_balance_threshold == "auto": - balance_threshold = n_samples / sum(self.sampling_strategy_.values()) / 2 + balance_threshold = n_samples / total_inp_samples / 2 else: balance_threshold = self.cluster_balance_threshold if cluster_class_mean < balance_threshold: continue - if cluster_class_mean * X_cluster.shape[0] < self.nn_k_.n_neighbors: + anticipated_samples = cluster_class_mean * X_cluster.shape[0] + if total_inp_samples < self.nn_k_.n_neighbors: continue X_cluster_class = safe_indexing( @@ -1260,7 +1262,6 @@ def _sample(self, X, y): cluster_n_samples = int(math.ceil( n_samples * cluster_weights[valid_cluster_idx])) - X_new, y_new = 
self._make_samples(X_cluster_class, y.dtype, class_sample, @@ -1273,5 +1274,4 @@ def _sample(self, X, y): X_resampled = stack((X_resampled, X_new)) y_resampled = np.hstack((y_resampled, y_new)) - return X_resampled, y_resampled diff --git a/imblearn/over_sampling/tests/test_kmeans_smote.py b/imblearn/over_sampling/tests/test_kmeans_smote.py index 42ffb1afb..9b921cddc 100644 --- a/imblearn/over_sampling/tests/test_kmeans_smote.py +++ b/imblearn/over_sampling/tests/test_kmeans_smote.py @@ -5,8 +5,6 @@ from sklearn.utils.testing import assert_array_equal from sklearn.cluster import MiniBatchKMeans -from sklearn.datasets import make_classification - from imblearn.over_sampling import (KMeansSMOTE, SMOTE) @@ -25,6 +23,7 @@ def data(): y = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0]) return X, y + def test_kmeans_smote(data): X, y = data kmeans_smote = KMeansSMOTE(kmeans_estimator=1, @@ -39,6 +38,7 @@ def test_kmeans_smote(data): assert_allclose(X_res_1, X_res_2) assert_array_equal(y_res_1, y_res_2) + def test_sample_kmeans_custom(data): X, y = data smote = KMeansSMOTE(random_state=42, @@ -67,6 +67,7 @@ def test_sample_kmeans_custom(data): assert_allclose(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) + def test_sample_kmeans_not_enough_clusters(data): np.random.seed(42) X = np.random.random((30, 2)) From 6129fbf916f59940a8d68ccf5072c14f38fad49c Mon Sep 17 00:00:00 2001 From: Stephan Heijl Date: Sun, 10 Mar 2019 14:01:33 +0100 Subject: [PATCH 23/35] Fixed using the wrong variable name --- imblearn/over_sampling/_smote.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/imblearn/over_sampling/_smote.py b/imblearn/over_sampling/_smote.py index a364405a1..3a1bcae58 100644 --- a/imblearn/over_sampling/_smote.py +++ b/imblearn/over_sampling/_smote.py @@ -1229,7 +1229,7 @@ def _sample(self, X, y): continue anticipated_samples = cluster_class_mean * X_cluster.shape[0] - if total_inp_samples < self.nn_k_.n_neighbors: + if 
anticipated_samples < self.nn_k_.n_neighbors: continue X_cluster_class = safe_indexing( From 9537ec9e488a420f23c0070560adf903654700d9 Mon Sep 17 00:00:00 2001 From: Stephan Heijl Date: Mon, 1 Apr 2019 14:18:43 +0200 Subject: [PATCH 24/35] Fixed error in _make_samples call, resolves mediocre sample selection. --- imblearn/over_sampling/_smote.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/imblearn/over_sampling/_smote.py b/imblearn/over_sampling/_smote.py index 3a1bcae58..b32d1952d 100644 --- a/imblearn/over_sampling/_smote.py +++ b/imblearn/over_sampling/_smote.py @@ -1176,7 +1176,6 @@ def _validate_estimator(self): def _find_cluster_sparsity(self, X): euclidean_distances = pairwise_distances(X, metric="euclidean", n_jobs=self.n_jobs) - # negate diagonal elements for ind in range(X.shape[0]): euclidean_distances[ind, ind] = 0 @@ -1265,7 +1264,7 @@ def _sample(self, X, y): X_new, y_new = self._make_samples(X_cluster_class, y.dtype, class_sample, - X_class, + X_cluster_class, nns, cluster_n_samples, 1.0) From b6fbca4ea8db21daa4a882a4d4cc99045da6cfa5 Mon Sep 17 00:00:00 2001 From: Stephan Heijl Date: Mon, 1 Apr 2019 14:34:46 +0200 Subject: [PATCH 25/35] Updated KMeansSMOTE tests --- imblearn/over_sampling/tests/test_kmeans_smote.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/imblearn/over_sampling/tests/test_kmeans_smote.py b/imblearn/over_sampling/tests/test_kmeans_smote.py index 1a1050979..8e43f63f2 100644 --- a/imblearn/over_sampling/tests/test_kmeans_smote.py +++ b/imblearn/over_sampling/tests/test_kmeans_smote.py @@ -45,6 +45,7 @@ def test_sample_kmeans_custom(data): kmeans_estimator=KMeans(n_clusters=3, random_state=42), k_neighbors=2) X_resampled, y_resampled = smote.fit_sample(X, y) + print(X_resampled.tolist()) X_gt = np.array([ [0.11622591, -0.0317206], [0.77481731, 0.60935141], [1.25192108, -0.22367336], [0.53366841, -0.30312976], @@ -56,8 +57,8 @@ def test_sample_kmeans_custom(data): 
[-0.18410027, -0.45194484], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], [0.08711622, 0.93259929], [1.70580611, -0.11219234], - [1.19141841, -0.82923193], [0.687674179, -0.3327227441], - [1.24349671, -0.87451605], [0.3042074282, -0.093428711] + [1.34544799, -0.85882949], [0.65524146, -0.40037012], + [1.30946198, -0.88566536], [1.28791949, -0.25969361] ]) y_gt = np.array([ @@ -86,6 +87,7 @@ def test_sample_kmeans(data): kmeans_estimator=3, k_neighbors=2) X_resampled, y_resampled = smote.fit_sample(X, y) + print(X_resampled.tolist()) X_gt = np.array([ [0.11622591, -0.0317206], [0.77481731, 0.60935141], [1.25192108, -0.22367336], [0.53366841, -0.30312976], @@ -97,8 +99,8 @@ def test_sample_kmeans(data): [-0.18410027, -0.45194484], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], [0.08711622, 0.93259929], [1.70580611, -0.11219234], - [1.19141841, -0.82923193], [0.687674179, -0.3327227441], - [1.24349671, -0.87451605], [0.3042074282, -0.093428711] + [1.34544799, -0.85882949], [0.65524146, -0.40037012], + [1.30946198, -0.88566536], [1.28791949, -0.25969361] ]) y_gt = np.array([ @@ -116,6 +118,7 @@ def test_sample_kmeans_density_estimation(data): k_neighbors=2, density_exponent=2) X_resampled, y_resampled = smote.fit_sample(X, y) + X_gt = np.array([ [0.11622591, -0.0317206], [0.77481731, 0.60935141], [1.25192108, -0.22367336], [0.53366841, -0.30312976], @@ -127,8 +130,8 @@ def test_sample_kmeans_density_estimation(data): [-0.18410027, -0.45194484], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], [0.08711622, 0.93259929], [1.70580611, -0.11219234], - [1.19141841, -0.82923193], [0.687674179, -0.3327227441], - [1.24349671, -0.87451605], [0.3042074282, -0.093428711] + [1.34544799, -0.85882949], [0.65524146, -0.40037012], + [1.30946198, -0.88566536], [1.28791949, -0.25969361] ]) y_gt = np.array([ From ca9b541123b195194936d0ae8476aa2bad586986 Mon Sep 17 00:00:00 2001 From: Stephan 
Heijl Date: Mon, 1 Apr 2019 16:53:05 +0200 Subject: [PATCH 26/35] Clarified RuntimeError with solution to problem --- imblearn/over_sampling/_smote.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/imblearn/over_sampling/_smote.py b/imblearn/over_sampling/_smote.py index b32d1952d..c2547c685 100644 --- a/imblearn/over_sampling/_smote.py +++ b/imblearn/over_sampling/_smote.py @@ -1244,7 +1244,9 @@ def _sample(self, X, y): if not valid_clusters: raise RuntimeError( "No clusters found with sufficient samples of " - "class {}.".format(class_sample)) + "class {}. Try lowering the cluster_balance_threshold or " + "or increasing the number of " + "clusters.".format(class_sample)) for valid_cluster_idx, valid_cluster in enumerate(valid_clusters): X_cluster = safe_indexing(X, valid_cluster) From 1b4dfd2ca70ed14a472172c14bd239e68349506b Mon Sep 17 00:00:00 2001 From: Stephan Heijl Date: Mon, 6 May 2019 10:21:54 +0200 Subject: [PATCH 27/35] Adjusted documentation according to @chkoar's review. --- imblearn/over_sampling/_smote.py | 35 +++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/imblearn/over_sampling/_smote.py b/imblearn/over_sampling/_smote.py index c2547c685..55bac1fc3 100644 --- a/imblearn/over_sampling/_smote.py +++ b/imblearn/over_sampling/_smote.py @@ -1130,13 +1130,35 @@ class KMeansSMOTE(BaseSMOTE): manually. density_exponent: str or float, optional (default="auto") - This exponent is used to determine the density of a cluster. Leaving it - to 'auto' will use a feature-length based exponent. + This exponent is used to determine the density of a cluster. Leaving + this to 'auto' will use a feature-length based exponent. - ratio : str, dict, or callable - .. deprecated:: 0.4 - Use the parameter ``sampling_strategy`` instead. It will be removed - in 0.6. + References + ---------- + .. 
[3] Felix Last, Georgios Douzas, Fernando Bacao, "Oversampling for + Imbalanced Learning Based on K-Means and SMOTE" + https://arxiv.org/abs/1711.00837 + + Examples + -------- + + >>> from imblearn.over_sampling import KMeansSMOTE + >>> from sklearn.datasets import make_blobs + >>> blob_sizes = [100, 800, 100] + >>> X, y = make_blobs(blob_sizes, + ... centers=[(-10, 0), (0,0), (10, 0)]) + >>> # Make this a binary classification problem + >>> y = y == 1 + >>> sm = KMeansSMOTE(random_state=42) + >>> X_res, y_res = sm.fit_resample(X, y) + >>> # Find the number of new samples in the middle blob + >>> n_res_in_middle = ((X_res[:, 0] > -5) & (X_res[:, 0] < 5)).sum() + >>> print("Samples in the middle blob: %s" % n_res_in_middle) + Samples in the middle blob: 800 + >>> print("Same as middle blob: %s" % (n_res_in_middle == blob_sizes[1])) + Same samples in middle blob: True + >>> print("More 0 samples: %s " % ((y_res == 0).sum() > (y == 0).sum())) + More 0 samples: True """ def __init__(self, @@ -1145,7 +1167,6 @@ def __init__(self, k_neighbors=2, n_jobs=1, kmeans_estimator=None, - ratio=None, cluster_balance_threshold="auto", density_exponent="auto"): From 367f3a089fe1240f3d639baba17569c0ed7f27b0 Mon Sep 17 00:00:00 2001 From: Stephan Heijl Date: Mon, 6 May 2019 10:54:34 +0200 Subject: [PATCH 28/35] Slightly adjusted test to 'fail' for regular SMOTE. --- imblearn/over_sampling/_smote.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/imblearn/over_sampling/_smote.py b/imblearn/over_sampling/_smote.py index 55bac1fc3..725553c3e 100644 --- a/imblearn/over_sampling/_smote.py +++ b/imblearn/over_sampling/_smote.py @@ -1142,11 +1142,15 @@ class KMeansSMOTE(BaseSMOTE): Examples -------- + >>> import numpy as np >>> from imblearn.over_sampling import KMeansSMOTE >>> from sklearn.datasets import make_blobs - >>> blob_sizes = [100, 800, 100] - >>> X, y = make_blobs(blob_sizes, + >>> blobs = [100, 800, 100] + >>> X, y = make_blobs(blobs, ... 
centers=[(-10, 0), (0,0), (10, 0)]) + >>> # Add a single 0 sample in the middle blob + >>> X = np.concatenate([X, [[0, 0]]]) + >>> y = np.append(y, 0) >>> # Make this a binary classification problem >>> y = y == 1 >>> sm = KMeansSMOTE(random_state=42) @@ -1155,8 +1159,8 @@ class KMeansSMOTE(BaseSMOTE): >>> n_res_in_middle = ((X_res[:, 0] > -5) & (X_res[:, 0] < 5)).sum() >>> print("Samples in the middle blob: %s" % n_res_in_middle) Samples in the middle blob: 800 - >>> print("Same as middle blob: %s" % (n_res_in_middle == blob_sizes[1])) - Same samples in middle blob: True + >>> print("Middle blob unchanged: %s" % (n_res_in_middle == blobs[1] + 1)) + Middle blob unchanged: True >>> print("More 0 samples: %s " % ((y_res == 0).sum() > (y == 0).sum())) More 0 samples: True From 4a414c36e45fe3255ca67c8159198d04d7cf5459 Mon Sep 17 00:00:00 2001 From: Stephan Heijl Date: Mon, 6 May 2019 11:19:37 +0200 Subject: [PATCH 29/35] Fix expected print output --- imblearn/over_sampling/_smote.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/imblearn/over_sampling/_smote.py b/imblearn/over_sampling/_smote.py index 725553c3e..2501c42ae 100644 --- a/imblearn/over_sampling/_smote.py +++ b/imblearn/over_sampling/_smote.py @@ -1158,7 +1158,7 @@ class KMeansSMOTE(BaseSMOTE): >>> # Find the number of new samples in the middle blob >>> n_res_in_middle = ((X_res[:, 0] > -5) & (X_res[:, 0] < 5)).sum() >>> print("Samples in the middle blob: %s" % n_res_in_middle) - Samples in the middle blob: 800 + Samples in the middle blob: 801 >>> print("Middle blob unchanged: %s" % (n_res_in_middle == blobs[1] + 1)) Middle blob unchanged: True >>> print("More 0 samples: %s " % ((y_res == 0).sum() > (y == 0).sum())) From d9fa137f4d477eee7a54b5fc0a6aec54f296d720 Mon Sep 17 00:00:00 2001 From: Stephan Heijl Date: Mon, 6 May 2019 11:21:15 +0200 Subject: [PATCH 30/35] Added ratio back to pass check_samplers_ratio_fit_resample test --- imblearn/over_sampling/_smote.py | 3 ++- 1 file changed, 
2 insertions(+), 1 deletion(-) diff --git a/imblearn/over_sampling/_smote.py b/imblearn/over_sampling/_smote.py index 2501c42ae..bdc71cb95 100644 --- a/imblearn/over_sampling/_smote.py +++ b/imblearn/over_sampling/_smote.py @@ -1172,7 +1172,8 @@ def __init__(self, n_jobs=1, kmeans_estimator=None, cluster_balance_threshold="auto", - density_exponent="auto"): + density_exponent="auto", + ratio=None): super(KMeansSMOTE, self).__init__( sampling_strategy=sampling_strategy, random_state=random_state, From cf1b1fe2c1e6f07146e1a330cac0718a8bd9a5a8 Mon Sep 17 00:00:00 2001 From: Stephan Heijl Date: Mon, 6 May 2019 11:30:30 +0200 Subject: [PATCH 31/35] Added KMeansSMOTE to DONT_SUPPORT_RATIO and removed space from print --- imblearn/over_sampling/_smote.py | 5 ++--- imblearn/utils/estimator_checks.py | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/imblearn/over_sampling/_smote.py b/imblearn/over_sampling/_smote.py index bdc71cb95..63614ff82 100644 --- a/imblearn/over_sampling/_smote.py +++ b/imblearn/over_sampling/_smote.py @@ -1161,7 +1161,7 @@ class KMeansSMOTE(BaseSMOTE): Samples in the middle blob: 801 >>> print("Middle blob unchanged: %s" % (n_res_in_middle == blobs[1] + 1)) Middle blob unchanged: True - >>> print("More 0 samples: %s " % ((y_res == 0).sum() > (y == 0).sum())) + >>> print("More 0 samples: %s" % ((y_res == 0).sum() > (y == 0).sum())) More 0 samples: True """ @@ -1172,8 +1172,7 @@ def __init__(self, n_jobs=1, kmeans_estimator=None, cluster_balance_threshold="auto", - density_exponent="auto", - ratio=None): + density_exponent="auto"): super(KMeansSMOTE, self).__init__( sampling_strategy=sampling_strategy, random_state=random_state, diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index 13b14affd..30f27c2b9 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -33,7 +33,7 @@ from imblearn.over_sampling import SMOTE from imblearn.under_sampling import NearMiss, 
ClusterCentroids -DONT_SUPPORT_RATIO = ['SVMSMOTE', 'BorderlineSMOTE'] +DONT_SUPPORT_RATIO = ['SVMSMOTE', 'BorderlineSMOTE', 'KMeansSMOTE'] SUPPORT_STRING = ['RandomUnderSampler', 'RandomOverSampler'] HAVE_SAMPLE_INDICES = [ 'RandomOverSampler', 'RandomUnderSampler', 'InstanceHardnessThreshold', From 0c4dd16aaaa28995a78396362eb50100e8fa7092 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 12 Jun 2019 14:16:37 +0200 Subject: [PATCH 32/35] cleaning --- .../plot_comparison_over_sampling.py | 4 +- imblearn/over_sampling/_smote.py | 83 +++++++---- .../over_sampling/tests/test_kmeans_smote.py | 140 +++++++----------- 3 files changed, 108 insertions(+), 119 deletions(-) diff --git a/examples/over-sampling/plot_comparison_over_sampling.py b/examples/over-sampling/plot_comparison_over_sampling.py index ced532ec7..003e4e2c6 100644 --- a/examples/over-sampling/plot_comparison_over_sampling.py +++ b/examples/over-sampling/plot_comparison_over_sampling.py @@ -205,7 +205,9 @@ def _fit_resample(self, X, y): # SMOTE proposes several variants by identifying specific samples to consider # during the resampling. The borderline version will detect which point to # select which are in the border between two classes. The SVM version will use -# the support vectors found using an SVM algorithm to create new samples. +# the support vectors found using an SVM algorithm to create new sample while +# the KMeans version will make a clustering before to generate samples in each +# cluster independently depending each cluster density. 
fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6), (ax7, ax8), diff --git a/imblearn/over_sampling/_smote.py b/imblearn/over_sampling/_smote.py index 8852d535d..a7109ab48 100644 --- a/imblearn/over_sampling/_smote.py +++ b/imblearn/over_sampling/_smote.py @@ -17,7 +17,7 @@ from scipy import sparse from sklearn.base import clone -from sklearn.cluster import MiniBatchKMeans as KMeans +from sklearn.cluster import MiniBatchKMeans from sklearn.metrics import pairwise_distances from sklearn.preprocessing import OneHotEncoder from sklearn.svm import SVC @@ -1120,22 +1120,35 @@ class KMeansSMOTE(BaseSMOTE): n_jobs : int, optional (default=1) The number of threads to open if possible. - kmeans_estimator : int or object, optional (default=KMeans()) - A KMeans instance or the number of clusters to be used. + kmeans_estimator : int or object, optional (default=MiniBatchKMeans()) + A KMeans instance or the number of clusters to be used. By default, + we used a :class:`sklearn.cluster.MiniBatchKMeans` which tend to be + better with large number of samples. - cluster_balance_threshold: str or float, optional (default="auto") + cluster_balance_threshold : str or float, optional (default="auto") The threshold at which a cluster is called balanced and where samples of the class selected for SMOTE will be oversampled. If "auto", this will be determined by the ratio for each class, or it can be set manually. - density_exponent: str or float, optional (default="auto") + density_exponent : str or float, optional (default="auto") This exponent is used to determine the density of a cluster. Leaving - this to 'auto' will use a feature-length based exponent. + this to "auto" will use a feature-length based exponent. + + Attributes + ---------- + kmeans_estimator_ : estimator + The fitted clustering method used before to apply SMOTE. + + nn_k_ : estimator + The fitted k-NN estimator used in SMOTE. 
+ + cluster_balance_threshold_ : float + The threshold used during ``fit`` for calling a cluster balanced. References ---------- - .. [3] Felix Last, Georgios Douzas, Fernando Bacao, "Oversampling for + .. [1] Felix Last, Georgios Douzas, Fernando Bacao, "Oversampling for Imbalanced Learning Based on K-Means and SMOTE" https://arxiv.org/abs/1711.00837 @@ -1146,8 +1159,7 @@ class KMeansSMOTE(BaseSMOTE): >>> from imblearn.over_sampling import KMeansSMOTE >>> from sklearn.datasets import make_blobs >>> blobs = [100, 800, 100] - >>> X, y = make_blobs(blobs, - ... centers=[(-10, 0), (0,0), (10, 0)]) + >>> X, y = make_blobs(blobs, centers=[(-10, 0), (0,0), (10, 0)]) >>> # Add a single 0 sample in the middle blob >>> X = np.concatenate([X, [[0, 0]]]) >>> y = np.append(y, 0) @@ -1173,32 +1185,42 @@ def __init__(self, kmeans_estimator=None, cluster_balance_threshold="auto", density_exponent="auto"): - - super(KMeansSMOTE, self).__init__( + super().__init__( sampling_strategy=sampling_strategy, random_state=random_state, k_neighbors=k_neighbors, n_jobs=n_jobs) - self.kmeans_estimator = kmeans_estimator self.cluster_balance_threshold = cluster_balance_threshold self.density_exponent = density_exponent def _validate_estimator(self): - super(KMeansSMOTE, self)._validate_estimator() + super()._validate_estimator() if self.kmeans_estimator is None: - self.kmeans_estimator_ = KMeans( + self.kmeans_estimator_ = MiniBatchKMeans( random_state=self.random_state) elif isinstance(self.kmeans_estimator, int): - self.kmeans_estimator_ = KMeans( + self.kmeans_estimator_ = MiniBatchKMeans( n_clusters=self.kmeans_estimator, random_state=self.random_state) else: self.kmeans_estimator_ = clone(self.kmeans_estimator) + # validate the parameters + for param_name in ('cluster_balance_threshold', 'density_exponent'): + param = getattr(self, param_name) + if isinstance(param, str) and param != 'auto': + raise ValueError( + "'{}' should be 'auto' when a string is passed. 
" + "Got {} instead.".format(param_name, repr(param)) + ) + self.cluster_balance_threshold_ = ( self.cluster_balance_threshold - if self.kmeans_estimator_.n_clusters != 1 else -np.inf) + if self.kmeans_estimator_.n_clusters != 1 else -np.inf + ) + def _find_cluster_sparsity(self, X): + """Compute the cluster sparsity.""" euclidean_distances = pairwise_distances(X, metric="euclidean", n_jobs=self.n_jobs) # negate diagonal elements @@ -1218,7 +1240,6 @@ def _fit_resample(self, X, y): def _sample(self, X, y): self._validate_estimator() - check_random_state(self.random_state) X_resampled = X.copy() y_resampled = y.copy() total_inp_samples = sum(self.sampling_strategy_.values()) @@ -1227,41 +1248,44 @@ def _sample(self, X, y): if n_samples == 0: continue - target_class_indices = np.flatnonzero(y == class_sample) - X_class = safe_indexing(X, target_class_indices) + # target_class_indices = np.flatnonzero(y == class_sample) + # X_class = safe_indexing(X, target_class_indices) X_clusters = self.kmeans_estimator_.fit_predict(X) - valid_clusters = [] cluster_sparsities = [] - # Identify valid clusters + # identify cluster which are answering the requirements for cluster_idx in range(self.kmeans_estimator_.n_clusters): - cluster_mask = np.flatnonzero(X_clusters == cluster_idx) + cluster_mask = np.flatnonzero(X_clusters == cluster_idx) X_cluster = safe_indexing(X, cluster_mask) y_cluster = safe_indexing(y, cluster_mask) cluster_class_mean = (y_cluster == class_sample).mean() - if self.cluster_balance_threshold == "auto": + if self.cluster_balance_threshold_ == "auto": balance_threshold = n_samples / total_inp_samples / 2 else: - balance_threshold = self.cluster_balance_threshold + balance_threshold = self.cluster_balance_threshold_ + # the cluster is already considered balanced if cluster_class_mean < balance_threshold: continue + # not enough samples to apply SMOTE anticipated_samples = cluster_class_mean * X_cluster.shape[0] if anticipated_samples < self.nn_k_.n_neighbors: 
continue X_cluster_class = safe_indexing( - X_cluster, np.flatnonzero(y_cluster == class_sample)) + X_cluster, np.flatnonzero(y_cluster == class_sample) + ) valid_clusters.append(cluster_mask) cluster_sparsities.append( - self._find_cluster_sparsity(X_cluster_class)) + self._find_cluster_sparsity(X_cluster_class) + ) cluster_sparsities = np.array(cluster_sparsities) cluster_weights = cluster_sparsities / cluster_sparsities.sum() @@ -1278,15 +1302,16 @@ def _sample(self, X, y): y_cluster = safe_indexing(y, valid_cluster) X_cluster_class = safe_indexing( - X_cluster, np.flatnonzero(y_cluster == class_sample)) + X_cluster, np.flatnonzero(y_cluster == class_sample) + ) self.nn_k_.fit(X_cluster_class) - nns = self.nn_k_.kneighbors(X_cluster_class, return_distance=False)[:, 1:] cluster_n_samples = int(math.ceil( - n_samples * cluster_weights[valid_cluster_idx])) + n_samples * cluster_weights[valid_cluster_idx]) + ) X_new, y_new = self._make_samples(X_cluster_class, y.dtype, diff --git a/imblearn/over_sampling/tests/test_kmeans_smote.py b/imblearn/over_sampling/tests/test_kmeans_smote.py index 8e43f63f2..9bb9e9a62 100644 --- a/imblearn/over_sampling/tests/test_kmeans_smote.py +++ b/imblearn/over_sampling/tests/test_kmeans_smote.py @@ -3,9 +3,13 @@ from sklearn.utils.testing import assert_allclose from sklearn.utils.testing import assert_array_equal + from sklearn.cluster import KMeans +from sklearn.cluster import MiniBatchKMeans +from sklearn.neighbors import NearestNeighbors -from imblearn.over_sampling import (KMeansSMOTE, SMOTE) +from imblearn.over_sampling import KMeansSMOTE +from imblearn.over_sampling import SMOTE @pytest.fixture @@ -38,40 +42,33 @@ def test_kmeans_smote(data): assert_allclose(X_res_1, X_res_2) assert_array_equal(y_res_1, y_res_2) + assert kmeans_smote.nn_k_.n_neighbors == 6 + assert kmeans_smote.kmeans_estimator_.n_clusters == 1 + assert 'batch_size' in kmeans_smote.kmeans_estimator_.get_params() + -def test_sample_kmeans_custom(data): 
+@pytest.mark.parametrize("k_neighbors", [2, NearestNeighbors(n_neighbors=3)]) +@pytest.mark.parametrize( + "kmeans_estimator", + [3, + KMeans(n_clusters=3, random_state=42), + MiniBatchKMeans(n_clusters=3, random_state=42)] +) +def test_sample_kmeans_custom(data, k_neighbors, kmeans_estimator): X, y = data - smote = KMeansSMOTE(random_state=42, - kmeans_estimator=KMeans(n_clusters=3, random_state=42), - k_neighbors=2) - X_resampled, y_resampled = smote.fit_sample(X, y) - print(X_resampled.tolist()) - X_gt = np.array([ - [0.11622591, -0.0317206], [0.77481731, 0.60935141], - [1.25192108, -0.22367336], [0.53366841, -0.30312976], - [1.52091956, -0.49283504], [-0.28162401, -2.10400981], - [0.83680821, 1.72827342], [0.3084254, 0.33299982], - [0.70472253, -0.73309052], [0.28893132, -0.38761769], - [1.15514042, 0.0129463], [0.88407872, 0.35454207], - [1.31301027, -0.92648734], [-1.11515198, -0.93689695], - [-0.18410027, -0.45194484], [0.9281014, 0.53085498], - [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], - [0.08711622, 0.93259929], [1.70580611, -0.11219234], - [1.34544799, -0.85882949], [0.65524146, -0.40037012], - [1.30946198, -0.88566536], [1.28791949, -0.25969361] - ]) - - y_gt = np.array([ - 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0 - ]) - - assert_allclose(X_resampled, X_gt) - assert_array_equal(y_resampled, y_gt) - - -def test_sample_kmeans_not_enough_clusters(data): - np.random.seed(42) - X = np.random.random((30, 2)) + kmeans_smote = KMeansSMOTE(random_state=42, + kmeans_estimator=kmeans_estimator, + k_neighbors=k_neighbors) + X_resampled, y_resampled = kmeans_smote.fit_sample(X, y) + assert X_resampled.shape == (24, 2) + assert y_resampled.shape == (24,) + + assert kmeans_smote.nn_k_.n_neighbors == 3 + assert kmeans_smote.kmeans_estimator_.n_clusters == 3 + +def test_sample_kmeans_not_enough_clusters(): + rng = np.random.RandomState(42) + X = rng.randn(30, 2) y = np.array([1] * 20 + [0] * 10) smote = 
KMeansSMOTE(random_state=42, @@ -81,62 +78,27 @@ def test_sample_kmeans_not_enough_clusters(data): smote.fit_sample(X, y) -def test_sample_kmeans(data): +@pytest.mark.parametrize("density_exponent", ["auto", 2]) +@pytest.mark.parametrize("cluster_balance_threshold", ["auto", 0.8]) +def test_sample_kmeans_density_estimation(data, density_exponent, + cluster_balance_threshold): X, y = data smote = KMeansSMOTE(random_state=42, - kmeans_estimator=3, - k_neighbors=2) - X_resampled, y_resampled = smote.fit_sample(X, y) - print(X_resampled.tolist()) - X_gt = np.array([ - [0.11622591, -0.0317206], [0.77481731, 0.60935141], - [1.25192108, -0.22367336], [0.53366841, -0.30312976], - [1.52091956, -0.49283504], [-0.28162401, -2.10400981], - [0.83680821, 1.72827342], [0.3084254, 0.33299982], - [0.70472253, -0.73309052], [0.28893132, -0.38761769], - [1.15514042, 0.0129463], [0.88407872, 0.35454207], - [1.31301027, -0.92648734], [-1.11515198, -0.93689695], - [-0.18410027, -0.45194484], [0.9281014, 0.53085498], - [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], - [0.08711622, 0.93259929], [1.70580611, -0.11219234], - [1.34544799, -0.85882949], [0.65524146, -0.40037012], - [1.30946198, -0.88566536], [1.28791949, -0.25969361] - ]) - - y_gt = np.array([ - 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0 - ]) - - assert_allclose(X_resampled, X_gt) - assert_array_equal(y_resampled, y_gt) - - -def test_sample_kmeans_density_estimation(data): + density_exponent=density_exponent, + cluster_balance_threshold=cluster_balance_threshold) + smote.fit_sample(X, y) + + +@pytest.mark.parametrize( + "density_exponent, cluster_balance_threshold", + [('xxx', 'auto'), ('auto', 'xxx')] +) +def test_kmeans_smote_param_error(data, density_exponent, + cluster_balance_threshold): X, y = data - smote = KMeansSMOTE(random_state=42, - kmeans_estimator=3, - k_neighbors=2, - density_exponent=2) - X_resampled, y_resampled = smote.fit_sample(X, y) - - X_gt = np.array([ - [0.11622591, 
-0.0317206], [0.77481731, 0.60935141], - [1.25192108, -0.22367336], [0.53366841, -0.30312976], - [1.52091956, -0.49283504], [-0.28162401, -2.10400981], - [0.83680821, 1.72827342], [0.3084254, 0.33299982], - [0.70472253, -0.73309052], [0.28893132, -0.38761769], - [1.15514042, 0.0129463], [0.88407872, 0.35454207], - [1.31301027, -0.92648734], [-1.11515198, -0.93689695], - [-0.18410027, -0.45194484], [0.9281014, 0.53085498], - [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], - [0.08711622, 0.93259929], [1.70580611, -0.11219234], - [1.34544799, -0.85882949], [0.65524146, -0.40037012], - [1.30946198, -0.88566536], [1.28791949, -0.25969361] - ]) - - y_gt = np.array([ - 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0 - ]) - - assert_allclose(X_resampled, X_gt) - assert_array_equal(y_resampled, y_gt) + kmeans_smote = KMeansSMOTE( + density_exponent=density_exponent, + cluster_balance_threshold=cluster_balance_threshold + ) + with pytest.raises(ValueError, match="should be 'auto' when a string"): + kmeans_smote.fit_resample(X, y) From f4ec98028a977b10bbf661a40b6e5654bff793c5 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 12 Jun 2019 14:23:38 +0200 Subject: [PATCH 33/35] DOC: add an entry in documentation --- doc/over_sampling.rst | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/doc/over_sampling.rst b/doc/over_sampling.rst index 2d78825cb..6159e925b 100644 --- a/doc/over_sampling.rst +++ b/doc/over_sampling.rst @@ -152,8 +152,8 @@ nearest neighbors class. Those variants are presented in the figure below. 
:align: center -The :class:`BorderlineSMOTE` [HWB2005]_ and :class:`SVMSMOTE` [NCK2009]_ offer -some variant of the SMOTE algorithm:: +The :class:`BorderlineSMOTE` [HWB2005]_, :class:`SVMSMOTE` [NCK2009]_, and +:class:`KMeansSMOTE` [LDB2017]_ offer some variant of the SMOTE algorithm:: >>> from imblearn.over_sampling import BorderlineSMOTE >>> X_resampled, y_resampled = BorderlineSMOTE().fit_resample(X, y) @@ -209,6 +209,10 @@ other extra interpolation. Knowledge Engineering and Soft Data Paradigms, 3(1), pp.4-21, 2009. + .. [LDB2017] Felix Last, Georgios Douzas, Fernando Bacao, "Oversampling for + Imbalanced Learning Based on K-Means and SMOTE" + https://arxiv.org/abs/1711.00837 + Mathematical formulation ======================== @@ -266,6 +270,10 @@ parameter of the SVM classifier allows to select more or less support vectors. For both borderline and SVM SMOTE, a neighborhood is defined using the parameter ``m_neighbors`` to decide if a sample is in danger, safe, or noise. +**KMeans** SMOTE --- cf. to :class:`KMeansSMOTE` --- uses a KMeans clustering +method before applying SMOTE. The clustering will group samples together and +generate new samples depending on the cluster density. + ADASYN works similarly to the regular SMOTE. 
However, the number of samples generated for each :math:`x_i` is proportional to the number of samples which are not from the same class than :math:`x_i` in a given From c3a15020db572c9b7fc1db43b0bcc0429bd4bf6e Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 12 Jun 2019 14:24:29 +0200 Subject: [PATCH 34/35] DOC: add entry in API documentation --- doc/api.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/api.rst b/doc/api.rst index 750c402f8..e98dfe47b 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -71,6 +71,7 @@ Prototype selection over_sampling.ADASYN over_sampling.BorderlineSMOTE + over_sampling.KMeansSMOTE over_sampling.RandomOverSampler over_sampling.SMOTE over_sampling.SMOTENC From 032842ef0300b73b2a4523c1160976c30936ccf9 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 12 Jun 2019 15:19:57 +0200 Subject: [PATCH 35/35] DOC: add whats new entry --- doc/whats_new/v0.5.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/whats_new/v0.5.rst b/doc/whats_new/v0.5.rst index 6ff15db79..a1a20baa6 100644 --- a/doc/whats_new/v0.5.rst +++ b/doc/whats_new/v0.5.rst @@ -37,6 +37,10 @@ Enhancement and issue template showing how to print system and dependency information from the command line. :pr:`557` by :user:`Alexander L. Hayes `. +- Add :class:`imblearn.over_sampling.KMeansSMOTE` which is an over-sampler + clustering points before applying SMOTE. + :pr:`435` by :user:`Stephan Heijl `. + Maintenance ...........