fixed variable name, added doc and test

laurallu · laurallu · commit 609c4fc4fec5 · 2019-11-11T13:53:09.000-05:00
diff --git a/doc/over_sampling.rst b/doc/over_sampling.rst
@@ -152,8 +152,9 @@ nearest neighbors class. Those variants are presented in the figure below.
    :align: center
 
 
-The :class:`BorderlineSMOTE` [HWB2005]_, :class:`SVMSMOTE` [NCK2009]_, and
-:class:`KMeansSMOTE` [LDB2017]_ offer some variant of the SMOTE algorithm::
+The :class:`BorderlineSMOTE` [HWB2005]_, :class:`SVMSMOTE` [NCK2009]_,
+:class:`KMeansSMOTE` [LDB2017]_ and :class:`SafeLevelSMOTE` [BSL2009]_
+offer some variant of the SMOTE algorithm::
 
   >>> from imblearn.over_sampling import BorderlineSMOTE
   >>> X_resampled, y_resampled = BorderlineSMOTE().fit_resample(X, y)
@@ -213,6 +214,14 @@ other extra interpolation.
                Imbalanced Learning Based on K-Means and SMOTE"
                https://arxiv.org/abs/1711.00837
 
+     [BSL2009] C. Bunkhumpornpat, K. Sinapiromsaran, C. Lursinsap,
+               "Safe-level-SMOTE: Safe-level-synthetic minority over-sampling
+               technique for handling the class imbalanced problem," In:
+               Theeramunkong T., Kijsirikul B., Cercone N., Ho TB. (eds)
+               Advances in Knowledge Discovery and Data Mining. PAKDD 2009.
+               Lecture Notes in Computer Science, vol 5476. Springer, Berlin,
+               Heidelberg, 475-482, 2009.
+
 Mathematical formulation
 ========================
 
@@ -274,6 +283,11 @@ parameter ``m_neighbors`` to decide if a sample is in danger, safe, or noise.
 method before to apply SMOTE. The clustering will group samples together and
 generate new samples depending of the cluster density.
 
+**SafeLevel** SMOTE --- cf. to :class:`SafeLevelSMOTE` --- uses the safe level
+(the number of positive instances in nearest neighbors) to generate a synthetic
+instance. Compared to regular SMOTE, the new instance is positioned closer to
+the positive instance with larger safe level.
+
 ADASYN works similarly to the regular SMOTE. However, the number of
 samples generated for each :math:`x_i` is proportional to the number of samples
 which are not from the same class than :math:`x_i` in a given
diff --git a/imblearn/over_sampling/__init__.py b/imblearn/over_sampling/__init__.py
@@ -10,7 +10,7 @@
 from ._smote import KMeansSMOTE
 from ._smote import SVMSMOTE
 from ._smote import SMOTENC
-from ._smote import SLSMOTE
+from ._smote import SafeLevelSMOTE
 
 __all__ = [
     "ADASYN",
@@ -20,5 +20,5 @@
     "BorderlineSMOTE",
     "SVMSMOTE",
     "SMOTENC",
-    "SLSMOTE",
+    "SafeLevelSMOTE",
 ]
diff --git a/imblearn/over_sampling/_smote.py b/imblearn/over_sampling/_smote.py
@@ -284,6 +284,11 @@ class BorderlineSMOTE(BaseSMOTE):
 
     SVMSMOTE : Over-sample using SVM-SMOTE variant.
 
+    KMeansSMOTE: Over-sample using KMeans-SMOTE variant.
+
+    SafeLevelSMOTE: Over-sample using SafeLevel-SMOTE variant.
+
+
     ADASYN : Over-sample using ADASYN.
 
     References
@@ -484,6 +489,10 @@ class SVMSMOTE(BaseSMOTE):
 
     BorderlineSMOTE : Over-sample using Borderline-SMOTE.
 
+    KMeansSMOTE: Over-sample using KMeans-SMOTE variant.
+
+    SafeLevelSMOTE: Over-sample using SafeLevel-SMOTE variant.
+
     ADASYN : Over-sample using ADASYN.
 
     References
@@ -695,6 +704,10 @@ class SMOTE(BaseSMOTE):
 
     SVMSMOTE : Over-sample using the SVM-SMOTE variant.
 
+    KMeansSMOTE: Over-sample using KMeans-SMOTE variant.
+
+    SafeLevelSMOTE: Over-sample using SafeLevel-SMOTE variant.
+
     ADASYN : Over-sample using ADASYN.
 
     References
@@ -864,6 +877,10 @@ class SMOTENC(SMOTE):
 
     BorderlineSMOTE : Over-sample using Borderline-SMOTE variant.
 
+    KMeansSMOTE: Over-sample using KMeans-SMOTE variant.
+
+    SafeLevelSMOTE: Over-sample using SafeLevel-SMOTE variant.
+
     ADASYN : Over-sample using ADASYN.
 
     References
@@ -1318,7 +1335,7 @@ def _fit_resample(self, X, y):
     sampling_strategy=BaseOverSampler._sampling_strategy_docstring,
     random_state=_random_state_docstring,
 )
-class SLSMOTE(BaseSMOTE):
+class SafeLevelSMOTE(BaseSMOTE):
     """Class to perform over-sampling using safe-level SMOTE.
     This is an implementation of the Safe-level-SMOTE described in [2]_.
 
@@ -1389,13 +1406,13 @@ class SLSMOTE(BaseSMOTE):
     >>> from collections import Counter
     >>> from sklearn.datasets import make_classification
     >>> from imblearn.over_sampling import \
-SLSMOTE # doctest: +NORMALIZE_WHITESPACE
+SafeLevelSMOTE # doctest: +NORMALIZE_WHITESPACE
     >>> X, y = make_classification(n_classes=2, class_sep=2,
     ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0,
     ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)
     >>> print('Original dataset shape %s' % Counter(y))
     Original dataset shape Counter({{1: 900, 0: 100}})
-    >>> sm = SLSMOTE(random_state=42)
+    >>> sm = SafeLevelSMOTE(random_state=42)
     >>> X_res, y_res = sm.fit_resample(X, y)
     >>> print('Resampled dataset shape %s' % Counter(y_res))
     Resampled dataset shape Counter({{0: 900, 1: 900}})
@@ -1415,7 +1432,7 @@ def __init__(self,
 
         self.m_neighbors = m_neighbors
 
-    def _assign_sl(self, nn_estimator, samples, target_class, y):
+    def _assign_safe_levels(self, nn_estimator, samples, target_class, y):
         '''
         Assign the safe levels to the instances in the target class.
 
@@ -1444,8 +1461,8 @@ def _assign_sl(self, nn_estimator, samples, target_class, y):
 
         x = nn_estimator.kneighbors(samples, return_distance=False)[:, 1:]
         nn_label = (y[x] == target_class).astype(int)
-        sl = np.sum(nn_label, axis=1)
-        return sl
+        safe_levels = np.sum(nn_label, axis=1)
+        return safe_levels
 
     def _validate_estimator(self):
         super()._validate_estimator()
@@ -1466,28 +1483,30 @@ def _fit_resample(self, X, y):
             X_class = _safe_indexing(X, target_class_indices)
 
             self.nn_m_.fit(X)
-            sl = self._assign_sl(self.nn_m_, X_class, class_sample, y)
+            safe_levels = self._assign_safe_levels(
+                self.nn_m_, X_class, class_sample, y)
 
             # filter the points in X_class that have safe level >0
             # If safe level = 0, the point is not used to
             # generate synthetic instances
-            X_safe_indices = np.flatnonzero(sl != 0)
+            X_safe_indices = np.flatnonzero(safe_levels != 0)
             X_safe_class = _safe_indexing(X_class, X_safe_indices)
 
             self.nn_k_.fit(X_class)
             nns = self.nn_k_.kneighbors(X_safe_class,
                                         return_distance=False)[:, 1:]
 
-            sl_safe_class = sl[X_safe_indices]
-            sl_nns = sl[nns]
+            sl_safe_class = safe_levels[X_safe_indices]
+            sl_nns = safe_levels[nns]
             sl_safe_t = np.array([sl_safe_class]).transpose()
             with np.errstate(divide='ignore'):
-                sl_ratio = np.divide(sl_safe_t, sl_nns)
+                safe_level_ratio = np.divide(sl_safe_t, sl_nns)
 
-            X_new, y_new = self._make_samples_sl(X_safe_class, y.dtype,
-                                                 class_sample, X_class,
-                                                 nns, n_samples, sl_ratio,
-                                                 1.0)
+            X_new, y_new = self._make_samples_safelevel(X_safe_class, y.dtype,
+                                                        class_sample, X_class,
+                                                        nns, n_samples,
+                                                        safe_level_ratio,
+                                                        1.0)
 
             if sparse.issparse(X_new):
                 X_resampled = sparse.vstack([X_resampled, X_new])
@@ -1497,8 +1516,8 @@ def _fit_resample(self, X, y):
 
         return X_resampled, y_resampled
 
-    def _make_samples_sl(self, X, y_dtype, y_type, nn_data, nn_num,
-                         n_samples, sl_ratio, step_size=1.):
+    def _make_samples_safelevel(self, X, y_dtype, y_type, nn_data, nn_num,
+                                n_samples, safe_level_ratio, step_size=1.):
         """A support function that returns artificial samples using
         safe-level SMOTE. It is similar to _make_samples method for SMOTE.
 
@@ -1524,7 +1543,7 @@ def _make_samples_sl(self, X, y_dtype, y_type, nn_data, nn_num,
         n_samples : int
             The number of samples to generate.
 
-        sl_ratio: ndarray, shape (n_samples_safe, k_nearest_neighbours)
+        safe_level_ratio: ndarray, shape (n_samples_safe, k_nearest_neighbours)
 
         step_size : float, optional (default=1.)
             The step size to create samples.
@@ -1546,8 +1565,8 @@ def _make_samples_sl(self, X, y_dtype, y_type, nn_data, nn_num,
                                                size=n_samples)
         rows = np.floor_divide(samples_indices, nn_num.shape[1])
         cols = np.mod(samples_indices, nn_num.shape[1])
-        gap_arr = step_size * self._vgenerate_gap(sl_ratio)
-        gaps = gap_arr.flatten()[samples_indices]
+        gap_array = step_size * self._vgenerate_gap(safe_level_ratio)
+        gaps = gap_array.flatten()[samples_indices]
 
         y_new = np.array([y_type] * n_samples, dtype=y_dtype)
 
@@ -1578,12 +1597,12 @@ def _make_samples_sl(self, X, y_dtype, y_type, nn_data, nn_num,
         return X_new, y_new
 
     def _generate_gap(self, a_ratio, rand_state=None):
-        """ generate gap according to sl_ratio, non-vectorized version.
+        """ generate gap according to safe_level_ratio, non-vectorized version.
 
         Parameters
         ----------
         a_ratio: float
-                 sl_ratio of a single data point
+                 safe_level_ratio of a single data point
 
         rand_state: random state object or int
 
@@ -1603,28 +1622,30 @@ def _generate_gap(self, a_ratio, rand_state=None):
         elif 0 < a_ratio < 1:
             gap = random_state.uniform(1-a_ratio, 1)
         else:
-            raise ValueError('sl_ratio should be nonegative')
+            raise ValueError('safe_level_ratio should be nonegative')
         return gap
 
-    def _vgenerate_gap(self, sl_ratio):
+    def _vgenerate_gap(self, safe_level_ratio):
         """
-        generate gap according to sl_ratio, vectorized version of _generate_gap
+        generate gap according to safe_level_ratio, vectorized version
+        of _generate_gap
 
         Parameters
         -----------
-        sl_ratio: ndarray  shape (n_samples_safe, k_nearest_neighbours)
-                  sl_ratio of all instances with safe_level>0 in the specified
-                  class
+        safe_level_ratio: ndarray  shape (n_samples_safe, k_nearest_neighbours)
+                  safe_level_ratio of all instances with safe_level>0 in the
+                  specified class
 
         Returns
         ------------
-        gap_arr: ndarray  shape (n_samples_safe, k_nearest_neighbours)
+        gap_array: ndarray  shape (n_samples_safe, k_nearest_neighbours)
                  the gap for all instances with safe_level>0 in the specified
                  class
 
         """
         prng = check_random_state(self.random_state)
-        rand_state = prng.randint(sl_ratio.size+1, size=sl_ratio.shape)
+        rand_state = prng.randint(
+            safe_level_ratio.size+1, size=safe_level_ratio.shape)
         vgap = np.vectorize(self._generate_gap)
-        gap_arr = vgap(sl_ratio, rand_state)
-        return gap_arr
+        gap_array = vgap(safe_level_ratio, rand_state)
+        return gap_array
diff --git a/imblearn/over_sampling/tests/test_safelevel_smote.py b/imblearn/over_sampling/tests/test_safelevel_smote.py
@@ -1,13 +1,14 @@
 import pytest
 import numpy as np
+from collections import Counter
 
 from sklearn.neighbors import NearestNeighbors
 from scipy import sparse
 
 from sklearn.utils._testing import assert_allclose
 from sklearn.utils._testing import assert_array_equal
 
-from imblearn.over_sampling import SLSMOTE
+from imblearn.over_sampling import SafeLevelSMOTE
 
 
 def data_np():
@@ -27,40 +28,53 @@ def data_sparse(format):
     "data",
     [data_np(), data_sparse('csr'), data_sparse('csc')]
 )
-def test_slsmote(data):
+def test_safelevel_smote(data):
     y_gt = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0,
                      0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0])
     X, y = data
-    slsmote = SLSMOTE(random_state=42)
-    X_res, y_res = slsmote.fit_resample(X, y)
+    safelevel_smote = SafeLevelSMOTE(random_state=42)
+    X_res, y_res = safelevel_smote.fit_resample(X, y)
 
     assert X_res.shape == (24, 2)
     assert_array_equal(y_res, y_gt)
 
 
-def test_slsmote_nn():
+def test_sl_smote_nn():
     X, y = data_np()
-    slsmote = SLSMOTE(random_state=42)
-    slsmote_nn = SLSMOTE(
+    safelevel_smote = SafeLevelSMOTE(random_state=42)
+    safelevel_smote_nn = SafeLevelSMOTE(
         random_state=42,
         k_neighbors=NearestNeighbors(n_neighbors=6),
         m_neighbors=NearestNeighbors(n_neighbors=11),
     )
 
-    X_res_1, y_res_1 = slsmote.fit_resample(X, y)
-    X_res_2, y_res_2 = slsmote_nn.fit_resample(X, y)
+    X_res_1, y_res_1 = safelevel_smote.fit_resample(X, y)
+    X_res_2, y_res_2 = safelevel_smote_nn.fit_resample(X, y)
 
     assert_allclose(X_res_1, X_res_2)
     assert_array_equal(y_res_1, y_res_2)
 
 
-def test_slsmote_pd():
+def test_sl_smote_pd():
     pd = pytest.importorskip("pandas")
     X, y = data_np()
     X_pd = pd.DataFrame(X)
-    slsmote = SLSMOTE(random_state=42)
-    X_res, y_res = slsmote.fit_resample(X, y)
-    X_res_pd, y_res_pd = slsmote.fit_resample(X_pd, y)
+    safelevel_smote = SafeLevelSMOTE(random_state=42)
+    X_res, y_res = safelevel_smote.fit_resample(X, y)
+    X_res_pd, y_res_pd = safelevel_smote.fit_resample(X_pd, y)
 
     assert X_res_pd.tolist() == X_res.tolist()
     assert_allclose(y_res_pd, y_res)
+
+
+def test_sl_smote_multiclass():
+    rng = np.random.RandomState(42)
+    X = rng.randn(50, 2)
+    y = np.array([0] * 10 + [1] * 15 + [2] * 25)
+    safelevel_smote = SafeLevelSMOTE(random_state=42)
+    X_res, y_res = safelevel_smote.fit_resample(X, y)
+
+    count_y_res = Counter(y_res)
+    assert count_y_res[0] == 25
+    assert count_y_res[1] == 25
+    assert count_y_res[2] == 25