scikit-learn-contrib · glemaitre · Aug 3, 2017 · Aug 2, 2017 · Aug 3, 2017
diff --git a/doc/whats_new.rst b/doc/whats_new.rst
@@ -12,48 +12,69 @@ Changelog
 Bug fixes
 ---------
 
+- Fixed a bug in :func:`utils.check_ratio` such that an error is raised when
+  the number of samples required is negative. By `Guillaume Lemaitre`_.
+
 - Fixed a bug in :class:`under_sampling.NearMiss` version 3. The
   indices returned were wrong. By `Guillaume Lemaitre`_.
-- fixed bug for :class:`ensemble.BalanceCascade` and :class:`combine.SMOTEENN`
+
+- Fixed bug for :class:`ensemble.BalanceCascade` and :class:`combine.SMOTEENN`
   and :class:`SMOTETomek. By `Guillaume Lemaitre`_.`
 
 New features
 ~~~~~~~~~~~~
 
 - Turn off steps in :class:`pipeline.Pipeline` using the `None`
   object. By `Christos Aridas`_.
+
 - Add a fetching function `datasets.fetch_datasets` in order to get some
   imbalanced datasets useful for benchmarking. By `Guillaume Lemaitre`_.
 
 Enhancement
 ~~~~~~~~~~~
 
+- :func:`datasets.make_imbalance` takes a ratio similarly to other samplers. It
+  supports multiclass. By `Guillaume Lemaitre`_.
+
 - All the unit tests have been factorized and a `check_estimators` has
   been derived from scikit-learn. By `Guillaume Lemaitre`_.
+
 - Script for automatic build of conda packages and uploading. By
   `Guillaume Lemaitre`_
+
 - Remove seaborn dependence and improve the examples. By `Guillaume
   Lemaitre`_.
+
 - adapt all classes to multi-class resampling. By `Guillaume Lemaitre`_
 
 API changes summary
 ~~~~~~~~~~~~~~~~~~~
 
 - `__init__` has been removed from the :class:`base.SamplerMixin` to
   create a real mixin class. By `Guillaume Lemaitre`_.
+
 - creation of a module `exceptions` to handle consistant raising of
   errors. By `Guillaume Lemaitre`_.
+
 - creation of a module `utils.validation` to make checking of
   recurrent patterns. By `Guillaume Lemaitre`_.
+
 - move the under-sampling methods in `prototype_selection` and
   `prototype_generation` submodule to make a clearer dinstinction. By
   `Guillaume Lemaitre`_.
+
 - change `ratio` such that it can adapt to multiple class problems. By
   `Guillaume Lemaitre`_.
 
 Deprecation
 ~~~~~~~~~~~
 
+- Deprecation of the use of ``min_c_`` in :func:`datasets.make_imbalance`. By
+  `Guillaume Lemaitre`_.
+
+- Deprecation of the use of float in :func:`datasets.make_imbalance` for the
+  ratio parameter. By `Guillaume Lemaitre`_.
+
 - deprecate the use of float as ratio in favor of dictionary, string, or
   callable. By `Guillaume Lemaitre`_.
 

diff --git a/imblearn/base.py b/imblearn/base.py
@@ -11,7 +11,7 @@
 
 from sklearn.base import BaseEstimator
 from sklearn.externals import six
-from sklearn.utils import check_X_y, check_random_state
+from sklearn.utils import check_X_y
 from sklearn.utils.validation import check_is_fitted
 
 from .utils import check_ratio, check_target_type, hash_X_y

diff --git a/imblearn/datasets/imbalance.py b/imblearn/datasets/imbalance.py
@@ -7,10 +7,14 @@
 # License: MIT
 
 import logging
+import warnings
 from collections import Counter
+from numbers import Real
 
-import numpy as np
-from sklearn.utils import check_random_state, check_X_y
+from sklearn.utils import check_X_y
+
+from ..under_sampling.prototype_selection import RandomUnderSampler
+from ..utils import check_ratio
 
 LOGGER = logging.getLogger(__name__)
 
@@ -28,14 +32,23 @@ def make_imbalance(X, y, ratio, min_c_=None, random_state=None):
     y : ndarray, shape (n_samples, )
         Corresponding label for each sample in X.
 
-    ratio : float,
-        The desired ratio given by the number of samples in
-        the minority class over the the number of samples in
-        the majority class. Thus the ratio should be in the interval [0., 1.]
+    ratio : dict or callable
+        Ratio to use for resampling the data set.
+
+        - If ``dict``, the keys correspond to the targeted classes. The values
+          correspond to the desired number of samples.
+        - If callable, function taking ``y`` and returns a ``dict``. The keys
+          correspond to the targeted classes. The values correspond to the
+          desired number of samples.
 
     min_c_ : str or int, optional (default=None)
         The identifier of the class to be the minority class.
-        If None, min_c_ is set to be the current minority class.
+        If ``None``, ``min_c_`` is set to be the current minority class.
+        Only used when ``ratio`` is a float for back-compatibility.
+
+        .. deprecated:: 0.2
+           ``min_c_`` is deprecated in 0.2 and will be removed in 0.4. Use
+           ``ratio`` by passing a ``dict`` instead.
 
     random_state : int, RandomState instance or None, optional (default=None)
         If int, random_state is the seed used by the random number generator;
@@ -51,48 +64,57 @@ def make_imbalance(X, y, ratio, min_c_=None, random_state=None):
     y_resampled : ndarray, shape (n_samples_new)
         The corresponding label of `X_resampled`
 
-    """
-    if isinstance(ratio, float):
-        if ratio > 1:
-            raise ValueError('Ratio cannot be greater than one.'
-                             ' Got {}.'.format(ratio))
-        elif ratio <= 0:
-            raise ValueError('Ratio have to be strictly positive.'
-                             ' Got {}.'.format(ratio))
-    else:
-        raise ValueError('Ratio must be a float between 0.0 < ratio < 1.0'
-                         ' Got {} instead.'.format(ratio))
+    Examples
+    --------
+    >>> from collections import Counter
+    >>> from sklearn.datasets import load_iris
+    >>> from imblearn.datasets import make_imbalance
+
+    >>> data = load_iris()
+    >>> X, y = data.data, data.target
+    >>> print('Distribution before imbalancing: {}'.format(Counter(y)))
+    Distribution before imbalancing: Counter({0: 50, 1: 50, 2: 50})
+    >>> X_res, y_res = make_imbalance(X, y, ratio={0: 10, 1: 20, 2: 30},
+    ...                               random_state=42)
+    >>> print('Distribution after imbalancing: {}'.format(Counter(y_res)))
+    Distribution after imbalancing: Counter({2: 30, 1: 20, 0: 10})
 
+    """
     X, y = check_X_y(X, y)
-
-    random_state = check_random_state(random_state)
-
-    stats_c_ = Counter(y)
+    target_stats = Counter(y)
+    # restrict ratio to be a dict or a callable
+    if isinstance(ratio, dict) or callable(ratio):
+        ratio_ = check_ratio(ratio, y, 'under-sampling')
+    # FIXME: deprecated in 0.2 to be removed in 0.4
+    elif isinstance(ratio, Real):
+        if min_c_ is None:
+            min_c_ = min(target_stats, key=target_stats.get)
+        else:
+            warnings.warn("'min_c_' is deprecated in 0.2 and will be removed"
+                          " in 0.4. Use 'ratio' as dictionary instead.",
+                          DeprecationWarning)
+        warnings.warn("'ratio' being a float is deprecated in 0.2 and will not"
+                      " be supported in 0.4. Use a dictionary instead.",
+                      DeprecationWarning)
+        class_majority = max(target_stats, key=target_stats.get)
+        ratio_ = {}
+        for class_sample, n_sample in target_stats.items():
+            if class_sample == min_c_:
+                n_min_samples = int(target_stats[class_majority] * ratio)
+                ratio_[class_sample] = n_min_samples
+            else:
+                ratio_[class_sample] = n_sample
+        ratio_ = check_ratio(ratio_, y, 'under-sampling')
+    else:
+        raise ValueError("'ratio' has to be a dictionary or a function"
+                         " returning a dictionary. Got {} instead.".format(
+                             type(ratio)))
 
     LOGGER.info('The original target distribution in the dataset is: %s',
-                stats_c_)
-
-    if min_c_ is None:
-        min_c_ = min(stats_c_, key=stats_c_.get)
-
-    n_min_samples = int(np.count_nonzero(y != min_c_) * ratio)
-    if n_min_samples > stats_c_[min_c_]:
-        raise ValueError('Current imbalance ratio of data is lower than'
-                         ' desired ratio! Got {} > {}.'.format(
-                             n_min_samples, stats_c_[min_c_]))
-    if n_min_samples == 0:
-        raise ValueError('Not enough samples for desired ratio!'
-                         ' Got {}.'.format(n_min_samples))
-
-    mask = y == min_c_
-
-    idx_maj = np.where(~mask)[0]
-    idx_min = np.where(mask)[0]
-    idx_min = random_state.choice(idx_min, size=n_min_samples, replace=False)
-    idx = np.concatenate((idx_min, idx_maj), axis=0)
-
-    X_resampled, y_resampled = X[idx, :], y[idx]
-
+                target_stats)
+    rus = RandomUnderSampler(ratio=ratio_, replacement=False,
+                             random_state=random_state)
+    X_resampled, y_resampled = rus.fit_sample(X, y)
     LOGGER.info('Make the dataset imbalanced: %s', Counter(y_resampled))
 
     return X_resampled, y_resampled
diff --git a/imblearn/datasets/tests/test_imbalance.py b/imblearn/datasets/tests/test_imbalance.py
@@ -0,0 +1,62 @@
+"""Test the module make_imbalance."""
+# Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
+#          Christos Aridas
+# License: MIT
+
+
+from __future__ import print_function
+
+from collections import Counter
+
+import numpy as np
+
+from sklearn.datasets import load_iris
+from sklearn.utils.testing import (assert_equal, assert_raises_regex,
+                                   assert_warns_message)
+
+from imblearn.datasets import make_imbalance
+
+data = load_iris()
+X, Y = data.data, data.target
+
+
+def test_make_imbalance_error():
+    # we are reusing part of utils.check_ratio, however this is not covered in
+    # the common tests so we will repeat it here
+    ratio = {0: -100, 1: 50, 2: 50}
+    assert_raises_regex(ValueError, "in a class cannot be negative",
+                        make_imbalance, X, Y, ratio)
+    ratio = {0: 10, 1: 70}
+    assert_raises_regex(ValueError, "should be less or equal to the original",
+                        make_imbalance, X, Y, ratio)
+    y_ = np.zeros((X.shape[0], ))
+    ratio = {0: 10}
+    assert_raises_regex(ValueError, "needs to have more than 1 class.",
+                        make_imbalance, X, y_, ratio)
+    ratio = 'random-string'
+    assert_raises_regex(ValueError, "has to be a dictionary or a function",
+                        make_imbalance, X, Y, ratio)
+
+
+# FIXME: to be removed in 0.4 due to deprecation
+def test_make_imbalance_float():
+    X_, y_ = assert_warns_message(DeprecationWarning,
+                                  "'min_c_' is deprecated in 0.2",
+                                  make_imbalance, X, Y, ratio=0.5, min_c_=1)
+    X_, y_ = assert_warns_message(DeprecationWarning,
+                                  "'ratio' being a float is deprecated",
+                                  make_imbalance, X, Y, ratio=0.5, min_c_=1)
+    assert_equal(Counter(y_), {0: 50, 1: 25, 2: 50})
+    # resample without using min_c_
+    X_, y_ = make_imbalance(X_, y_, ratio=0.25, min_c_=None)
+    assert_equal(Counter(y_), {0: 50, 1: 12, 2: 50})
+
+
+def test_make_imbalance_dict():
+    ratio = {0: 10, 1: 20, 2: 30}
+    X_, y_ = make_imbalance(X, Y, ratio=ratio)
+    assert_equal(Counter(y_), ratio)
+
+    ratio = {0: 10, 1: 20}
+    X_, y_ = make_imbalance(X, Y, ratio=ratio)
+    assert_equal(Counter(y_), {0: 10, 1: 20, 2: 50})