diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index b5f7cc2f2..656fb04e9 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -12,9 +12,13 @@ Changelog
 Bug fixes
 ---------
 
+- Fixed a bug in :func:`utils.check_ratio` such that an error is raised when
+  the number of samples required is negative. By `Guillaume Lemaitre`_.
+
 - Fixed a bug in :class:`under_sampling.NearMiss` version 3. The indices
   returned were wrong. By `Guillaume Lemaitre`_.
-- fixed bug for :class:`ensemble.BalanceCascade` and :class:`combine.SMOTEENN`
+
+- Fixed bug for :class:`ensemble.BalanceCascade` and :class:`combine.SMOTEENN`
   and :class:`SMOTETomek. By `Guillaume Lemaitre`_.`
 
 New features
@@ -22,18 +26,25 @@ New features
 
 - Turn off steps in :class:`pipeline.Pipeline` using the `None`
   object. By `Christos Aridas`_.
+
 - Add a fetching function `datasets.fetch_datasets` in order to get some
   imbalanced datasets useful for benchmarking. By `Guillaume Lemaitre`_.
 
 Enhancement
 ~~~~~~~~~~~
 
+- :func:`datasets.make_imbalance` takes a ratio similarly to other samplers.
+  It supports multi-class targets. By `Guillaume Lemaitre`_.
+
 - All the unit tests have been factorized and a `check_estimators` has been
   derived from scikit-learn. By `Guillaume Lemaitre`_.
+
 - Script for automatic build of conda packages and uploading. By
   `Guillaume Lemaitre`_
+
 - Remove seaborn dependence and improve the examples. By `Guillaume
   Lemaitre`_.
+
 - adapt all classes to multi-class resampling. By `Guillaume Lemaitre`_
 
 API changes summary
@@ -41,19 +52,29 @@ API changes summary
 
 - `__init__` has been removed from the :class:`base.SamplerMixin` to create a
   real mixin class. By `Guillaume Lemaitre`_.
+
 - creation of a module `exceptions` to handle consistant raising of errors.
   By `Guillaume Lemaitre`_.
+
 - creation of a module `utils.validation` to make checking of recurrent
   patterns. By `Guillaume Lemaitre`_.
+
 - move the under-sampling methods in `prototype_selection` and
   `prototype_generation` submodule to make a clearer dinstinction. By
   `Guillaume Lemaitre`_.
+
 - change `ratio` such that it can adapt to multiple class problems. By
   `Guillaume Lemaitre`_.
 
 Deprecation
 ~~~~~~~~~~~
 
+- Deprecation of the use of ``min_c_`` in :func:`datasets.make_imbalance`. By
+  `Guillaume Lemaitre`_.
+
+- Deprecation of the use of float in :func:`datasets.make_imbalance` for the
+  ratio parameter. By `Guillaume Lemaitre`_.
+
 - deprecate the use of float as ratio in favor of dictionary, string, or
   callable. By `Guillaume Lemaitre`_.
 
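Note on the changelog entries above: they introduce the dict-based ``ratio`` for :func:`datasets.make_imbalance` and deprecate the float/``min_c_`` form. A minimal usage sketch of the new API, based only on the updated signature and the doctest added in the diff below (the printed class counts come from that doctest):

from collections import Counter

from sklearn.datasets import load_iris
from imblearn.datasets import make_imbalance

data = load_iris()
X, y = data.data, data.target

# New API: request an explicit number of samples per class.
X_res, y_res = make_imbalance(X, y, ratio={0: 10, 1: 20, 2: 30},
                              random_state=42)
print(Counter(y_res))  # Counter({2: 30, 1: 20, 0: 10})

# Deprecated float form: still accepted in 0.2/0.3 but raises a
# DeprecationWarning, as asserted by the new tests further down.
X_old, y_old = make_imbalance(X, y, ratio=0.5, min_c_=1)

The float call keeps working only through the back-compatibility branch added to ``imblearn/datasets/imbalance.py`` below.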
diff --git a/imblearn/base.py b/imblearn/base.py
index a22c8e0e0..af3d0536d 100644
--- a/imblearn/base.py
+++ b/imblearn/base.py
@@ -11,7 +11,7 @@
 
 from sklearn.base import BaseEstimator
 from sklearn.externals import six
-from sklearn.utils import check_X_y, check_random_state
+from sklearn.utils import check_X_y
 from sklearn.utils.validation import check_is_fitted
 
 from .utils import check_ratio, check_target_type, hash_X_y
diff --git a/imblearn/datasets/imbalance.py b/imblearn/datasets/imbalance.py
index 6c0a9f8e1..ca52e500d 100644
--- a/imblearn/datasets/imbalance.py
+++ b/imblearn/datasets/imbalance.py
@@ -7,10 +7,14 @@
 # License: MIT
 
 import logging
+import warnings
 from collections import Counter
+from numbers import Real
 
-import numpy as np
-from sklearn.utils import check_random_state, check_X_y
+from sklearn.utils import check_X_y
+
+from ..under_sampling.prototype_selection import RandomUnderSampler
+from ..utils import check_ratio
 
 LOGGER = logging.getLogger(__name__)
 
@@ -28,14 +32,23 @@ def make_imbalance(X, y, ratio, min_c_=None, random_state=None):
     y : ndarray, shape (n_samples, )
         Corresponding label for each sample in X.
 
-    ratio : float,
-        The desired ratio given by the number of samples in
-        the minority class over the the number of samples in
-        the majority class. Thus the ratio should be in the interval [0., 1.]
+    ratio : dict or callable
+        Ratio to use for resampling the data set.
+
+        - If ``dict``, the keys correspond to the targeted classes. The values
+          correspond to the desired number of samples.
+        - If callable, a function taking ``y`` and returning a ``dict``. The
+          keys correspond to the targeted classes and the values to the
+          desired number of samples.
 
     min_c_ : str or int, optional (default=None)
         The identifier of the class to be the minority class.
-        If None, min_c_ is set to be the current minority class.
+        If ``None``, ``min_c_`` is set to be the current minority class.
+        Only used when ``ratio`` is a float, kept for backward compatibility.
+
+        .. deprecated:: 0.2
+           ``min_c_`` is deprecated in 0.2 and will be removed in 0.4. Use
+           ``ratio`` by passing a ``dict`` instead.
 
     random_state : int, RandomState instance or None, optional (default=None)
         If int, random_state is the seed used by the random number generator;
@@ -51,48 +64,57 @@ def make_imbalance(X, y, ratio, min_c_=None, random_state=None):
     y_resampled : ndarray, shape (n_samples_new)
         The corresponding label of `X_resampled`
 
-    """
-    if isinstance(ratio, float):
-        if ratio > 1:
-            raise ValueError('Ratio cannot be greater than one.'
-                             ' Got {}.'.format(ratio))
-        elif ratio <= 0:
-            raise ValueError('Ratio have to be strictly positive.'
-                             ' Got {}.'.format(ratio))
-    else:
-        raise ValueError('Ratio must be a float between 0.0 < ratio < 1.0'
-                         ' Got {} instead.'.format(ratio))
+    Examples
+    --------
+    >>> from collections import Counter
+    >>> from sklearn.datasets import load_iris
+    >>> from imblearn.datasets import make_imbalance
+
+    >>> data = load_iris()
+    >>> X, y = data.data, data.target
+    >>> print('Distribution before imbalancing: {}'.format(Counter(y)))
+    Distribution before imbalancing: Counter({0: 50, 1: 50, 2: 50})
+    >>> X_res, y_res = make_imbalance(X, y, ratio={0: 10, 1: 20, 2: 30},
+    ...                               random_state=42)
+    >>> print('Distribution after imbalancing: {}'.format(Counter(y_res)))
+    Distribution after imbalancing: Counter({2: 30, 1: 20, 0: 10})
+    """
 
     X, y = check_X_y(X, y)
-
-    random_state = check_random_state(random_state)
-
-    stats_c_ = Counter(y)
+    target_stats = Counter(y)
+    # restrict ratio to be a dict or a callable
+    if isinstance(ratio, dict) or callable(ratio):
+        ratio_ = check_ratio(ratio, y, 'under-sampling')
+    # FIXME: deprecated in 0.2 to be removed in 0.4
+    elif isinstance(ratio, Real):
+        if min_c_ is None:
+            min_c_ = min(target_stats, key=target_stats.get)
+        else:
+            warnings.warn("'min_c_' is deprecated in 0.2 and will be removed"
+                          " in 0.4. Use 'ratio' as dictionary instead.",
+                          DeprecationWarning)
+        warnings.warn("'ratio' being a float is deprecated in 0.2 and will not"
+                      " be supported in 0.4. Use a dictionary instead.",
+                      DeprecationWarning)
+        class_majority = max(target_stats, key=target_stats.get)
+        ratio_ = {}
+        for class_sample, n_sample in target_stats.items():
+            if class_sample == min_c_:
+                n_min_samples = int(target_stats[class_majority] * ratio)
+                ratio_[class_sample] = n_min_samples
+            else:
+                ratio_[class_sample] = n_sample
+        ratio_ = check_ratio(ratio_, y, 'under-sampling')
+    else:
+        raise ValueError("'ratio' has to be a dictionary or a function"
+                         " returning a dictionary. Got {} instead.".format(
+                             type(ratio)))
 
     LOGGER.info('The original target distribution in the dataset is: %s',
-                stats_c_)
-
-    if min_c_ is None:
-        min_c_ = min(stats_c_, key=stats_c_.get)
-
-    n_min_samples = int(np.count_nonzero(y != min_c_) * ratio)
-    if n_min_samples > stats_c_[min_c_]:
-        raise ValueError('Current imbalance ratio of data is lower than'
-                         ' desired ratio! Got {} > {}.'.format(
-                             n_min_samples, stats_c_[min_c_]))
-    if n_min_samples == 0:
-        raise ValueError('Not enough samples for desired ratio!'
-                         ' Got {}.'.format(n_min_samples))
-
-    mask = y == min_c_
-
-    idx_maj = np.where(~mask)[0]
-    idx_min = np.where(mask)[0]
-    idx_min = random_state.choice(idx_min, size=n_min_samples, replace=False)
-    idx = np.concatenate((idx_min, idx_maj), axis=0)
-
-    X_resampled, y_resampled = X[idx, :], y[idx]
-
+                target_stats)
+    rus = RandomUnderSampler(ratio=ratio_, replacement=False,
+                             random_state=random_state)
+    X_resampled, y_resampled = rus.fit_sample(X, y)
     LOGGER.info('Make the dataset imbalanced: %s', Counter(y_resampled))
 
     return X_resampled, y_resampled
diff --git a/imblearn/datasets/tests/test_imbalance.py b/imblearn/datasets/tests/test_imbalance.py
new file mode 100644
index 000000000..f0cd420c1
--- /dev/null
+++ b/imblearn/datasets/tests/test_imbalance.py
@@ -0,0 +1,62 @@
+"""Test the make_imbalance function."""
+# Authors: Guillaume Lemaitre
+#          Christos Aridas
+# License: MIT
+
+
+from __future__ import print_function
+
+from collections import Counter
+
+import numpy as np
+
+from sklearn.datasets import load_iris
+from sklearn.utils.testing import (assert_equal, assert_raises_regex,
+                                   assert_warns_message)
+
+from imblearn.datasets import make_imbalance
+
+data = load_iris()
+X, Y = data.data, data.target
+
+
+def test_make_imbalance_error():
+    # we are reusing part of utils.check_ratio; however, this is not covered
+    # in the common tests so we repeat it here
+    ratio = {0: -100, 1: 50, 2: 50}
+    assert_raises_regex(ValueError, "in a class cannot be negative",
+                        make_imbalance, X, Y, ratio)
+    ratio = {0: 10, 1: 70}
+    assert_raises_regex(ValueError, "should be less or equal to the original",
+                        make_imbalance, X, Y, ratio)
+    y_ = np.zeros((X.shape[0], ))
+    ratio = {0: 10}
+    assert_raises_regex(ValueError, "needs to have more than 1 class.",
+                        make_imbalance, X, y_, ratio)
+    ratio = 'random-string'
+    assert_raises_regex(ValueError, "has to be a dictionary or a function",
+                        make_imbalance, X, Y, ratio)
+
+
+# FIXME: to be removed in 0.4 due to deprecation
+def test_make_imbalance_float():
+    X_, y_ = assert_warns_message(DeprecationWarning,
+                                  "'min_c_' is deprecated in 0.2",
+                                  make_imbalance, X, Y, ratio=0.5, min_c_=1)
+    X_, y_ = assert_warns_message(DeprecationWarning,
+                                  "'ratio' being a float is deprecated",
+                                  make_imbalance, X, Y, ratio=0.5, min_c_=1)
+    assert_equal(Counter(y_), {0: 50, 1: 25, 2: 50})
+    # resample without using min_c_
+    X_, y_ = make_imbalance(X_, y_, ratio=0.25, min_c_=None)
+    assert_equal(Counter(y_), {0: 50, 1: 12, 2: 50})
+
+
+def test_make_imbalance_dict():
+    ratio = {0: 10, 1: 20, 2: 30}
+    X_, y_ = make_imbalance(X, Y, ratio=ratio)
+    assert_equal(Counter(y_), ratio)
+
+    ratio = {0: 10, 1: 20}
+    X_, y_ = make_imbalance(X, Y, ratio=ratio)
+    assert_equal(Counter(y_), {0: 10, 1: 20, 2: 50})
diff --git a/imblearn/datasets/tests/test_make_imbalance.py b/imblearn/datasets/tests/test_make_imbalance.py
deleted file mode 100644
index f94fd51e1..000000000
--- a/imblearn/datasets/tests/test_make_imbalance.py
+++ /dev/null
@@ -1,115 +0,0 @@
-"""Test the module easy ensemble."""
-# Authors: Guillaume Lemaitre
-#          Christos Aridas
-# License: MIT
-
-
-from __future__ import print_function
-
-from collections import Counter
-
-import numpy as np
-from sklearn.utils.testing import (assert_true, assert_equal,
-                                   assert_raises_regex)
-
-from imblearn.datasets import make_imbalance
-
-# Generate a global dataset to use
-X = np.random.random((1000, 2))
-Y = np.zeros(1000)
-Y[500:] = 1
-
-
-def test_make_imbalance_bad_ratio():
-    min_c_ = 1
-
-    # Define a zero ratio
-    ratio = 0.0
assert_raises_regex(ValueError, "Ratio have to be strictly positive", - make_imbalance, X, Y, ratio, min_c_) - - # Define a negative ratio - ratio = -2.0 - assert_raises_regex(ValueError, "Ratio have to be strictly positive", - make_imbalance, X, Y, ratio, min_c_) - - # Define a ratio greater than 1 - ratio = 2.0 - assert_raises_regex(ValueError, "Ratio cannot be greater than one", - make_imbalance, X, Y, ratio, min_c_) - - # Define ratio as a list which is not supported - ratio = [.5, .5] - assert_raises_regex(ValueError, "Ratio must be a float between", - make_imbalance, X, Y, ratio, min_c_) - - -def test_make_imbalance_invalid_ratio(): - y_ = np.zeros((X.shape[0], )) - y_[0] = 1 - - ratio = 0.5 - assert_raises_regex(ValueError, "Current imbalance ratio of data", - make_imbalance, X, y_, ratio) - - -def test_make_imbalance_single_class(): - y_ = np.zeros((X.shape[0], )) - ratio = 0.5 - assert_raises_regex(ValueError, "Not enough samples for desired ratio!", - make_imbalance, X, y_, ratio) - - -def test_make_imbalance_1(): - X_, y_ = make_imbalance(X, Y, ratio=0.5, min_c_=1) - counter = Counter(y_) - assert_equal(counter[0], 500) - assert_equal(counter[1], 250) - assert_true(np.all([X_i in X for X_i in X_])) - - -def test_make_imbalance_2(): - X_, y_ = make_imbalance(X, Y, ratio=0.25, min_c_=1) - counter = Counter(y_) - assert_equal(counter[0], 500) - assert_equal(counter[1], 125) - assert_true(np.all([X_i in X for X_i in X_])) - - -def test_make_imbalance_3(): - X_, y_ = make_imbalance(X, Y, ratio=0.1, min_c_=1) - counter = Counter(y_) - assert_equal(counter[0], 500) - assert_equal(counter[1], 50) - assert_true(np.all([X_i in X for X_i in X_])) - - -def test_make_imbalance_4(): - X_, y_ = make_imbalance(X, Y, ratio=0.01, min_c_=1) - counter = Counter(y_) - assert_equal(counter[0], 500) - assert_equal(counter[1], 5) - assert_true(np.all([X_i in X for X_i in X_])) - - -def test_make_imbalance_5(): - X_, y_ = make_imbalance(X, Y, ratio=0.01, min_c_=0) - counter = Counter(y_) - assert_equal(counter[1], 500) - assert_equal(counter[0], 5) - assert_true(np.all([X_i in X for X_i in X_])) - - -def test_make_imbalance_multiclass(): - # Make y to be multiclass - y_ = np.zeros(1000) - y_[100:500] = 1 - y_[500:] = 2 - - # Resample the data - X_, y_ = make_imbalance(X, y_, ratio=0.1, min_c_=0) - counter = Counter(y_) - assert_equal(counter[0], 90) - assert_equal(counter[1], 400) - assert_equal(counter[2], 500) - assert_true(np.all([X_i in X for X_i in X_])) diff --git a/imblearn/under_sampling/prototype_selection/random_under_sampler.py b/imblearn/under_sampling/prototype_selection/random_under_sampler.py index 9817d4db0..718b7675b 100644 --- a/imblearn/under_sampling/prototype_selection/random_under_sampler.py +++ b/imblearn/under_sampling/prototype_selection/random_under_sampler.py @@ -48,7 +48,7 @@ class RandomUnderSampler(BaseUnderSampler): ``RandomState`` instance used by ``np.random``. replacement : boolean, optional (default=False) - Whether the sample is with (default) or without replacement. + Whether the sample is with or without replacement. 
 
     Notes
     -----
diff --git a/imblearn/utils/tests/test_validation.py b/imblearn/utils/tests/test_validation.py
index a17337de1..eb9b6272e 100644
--- a/imblearn/utils/tests/test_validation.py
+++ b/imblearn/utils/tests/test_validation.py
@@ -99,6 +99,9 @@ def test_ratio_minority_under_sampling():
 
 def test_ratio_dict_error():
     y = np.array([1] * 50 + [2] * 100 + [3] * 25)
+    ratio = {1: -100, 2: 50, 3: 25}
+    assert_raises_regex(ValueError, "in a class cannot be negative.",
+                        check_ratio, ratio, y, 'under-sampling')
     ratio = {10: 10}
     assert_raises_regex(ValueError, "are not present in the data.",
                         check_ratio, ratio, y, 'over-sampling')
diff --git a/imblearn/utils/validation.py b/imblearn/utils/validation.py
index 002c05abf..04571ad1a 100644
--- a/imblearn/utils/validation.py
+++ b/imblearn/utils/validation.py
@@ -190,6 +190,11 @@ def _ratio_dict(ratio, y, sampling_type):
     if len(set_diff_ratio_target) > 0:
         raise ValueError("The {} target class is/are not present in the"
                          " data.".format(set_diff_ratio_target))
+    # check that there is no negative number
+    if any(n_samples < 0 for n_samples in ratio.values()):
+        raise ValueError("The number of samples in a class cannot be negative."
+                         " 'ratio' contains some negative value: {}".format(
+                             ratio))
     ratio_ = {}
     if sampling_type == 'over-sampling':
         n_samples_majority = max(target_stats.values())
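Note on the last hunk: the negative-count guard added to ``_ratio_dict`` is what the new ``check_ratio`` test above exercises, and it is also what :func:`datasets.make_imbalance` now relies on. A short sketch of that behaviour, assuming ``check_ratio`` is importable from ``imblearn.utils`` (as it is in ``imblearn/base.py`` in this diff):

import numpy as np

from imblearn.utils import check_ratio

y = np.array([1] * 50 + [2] * 100 + [3] * 25)

# Requesting at most the available number of samples per class is accepted.
check_ratio({1: 25, 2: 50, 3: 25}, y, 'under-sampling')

# A negative count now hits the ValueError added in _ratio_dict above.
try:
    check_ratio({1: -100, 2: 50, 3: 25}, y, 'under-sampling')
except ValueError as exc:
    print(exc)  # message contains "in a class cannot be negative"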