From dae3ba387dc8b04373a59a6326e5fbd24f983250 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Thu, 1 Mar 2018 02:03:46 +0100
Subject: [PATCH 01/50] EHN accept one-vs-all targets

---
 imblearn/__init__.py | 3 +++
 imblearn/base.py | 19 ++++++++++++++++---
 imblearn/utils/validation.py | 28 +++++++++++++++++++++-------
 3 files changed, 40 insertions(+), 10 deletions(-)

diff --git a/imblearn/__init__.py b/imblearn/__init__.py
index 9f05adb1f..7803ca016 100644
--- a/imblearn/__init__.py
+++ b/imblearn/__init__.py
@@ -13,6 +13,9 @@
 exceptions
     Module including custom warnings and error classes used across
     imbalanced-learn.
+keras
+    Module which provides custom generators and layers for deep learning
+    using keras.
 metrics
     Module which provides metrics to quantify the classification performance
     with imbalanced datasets.
diff --git a/imblearn/base.py b/imblearn/base.py
index aa12eb365..d352d9696 100644
--- a/imblearn/base.py
+++ b/imblearn/base.py
@@ -9,9 +9,13 @@
 import logging
 from abc import ABCMeta, abstractmethod

+import numpy as np
+
 from sklearn.base import BaseEstimator
 from sklearn.externals import six
+from sklearn.preprocessing import label_binarize
 from sklearn.utils import check_X_y
+from sklearn.utils.multiclass import type_of_target
 from sklearn.utils.validation import check_is_fitted

 from .utils import check_ratio, check_target_type, hash_X_y
@@ -54,14 +58,23 @@ def sample(self, X, y):
             The corresponding label of `X_resampled`

         """
-        # Check the consistency of X and y
+        y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
         X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])

         check_is_fitted(self, 'ratio_')
         self._check_X_y(X, y)

-        return self._sample(X, y)
+        output = self._sample(X, y)
+
+        if binarize_y:
+            y_sampled = label_binarize(output[1], np.unique(y))
+            if len(output) == 2:
+                return output[0], y_sampled
+            else:
+                return output[0], y_sampled, output[2]
+        else:
+            return output

     def fit_sample(self, X, y):
         """Fit the statistics and resample the data directly.
@@ -152,8 +165,8 @@ def fit(self, X, y):
             Return self.

         """
-        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
         y = check_target_type(y)
+        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
         self.X_hash_, self.y_hash_ = hash_X_y(X, y)
         # self.sampling_type is already checked in check_ratio
         self.ratio_ = check_ratio(self.ratio, y, self._sampling_type)
diff --git a/imblearn/utils/validation.py b/imblearn/utils/validation.py
index d009dacbe..17fa7e93a 100644
--- a/imblearn/utils/validation.py
+++ b/imblearn/utils/validation.py
@@ -10,6 +10,7 @@

 import numpy as np

+from sklearn.preprocessing import label_binarize
 from sklearn.neighbors.base import KNeighborsMixin
 from sklearn.neighbors import NearestNeighbors
 from sklearn.externals import six, joblib
@@ -19,7 +20,7 @@

 SAMPLING_KIND = ('over-sampling', 'under-sampling', 'clean-sampling',
                  'ensemble')
-TARGET_KIND = ('binary', 'multiclass')
+TARGET_KIND = ('binary', 'multiclass', 'multilabel-indicator')


 def check_neighbors_object(nn_name, nn_object, additional_neighbor=0):

     raise_isinstance_error(nn_name, [int, KNeighborsMixin], nn_object)


-def check_target_type(y):
+def check_target_type(y, indicate_one_vs_all=False):
     """Check the target types to conform to the current samplers.

-    The current samplers should be compatible with ``'binary'`` and
-    ``'multiclass'`` targets only.
+    The current samplers should be compatible with ``'binary'``,
+    ``'multilabel-indicator'`` and ``'multiclass'`` targets only.

     Parameters
     ----------
     y : ndarray,
-        The array containing the target
+        The array containing the target.
+
+    indicate_one_vs_all : bool, optional
+        Whether to indicate if the targets are encoded in a one-vs-all fashion.

     Returns
     -------
     y : ndarray,
         The returned target.

+    is_one_vs_all : bool, optional
+        Indicate if the target was originally encoded in a one-vs-all fashion.
+        Only returned if ``indicate_one_vs_all=True``.
+
     """
-    if type_of_target(y) not in TARGET_KIND:
+    type_y = type_of_target(y)
+    if type_y not in TARGET_KIND:
         # FIXME: ideally we should raise an error but the sklearn API does
         # not allow for it
         warnings.warn("'y' should be of types {} only. Got {} instead.".format(
             TARGET_KIND, type_of_target(y)))
-    return y
+
+    if indicate_one_vs_all:
+        return (y.argmax(axis=1) if type_y == 'multilabel-indicator' else y,
+                type_y == 'multilabel-indicator')
+    else:
+        return y.argmax(axis=1) if type_y == 'multilabel-indicator' else y


 def hash_X_y(X, y, n_samples=10, n_features=5):

From 7487ce48d7037823af4253064e8a7460b9594c50 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Thu, 1 Mar 2018 02:29:49 +0100
Subject: [PATCH 02/50] TST add test for check_target_type

---
 imblearn/utils/tests/test_validation.py | 32 +++++++++++++++++++++++++
 imblearn/utils/validation.py | 1 -
 2 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/imblearn/utils/tests/test_validation.py b/imblearn/utils/tests/test_validation.py
index a1d8585df..49e9d6997 100644
--- a/imblearn/utils/tests/test_validation.py
+++ b/imblearn/utils/tests/test_validation.py
@@ -6,17 +6,20 @@
 from collections import Counter

 import numpy as np
+import pytest
 from pytest import raises

 from sklearn.neighbors.base import KNeighborsMixin
 from sklearn.neighbors import NearestNeighbors
 from sklearn.utils import check_random_state
 from sklearn.externals import joblib
+from sklearn.utils.testing import assert_array_equal

 from imblearn.utils.testing import warns
 from imblearn.utils import check_neighbors_object
 from imblearn.utils import check_ratio
 from imblearn.utils import hash_X_y
+from imblearn.utils import check_target_type


 def test_check_neighbors_object():

     check_neighbors_object(name, n_neighbors)


+@pytest.mark.parametrize(
+    "target, output_target",
+    [(np.array([0, 1, 1]), np.array([0, 1, 1])),
+     (np.array([0, 1, 2]), np.array([0, 1, 2])),
+     (np.array([[0, 1], [1, 0]]), np.array([1, 0]))]
+)
+def test_check_target_type(target, output_target):
+    converted_target = check_target_type(target.astype(int))
+    assert_array_equal(converted_target, output_target.astype(int))
+
+
+@pytest.mark.parametrize(
+    "target, output_target, is_ova",
+    [(np.array([0, 1, 1]), np.array([0, 1, 1]), False),
+     (np.array([0, 1, 2]), np.array([0, 1, 2]), False),
+     (np.array([[0, 1], [1, 0]]), np.array([1, 0]), True)]
+)
+def test_check_target_type_ova(target, output_target, is_ova):
+    converted_target, binarize_target = check_target_type(
+        target.astype(int), indicate_one_vs_all=True)
+    assert_array_equal(converted_target, output_target.astype(int))
+    assert binarize_target == is_ova
+
+
+def test_check_target_warning():
+    target = np.arange(4).reshape((2, 2))
+    with pytest.warns(UserWarning, message='should be of types'):
+        check_target_type(target)
+
 def test_check_ratio_error():
     with raises(ValueError, match="'sampling_type' should be one of"):
         check_ratio('auto', np.array([1, 2, 3]), 'rnd')
diff --git a/imblearn/utils/validation.py b/imblearn/utils/validation.py
index 17fa7e93a..58488463a 100644
--- a/imblearn/utils/validation.py
+++ b/imblearn/utils/validation.py
@@ -10,7 +10,6 @@

 import numpy as np

-from sklearn.preprocessing import label_binarize
 from sklearn.neighbors.base import KNeighborsMixin
 from sklearn.neighbors import NearestNeighbors
 from sklearn.externals import six, joblib

From 05ae2e6ff833f033086b31b4548545f0f31c4157 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Thu, 1 Mar 2018 02:30:45 +0100
Subject: [PATCH 03/50] PEP8

---
 imblearn/utils/tests/test_validation.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/imblearn/utils/tests/test_validation.py b/imblearn/utils/tests/test_validation.py
index 49e9d6997..64a23eb4a 100644
--- a/imblearn/utils/tests/test_validation.py
+++ b/imblearn/utils/tests/test_validation.py
@@ -67,6 +67,7 @@ def test_check_target_warning():
     with pytest.warns(UserWarning, message='should be of types'):
         check_target_type(target)

+
 def test_check_ratio_error():
     with raises(ValueError, match="'sampling_type' should be one of"):
         check_ratio('auto', np.array([1, 2, 3]), 'rnd')

From 05b7d65798e1baad01316f3e1b9f1913f7bf7cad Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Thu, 1 Mar 2018 02:33:55 +0100
Subject: [PATCH 04/50] TST fix pytests match warns

---
 imblearn/utils/tests/test_validation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/imblearn/utils/tests/test_validation.py b/imblearn/utils/tests/test_validation.py
index 64a23eb4a..bed62617d 100644
--- a/imblearn/utils/tests/test_validation.py
+++ b/imblearn/utils/tests/test_validation.py
@@ -64,7 +64,7 @@ def test_check_target_type_ova(target, output_target, is_ova):

 def test_check_target_warning():
     target = np.arange(4).reshape((2, 2))
-    with pytest.warns(UserWarning, message='should be of types'):
+    with pytest.warns(UserWarning, match='should be of types'):
         check_target_type(target)


From 1a27e3e80b6020fe63dfc922821a86adac36a5c7 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Thu, 1 Mar 2018 03:26:24 +0100
Subject: [PATCH 05/50] TST common test to check multiclass ova equality

---
 imblearn/base.py | 29 +++++++++++++++------------
 imblearn/combine/smote_enn.py | 2 +-
 imblearn/combine/smote_tomek.py | 3 +--
 imblearn/ensemble/balance_cascade.py | 3 ++-
 imblearn/utils/estimator_checks.py | 18 +++++++++++++++++
 5 files changed, 38 insertions(+), 17 deletions(-)

diff --git a/imblearn/base.py b/imblearn/base.py
index d352d9696..a44831c0b 100644
--- a/imblearn/base.py
+++ b/imblearn/base.py
@@ -15,7 +15,6 @@
 from sklearn.externals import six
 from sklearn.preprocessing import label_binarize
 from sklearn.utils import check_X_y
-from sklearn.utils.multiclass import type_of_target
 from sklearn.utils.validation import check_is_fitted

 from .utils import check_ratio, check_target_type, hash_X_y
@@ -245,17 +244,10 @@ def __init__(self, func=None, accept_sparse=True, kw_args=None):
         self.kw_args = kw_args
         self.logger = logging.getLogger(__name__)

-    def _check_X_y(self, X, y):
-        if self.accept_sparse:
-            X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
-        else:
-            X, y = check_X_y(X, y, accept_sparse=False)
-        y = check_target_type(y)
-
-        return X, y
-
     def fit(self, X, y):
-        X, y = self._check_X_y(X, y)
+        y = check_target_type(y)
+        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']
+                         if self.accept_sparse else False)
         self.X_hash_, self.y_hash_ = hash_X_y(X, y)
         # when using a sampler, ratio_ is supposed to exist after fit
         self.ratio_ = 'is_fitted'
@@ -263,7 +255,9 @@ def fit(self, X, y):
         return self

     def _sample(self, X, y, func=None, kw_args=None):
-        X, y = self._check_X_y(X, y)
+        y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
+        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']
+                         if self.accept_sparse else False)
         check_is_fitted(self, 'ratio_')
         X_hash, y_hash = hash_X_y(X, y)
         if self.X_hash_ != X_hash or self.y_hash_ != y_hash:
@@ -272,7 +266,16 @@
         if func is None:
             func = _identity

-        return func(X, y, **(kw_args if self.kw_args else {}))
+        output = func(X, y, **(kw_args if self.kw_args else {}))
+
+        if binarize_y:
+            y_sampled = label_binarize(output[1], np.unique(y))
+            if len(output) == 2:
+                return output[0], y_sampled
+            else:
+                return output[0], y_sampled, output[2]
+        else:
+            return output

     def sample(self, X, y):
         return self._sample(X, y, func=self.func, kw_args=self.kw_args)
diff --git a/imblearn/combine/smote_enn.py b/imblearn/combine/smote_enn.py
index 74420472b..470919878 100644
--- a/imblearn/combine/smote_enn.py
+++ b/imblearn/combine/smote_enn.py
@@ -144,8 +144,8 @@ def fit(self, X, y):
             Return self.

         """
-        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
         y = check_target_type(y)
+        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
         self.ratio_ = self.ratio
         self.X_hash_, self.y_hash_ = hash_X_y(X, y)

diff --git a/imblearn/combine/smote_tomek.py b/imblearn/combine/smote_tomek.py
index b48e6510a..0748e6ef7 100644
--- a/imblearn/combine/smote_tomek.py
+++ b/imblearn/combine/smote_tomek.py
@@ -8,7 +8,6 @@
 from __future__ import division

 import logging
-import warnings

 from sklearn.utils import check_X_y

@@ -153,8 +152,8 @@ def fit(self, X, y):
             Return self.

         """
-        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
         y = check_target_type(y)
+        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
         self.ratio_ = self.ratio
         self.X_hash_, self.y_hash_ = hash_X_y(X, y)

diff --git a/imblearn/ensemble/balance_cascade.py b/imblearn/ensemble/balance_cascade.py
index bc6a06c6f..6668209ea 100644
--- a/imblearn/ensemble/balance_cascade.py
+++ b/imblearn/ensemble/balance_cascade.py
@@ -14,7 +14,7 @@
 from sklearn.model_selection import cross_val_predict

 from .base import BaseEnsembleSampler
-from ..utils import check_ratio
+from ..utils import check_ratio, check_target_type


 class BalanceCascade(BaseEnsembleSampler):
@@ -137,6 +137,7 @@ def fit(self, X, y):

         """
         super(BalanceCascade, self).fit(X, y)
+        y = check_target_type(y)
         self.ratio_ = check_ratio(self.ratio, y, 'under-sampling')
         return self

diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py
index 2184b4b12..bd37e408a 100644
--- a/imblearn/utils/estimator_checks.py
+++ b/imblearn/utils/estimator_checks.py
@@ -19,11 +19,13 @@

 from sklearn.datasets import make_classification
 from sklearn.cluster import KMeans
+from sklearn.preprocessing import label_binarize

 from sklearn.utils.estimator_checks import check_estimator \
     as sklearn_check_estimator, check_parameters_default_constructible
 from sklearn.exceptions import NotFittedError
 from sklearn.utils.testing import assert_allclose
 from sklearn.utils.testing import set_random_state
+from sklearn.utils.multiclass import type_of_target

 from imblearn.over_sampling.base import BaseOverSampler
 from imblearn.under_sampling.base import BaseCleaningSampler, BaseUnderSampler
@@ -44,6 +46,7 @@ def _yield_sampler_checks(name, Estimator):
     yield check_samplers_ratio_fit_sample
     yield check_samplers_sparse
     yield check_samplers_pandas
+    yield check_samplers_multiclass_ova


 def _yield_all_checks(name, estimator):
@@ -253,3 +256,18 @@ def check_samplers_pandas(name, Sampler):
     X_res, y_res = sampler.fit_sample(X, y)
     assert_allclose(X_res_pd, X_res)
     assert_allclose(y_res_pd, y_res)
+
+
+def check_samplers_multiclass_ova(name, Sampler):
+    # Check that a multiclass target leads to the same results as OVA encoding
+    X, y = make_classification(n_samples=1000, n_classes=3,
+                               n_informative=4, weights=[0.2, 0.3, 0.5],
+                               random_state=0)
+    y_ova = label_binarize(y, np.unique(y))
+    sampler = Sampler()
+    set_random_state(sampler)
+    X_res, y_res = sampler.fit_sample(X, y)
+    X_res_ova, y_res_ova = sampler.fit_sample(X, y_ova)
+    assert_allclose(X_res, X_res_ova)
+    assert type_of_target(y_res_ova) == type_of_target(y_ova)
+    assert_allclose(y_res, y_res_ova.argmax(axis=1))

From 47bbbf41cb2be4748f962141f88c74d7227d7147 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Wed, 4 Apr 2018 17:07:07 +0200
Subject: [PATCH 06/50] Add keras module

---
 imblearn/keras/__init__.py | 6 ++++++
 imblearn/keras/generator.py | 5 +++++
 imblearn/keras/tests/__init__.py | 0
 3 files changed, 11 insertions(+)
 create mode 100644 imblearn/keras/__init__.py
 create mode 100644 imblearn/keras/generator.py
 create mode 100644 imblearn/keras/tests/__init__.py

diff --git a/imblearn/keras/__init__.py b/imblearn/keras/__init__.py
new file mode 100644
index 000000000..cf8949267
--- /dev/null
+++ b/imblearn/keras/__init__.py
@@ -0,0 +1,6 @@
+"""The :mod:`imblearn.keras` provides utilities to deal with imbalanced dataset
+in keras."""
+
+from .generator import balanced_batch_generator
+
+__all__ = ['balanced_batch_generator']
diff --git a/imblearn/keras/generator.py b/imblearn/keras/generator.py
new file mode 100644
index 000000000..13b3d5642
--- /dev/null
+++ b/imblearn/keras/generator.py
@@ -0,0 +1,5 @@
+"""Implement generators which will balance data."""
+
+
+def balanced_batch_generator():
+    pass
diff --git a/imblearn/keras/tests/__init__.py b/imblearn/keras/tests/__init__.py
new file mode 100644
index 000000000..e69de29bb

From e6a318706eb0fecbb36c690d912d9dd7dee0a839 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Thu, 5 Apr 2018 00:20:41 +0200
Subject: [PATCH 07/50] TST for generator class

---
 imblearn/keras/__init__.py | 6 ++-
 imblearn/keras/generator.py | 71 ++++++++++++++++++++++++++++--
 imblearn/keras/tests/test_keras.py | 22 +++++++++
 3 files changed, 94 insertions(+), 5 deletions(-)
 create mode 100644 imblearn/keras/tests/test_keras.py

diff --git a/imblearn/keras/__init__.py b/imblearn/keras/__init__.py
index cf8949267..a605034c0 100644
--- a/imblearn/keras/__init__.py
+++ b/imblearn/keras/__init__.py
@@ -1,6 +1,8 @@
 """The :mod:`imblearn.keras` provides utilities to deal with imbalanced dataset
 in keras."""

-from .generator import balanced_batch_generator
+from .generator import BalancedBatchGenerator
+# from .generator import balanced_batch_generator

-__all__ = ['balanced_batch_generator']
+__all__ = ['BalancedBatchGenerator'] #,
+          # 'balanced_batch_generator']
diff --git a/imblearn/keras/generator.py b/imblearn/keras/generator.py
index 13b3d5642..ac3288f2b 100644
--- a/imblearn/keras/generator.py
+++ b/imblearn/keras/generator.py
@@ -1,5 +1,70 @@
-"""Implement generators which will balance data."""
+"""Implement generators for ``keras`` which will balance the data."""

+import keras

-def balanced_batch_generator():
-    pass
+from sklearn.base import clone
+from sklearn.utils import safe_indexing
+
+from ..under_sampling import RandomUnderSampler
+
+
+class BalancedBatchGenerator(keras.utils.Sequence):
+    """
+
+    """
+    def __init__(self, X, y, sampler=None, batch_size=64, stratify=True):
+        self.X = X
+        self.y = y
+        self.sampler = sampler
+        self.batch_size = batch_size
+        self.stratify = stratify
+        self._sample()
+
+    def _sample(self):
+        if self.sampler is None:
+            self.sampler_ = RandomUnderSampler(return_indices=True)
+        else:
+            if not hasattr(self.sampler, 'return_indices'):
+                raise ValueError("'sampler' needs to return the indices of "
+                                 "the samples selected. Provide a sampler "
+                                 "which has an attribute 'return_indices'.")
+            self.sampler_ = clone(self.sampler)
+            self.sampler_.set_params(return_indices=True)
+
+        _, _, self.indices_ = self.sampler_.fit_sample(self.X, self.y)
+
+    def __len__(self):
+        return int(self.indices_.size // self.batch_size)
+
+    def __getitem__(self, index):
+        return (safe_indexing(self.X,
+                              self.indices_[index * self.batch_size:
+                                            (index + 1) * self.batch_size]),
+                safe_indexing(self.y,
+                              self.indices_[index * self.batch_size:
+                                            (index + 1) * self.batch_size]))
+
+
+# def balanced_batch_generator(X, y, sampler=None, batch_size=64,
+#                              stratify=True):
+#     """Create a balanced batch generator which can be plugged in
+#     ``keras.fit_generator``.

+#     Parameters
+#     ----------

+#     """
+#     if sampler is None:
+#         sampler = RandomUnderSampler()
+#     else:
+#         if not hasattr(sampler, 'return_indices'):
+#             raise ValueError("'sampler' needs to return the indices of "
+#                              "the samples selected. Provide a sampler which "
+#                              "has an attribute 'return_indices'.")
+#     sampler.set_params(return_indices=True)

+#     def generator(X=X, y=y, indices=indices, batch_size=batch_size,
+#                   stratify=stratify):


+#     _, _, indices = sampler.fit_sample(X, y)
diff --git a/imblearn/keras/tests/test_keras.py b/imblearn/keras/tests/test_keras.py
new file mode 100644
index 000000000..bf741f504
--- /dev/null
+++ b/imblearn/keras/tests/test_keras.py
@@ -0,0 +1,22 @@
+from keras.models import Sequential
+from keras.layers import Dense
+from keras.utils import to_categorical
+
+from sklearn.datasets import load_iris
+
+from imblearn.keras import BalancedBatchGenerator
+
+
+iris = load_iris()
+X, y = iris.data, to_categorical(iris.target, 3)
+
+
+def test_balanced_batch_generator():
+    model = Sequential()
+    model.add(Dense(y.shape[1], input_dim=X.shape[1], activation='softmax'))
+    model.compile(optimizer='sgd', loss='categorical_crossentropy',
+                  metrics=['accuracy'])
+    training_generator = BalancedBatchGenerator(X, y)
+    model.fit_generator(generator=training_generator,
+                        epochs=10,
+                        verbose=10)

From 01492a688c3ac64ce6238200e550f98df318ef6f Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Thu, 5 Apr 2018 00:28:01 +0200
Subject: [PATCH 08/50] dependencies

---
 appveyor.yml | 3 ++-
 build_tools/circle/build_doc.sh | 2 +-
 build_tools/travis/install.sh | 3 ++-
 imblearn/keras/tests/test_keras.py | 2 +-
 4 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/appveyor.yml b/appveyor.yml
index a09272080..82b79b2da 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -32,7 +32,8 @@ install:
   # Add Library/bin directory to fix issue
   # https://github.com/conda/conda/issues/1753
   - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PYTHON%\\Library\\bin;%PATH%"
-  - conda install pip scipy numpy scikit-learn=0.19 pandas -y -q
+  - conda install pip scipy numpy scikit-learn=0.19 -y -q
+  - conda install pandas keras -y -q
   - conda install pytest pytest-cov -y -q
   - conda install nose -y -q  # FIXME: remove this line when using sklearn > 0.19
   - pip install .
diff --git a/build_tools/circle/build_doc.sh b/build_tools/circle/build_doc.sh
index e49088ae6..4b4ff915e 100755
--- a/build_tools/circle/build_doc.sh
+++ b/build_tools/circle/build_doc.sh
@@ -92,7 +92,7 @@ conda create -n $CONDA_ENV_NAME --yes --quiet python=3
 source activate $CONDA_ENV_NAME

 conda install --yes pip numpy scipy scikit-learn pillow matplotlib sphinx \
-      sphinx_rtd_theme numpydoc
+      sphinx_rtd_theme numpydoc pandas keras
 pip install -U git+https://github.com/sphinx-gallery/sphinx-gallery.git

 # Build and install imbalanced-learn in dev mode
diff --git a/build_tools/travis/install.sh b/build_tools/travis/install.sh
index 415e4ce5d..3e3b062e8 100755
--- a/build_tools/travis/install.sh
+++ b/build_tools/travis/install.sh
@@ -38,7 +38,8 @@ if [[ "$DISTRIB" == "conda" ]]; then
     # provided versions
     conda create -n testenv --yes python=$PYTHON_VERSION pip
     source activate testenv
-    conda install --yes numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION pandas
+    conda install --yes numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION
+    conds install --yes pandas keras

     if [[ "$SKLEARN_VERSION" == "master" ]]; then
         conda install --yes cython
diff --git a/imblearn/keras/tests/test_keras.py b/imblearn/keras/tests/test_keras.py
index bf741f504..eecf5c3a3 100644
--- a/imblearn/keras/tests/test_keras.py
+++ b/imblearn/keras/tests/test_keras.py
@@ -4,9 +4,9 @@

 from sklearn.datasets import load_iris

+from imblearn.datasets import make_imbalance
 from imblearn.keras import BalancedBatchGenerator

-
 iris = load_iris()
 X, y = iris.data, to_categorical(iris.target, 3)

From 54a0f033e30652a816d442b266dfb2808ce4144e Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Thu, 5 Apr 2018 00:31:55 +0200
Subject: [PATCH 09/50] update dependencies

---
 build_tools/travis/install.sh | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/build_tools/travis/install.sh b/build_tools/travis/install.sh
index 3e3b062e8..41743ef9f 100755
--- a/build_tools/travis/install.sh
+++ b/build_tools/travis/install.sh
@@ -60,8 +60,9 @@ elif [[ "$DISTRIB" == "ubuntu" ]]; then
     # Create a new virtualenv using system site packages for python, numpy
     virtualenv --system-site-packages testvenv
     source testvenv/bin/activate
-    pip install scikit-learn pandas nose nose-timer pytest pytest-cov codecov \
-        sphinx numpydoc
+    pip install scikit-learn
+    pip install pandas keras tensorflow
+    pip install nose nose-timer pytest pytest-cov codecov sphinx numpydoc

 fi

From 77e944d3a6db60650bdcded8918a2f188fd62574 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Thu, 5 Apr 2018 00:32:24 +0200
Subject: [PATCH 10/50] iter

---
 build_tools/travis/install.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/build_tools/travis/install.sh b/build_tools/travis/install.sh
index 41743ef9f..e094117cc 100755
--- a/build_tools/travis/install.sh
+++ b/build_tools/travis/install.sh
@@ -39,7 +39,7 @@ if [[ "$DISTRIB" == "conda" ]]; then
     conda create -n testenv --yes python=$PYTHON_VERSION pip
     source activate testenv
     conda install --yes numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION
-    conds install --yes pandas keras
+    conda install --yes pandas keras

     if [[ "$SKLEARN_VERSION" == "master" ]]; then
         conda install --yes cython

From 7e138336da3d42b2ea3049a212870beeeb87578d Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Fri, 6 Apr 2018 01:06:28 +0200
Subject: [PATCH 11/50] TST test the keras class

---
 appveyor.yml | 3 +-
 imblearn/keras/generator.py | 75 ++++++++++++++++++++++++++++--
 imblearn/keras/tests/test_keras.py | 41 +++++++++++++---
 3 files changed, 107 insertions(+), 12 deletions(-)

diff --git a/appveyor.yml b/appveyor.yml
index 82b79b2da..10cb95eea 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -33,7 +33,8 @@ install:
   # https://github.com/conda/conda/issues/1753
   - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PYTHON%\\Library\\bin;%PATH%"
   - conda install pip scipy numpy scikit-learn=0.19 -y -q
-  - conda install pandas keras -y -q
+  - conda install pandas -y -q
+  - conda install -c conda-forge keras -y -q
   - conda install pytest pytest-cov -y -q
   - conda install nose -y -q  # FIXME: remove this line when using sklearn > 0.19
   - pip install .
diff --git a/imblearn/keras/generator.py b/imblearn/keras/generator.py
index ac3288f2b..1e77d1ffb 100644
--- a/imblearn/keras/generator.py
+++ b/imblearn/keras/generator.py
@@ -1,28 +1,90 @@
 """Implement generators for ``keras`` which will balance the data."""

-import keras
+try:
+    import keras
+except ImportError:
+    # Skip the tests for the examples
+    import pytest
+    keras = pytest.importorskip('keras')
+    raise ImportError("To use the imblearn.keras module, you need to install "
+                      "keras.")

 from sklearn.base import clone
 from sklearn.utils import safe_indexing
+from sklearn.utils import check_random_state
+from sklearn.utils.testing import set_random_state

 from ..under_sampling import RandomUnderSampler


+# FIXME: add docstring for random_state using Substitution
 class BalancedBatchGenerator(keras.utils.Sequence):
-    """
+    """Create balanced batches when training a keras model.
+
+    Create a keras ``Sequence`` which is given to ``fit_generator``. The
+    sampler defines the sampling strategy used to balance the dataset ahead of
+    creating the batch. The sampler should have an attribute
+    ``return_indices``.
+
+    Parameters
+    ----------
+    X : ndarray, shape (n_samples, n_features)
+        Original imbalanced dataset.
+
+    y : ndarray, shape (n_samples,) or (n_samples, n_classes)
+        Associated targets.
+
+    sampler : object or None, optional (default=None)
+        A sampler instance which has an attribute ``return_indices``.
+
+    batch_size : int, optional (default=32)
+        Number of samples per gradient update.
+
+    {random_state}
+
+    Attributes
+    ----------
+    sampler_ : object
+        The sampler used to balance the dataset.
+
+    indices_ : ndarray, shape (n_samples, n_features)
+        The indices of the samples selected during sampling.
+
+    Examples
+    --------
+    >>> from sklearn.datasets import load_iris
+    >>> iris = load_iris()
+    >>> from imblearn.datasets import make_imbalance
+    >>> X, y = make_imbalance(iris.data, iris.target, {0: 30, 1: 50, 2: 40})
+    >>> y = keras.utils.to_categorical(y, 3)
+    >>> import keras
+    >>> model = keras.models.Sequential()
+    >>> model.add(keras.layers.Dense(y.shape[1], input_dim=X.shape[1],
+    ...           activation='softmax'))
+    >>> model.compile(optimizer='sgd', loss='categorical_crossentropy',
+    ...               metrics=['accuracy'])
+    >>> from imblearn.keras import BalancedBatchGenerator
+    >>> from imblearn.under_sampling import NearMiss
+    >>> training_generator = BalancedBatchGenerator(
+    ...     X, y, sampler=NearMiss(), batch_size=10, random_state=42)
+    >>> callback_history = model.fit_generator(generator=training_generator,
+    ...                                        epochs=10, verbose=0)

     """
-    def __init__(self, X, y, sampler=None, batch_size=64, stratify=True):
+    def __init__(self, X, y, sampler=None, batch_size=32, random_state=None):
         self.X = X
         self.y = y
         self.sampler = sampler
         self.batch_size = batch_size
-        self.stratify = stratify
+        self.random_state = random_state
         self._sample()

     def _sample(self):
+        random_state = check_random_state(self.random_state)
         if self.sampler is None:
-            self.sampler_ = RandomUnderSampler(return_indices=True)
+            self.sampler_ = RandomUnderSampler(return_indices=True,
+                                               random_state=random_state)
         else:
             if not hasattr(self.sampler, 'return_indices'):
                 raise ValueError("'sampler' needs to return the indices of "
                                  "the samples selected. Provide a sampler "
                                  "which has an attribute 'return_indices'.")
             self.sampler_ = clone(self.sampler)
             self.sampler_.set_params(return_indices=True)
+            set_random_state(self.sampler_, random_state)

         _, _, self.indices_ = self.sampler_.fit_sample(self.X, self.y)
+        # shuffle the indices since the sampler are packing them by class
+        random_state.shuffle(self.indices_)

     def __len__(self):
         return int(self.indices_.size // self.batch_size)
diff --git a/imblearn/keras/tests/test_keras.py b/imblearn/keras/tests/test_keras.py
index eecf5c3a3..2f789e8ec 100644
--- a/imblearn/keras/tests/test_keras.py
+++ b/imblearn/keras/tests/test_keras.py
@@ -1,3 +1,7 @@
+import pytest
+
+keras = pytest.importorskip('keras')
+
 from keras.models import Sequential
 from keras.layers import Dense
 from keras.utils import to_categorical
@@ -6,17 +10,42 @@
 from sklearn.datasets import load_iris

 from imblearn.datasets import make_imbalance
-from imblearn.keras import BalancedBatchGenerator
+from imblearn.keras import BalancedBatchGenerator
+from imblearn.under_sampling import ClusterCentroids
+from imblearn.under_sampling import NearMiss

 iris = load_iris()
-X, y = iris.data, to_categorical(iris.target, 3)
+X, y = make_imbalance(iris.data, iris.target, {0: 30, 1: 50, 2: 40})
+y = to_categorical(y, 3)


-def test_balanced_batch_generator():
+def _build_keras_model(n_classes, n_features):
     model = Sequential()
-    model.add(Dense(y.shape[1], input_dim=X.shape[1], activation='softmax'))
+    model.add(Dense(n_classes, input_dim=n_features, activation='softmax'))
     model.compile(optimizer='sgd', loss='categorical_crossentropy',
                   metrics=['accuracy'])
-    training_generator = BalancedBatchGenerator(X, y)
+    return model
+
+
+def test_balanced_batch_generator_class_no_return_indices():
+    model = _build_keras_model(y.shape[1], X.shape[1])
+    with pytest.raises(ValueError, match='needs to return the indices'):
+        training_generator = BalancedBatchGenerator(X, y,
+                                                    sampler=ClusterCentroids(),
+                                                    batch_size=10,
+                                                    random_state=42)
+        model.fit_generator(generator=training_generator,
+                            epochs=10)
+
+
+@pytest.mark.parametrize(
+    "sampler",
+    [None, NearMiss()]
+)
+def test_balanced_batch_generator_class(sampler):
+    model = _build_keras_model(y.shape[1], X.shape[1])
+    training_generator = BalancedBatchGenerator(X, y,
+                                                sampler=sampler,
+                                                batch_size=10,
+                                                random_state=42)
     model.fit_generator(generator=training_generator,
-                        epochs=10,
-                        verbose=10)
+                        epochs=10)

From 182b40837bcc8738eb2632e1984cbb315e468324 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Fri, 6 Apr 2018 09:13:19 +0200
Subject: [PATCH 12/50] optional dep windows

---
 appveyor.yml | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/appveyor.yml b/appveyor.yml
index 10cb95eea..ef06ad0ba 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -10,22 +10,27 @@ environment:
   - PYTHON: "C:\\Miniconda-x64"
     PYTHON_VERSION: "2.7.x"
     PYTHON_ARCH: "64"
+    OPTIONAL_DEP: "pandas"

   - PYTHON: "C:\\Miniconda"
     PYTHON_VERSION: "2.7.x"
     PYTHON_ARCH: "32"
+    OPTIONAL_DEP: "pandas"

   - PYTHON: "C:\\Miniconda35-x64"
     PYTHON_VERSION: "3.5.x"
     PYTHON_ARCH: "64"
+    OPTIONAL_DEP: "pandas keras tensorflow"

   - PYTHON: "C:\\Miniconda36-x64"
     PYTHON_VERSION: "3.6.x"
     PYTHON_ARCH: "64"
+    OPTIONAL_DEP: "pandas keras tensorflow"

   - PYTHON: "C:\\Miniconda36"
     PYTHON_VERSION: "3.6.x"
     PYTHON_ARCH: "32"
+    OPTIONAL_DEP: "pandas"

 install:
   # Prepend miniconda installed Python to the PATH of this build
   # https://github.com/conda/conda/issues/1753
   - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PYTHON%\\Library\\bin;%PATH%"
   - conda install pip scipy numpy scikit-learn=0.19 -y -q
-  - conda install pandas -y -q
-  - conda install -c conda-forge keras -y -q
+  - "conda install %OPTIONAL_DEP% -y -q"
   - conda install pytest pytest-cov -y -q
   - conda install nose -y -q  # FIXME: remove this line when using sklearn > 0.19
   - pip install .

From 68f8454312d81b3882f5dcd7055086c6cf3058de Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Fri, 6 Apr 2018 10:09:33 +0200
Subject: [PATCH 13/50] Add generator function

---
 imblearn/keras/__init__.py | 6 ++-
 imblearn/keras/generator.py | 135 +++++++++++++++---
 .../{test_keras.py => test_generator.py} | 40 +++++-
 3 files changed, 152 insertions(+), 29 deletions(-)
 rename imblearn/keras/tests/{test_keras.py => test_generator.py} (56%)

diff --git a/imblearn/keras/__init__.py b/imblearn/keras/__init__.py
index a605034c0..8acdd2b03 100644
--- a/imblearn/keras/__init__.py
+++ b/imblearn/keras/__init__.py
@@ -2,7 +2,7 @@ in keras."""

-from .generator import BalancedBatchGenerator
-# from .generator import balanced_batch_generator
+from .generator import BalancedBatchGenerator
+from .generator import balanced_batch_generator

-__all__ = ['BalancedBatchGenerator'] #,
-          # 'balanced_batch_generator']
+__all__ = ['BalancedBatchGenerator',
+           'balanced_batch_generator']
diff --git a/imblearn/keras/generator.py b/imblearn/keras/generator.py
index 1e77d1ffb..b49489ac3 100644
--- a/imblearn/keras/generator.py
+++ b/imblearn/keras/generator.py
@@ -1,4 +1,5 @@
 """Implement generators for ``keras`` which will balance the data."""
+from __future__ import division

 try:
     import keras
@@ -34,6 +35,9 @@ class BalancedBatchGenerator(keras.utils.Sequence):
     y : ndarray, shape (n_samples,) or (n_samples, n_classes)
         Associated targets.

+    sample_weight : ndarray, shape (n_samples,)
+        Sample weight.
+
     sampler : object or None, optional (default=None)
         A sampler instance which has an attribute ``return_indices``.
@@ -70,11 +74,12 @@ class BalancedBatchGenerator(keras.utils.Sequence):
     >>> callback_history = model.fit_generator(generator=training_generator,
     ...                                        epochs=10, verbose=0)

     """
-    def __init__(self, X, y, sampler=None, batch_size=32, random_state=None):
+    def __init__(self, X, y, sample_weight=None, sampler=None, batch_size=32,
+                 random_state=None):
         self.X = X
         self.y = y
+        self.sample_weight = sample_weight
         self.sampler = sampler
         self.batch_size = batch_size
         self.random_state = random_state
         self._sample()
@@ -102,34 +107,120 @@ def __len__(self):
         return int(self.indices_.size // self.batch_size)

     def __getitem__(self, index):
-        return (safe_indexing(self.X,
+        if self.sample_weight is None:
+            return (
+                safe_indexing(self.X,
+                              self.indices_[index * self.batch_size:
+                                            (index + 1) * self.batch_size]),
+                safe_indexing(self.y,
+                              self.indices_[index * self.batch_size:
+                                            (index + 1) * self.batch_size])
+            )
+        else:
+            return (
+                safe_indexing(self.X,
                               self.indices_[index * self.batch_size:
                                             (index + 1) * self.batch_size]),
                 safe_indexing(self.y,
                               self.indices_[index * self.batch_size:
-                                            (index + 1) * self.batch_size]))
+                                            (index + 1) * self.batch_size]),
+                safe_indexing(self.sample_weight,
+                              self.indices_[index * self.batch_size:
+                                            (index + 1) * self.batch_size])
+            )
+
+
+def balanced_batch_generator(X, y, sample_weight=None, sampler=None,
+                             batch_size=32, random_state=None):
+    """Create a balanced batch generator to train a keras model.
+
+    Returns a generator --- as well as the number of steps per epoch --- which
+    is given to ``fit_generator``. The sampler defines the sampling strategy
+    used to balance the dataset ahead of creating the batch. The sampler should
+    have an attribute ``return_indices``.
+
+    Parameters
+    ----------
+    X : ndarray, shape (n_samples, n_features)
+        Original imbalanced dataset.
+
+    y : ndarray, shape (n_samples,) or (n_samples, n_classes)
+        Associated targets.
+
+    sample_weight : ndarray, shape (n_samples,)
+        Sample weight.
+
+    sampler : object or None, optional (default=None)
+        A sampler instance which has an attribute ``return_indices``.
+
+    batch_size : int, optional (default=32)
+        Number of samples per gradient update.
+
+    {random_state}
+
+    Returns
+    -------
+    generator : generator of tuple
+        Generate batch of data. The tuples generated are either (X_batch,
+        y_batch) or (X_batch, y_batch, sample_weight_batch).
+
+    steps_per_epoch : int
+        The number of steps per epoch. Required by ``fit_generator`` in
+        keras.
+
+    Examples
+    --------
+    >>> from sklearn.datasets import load_iris
+    >>> iris = load_iris()
+    >>> from imblearn.datasets import make_imbalance
+    >>> X, y = make_imbalance(iris.data, iris.target, {0: 30, 1: 50, 2: 40})
+    >>> y = keras.utils.to_categorical(y, 3)
+    >>> import keras
+    >>> model = keras.models.Sequential()
+    >>> model.add(keras.layers.Dense(y.shape[1], input_dim=X.shape[1],
+    ...           activation='softmax'))
+    >>> model.compile(optimizer='sgd', loss='categorical_crossentropy',
+    ...               metrics=['accuracy'])
+    >>> from imblearn.keras import balanced_batch_generator
+    >>> from imblearn.under_sampling import NearMiss
+    >>> training_generator, steps_per_epoch = balanced_batch_generator(
+    ...     X, y, sampler=NearMiss(), batch_size=10, random_state=42)
+    >>> callback_history = model.fit_generator(generator=training_generator,
+    ...                                        steps_per_epoch=steps_per_epoch,
+    ...                                        epochs=10, verbose=0)
+
+    """
+    random_state = check_random_state(random_state)
+    if sampler is None:
+        sampler_ = RandomUnderSampler(return_indices=True,
+                                      random_state=random_state)
+    else:
+        if not hasattr(sampler, 'return_indices'):
+            raise ValueError("'sampler' needs to return the indices of "
+                             "the samples selected. Provide a sampler "
+                             "which has an attribute 'return_indices'.")
+        sampler_ = clone(sampler)
+        sampler_.set_params(return_indices=True)
+        set_random_state(sampler_, random_state)
+
+    _, _, indices = sampler_.fit_sample(X, y)
+    # shuffle the indices since the sampler are packing them by class
+    random_state.shuffle(indices)
+
+    def generator(X, y, sample_weight, indices, batch_size):
+        if sample_weight is None:
+            while True:
+                for index in range(0, len(indices), batch_size):
+                    yield (safe_indexing(X, indices[index:index + batch_size]),
+                           safe_indexing(y, indices[index:index + batch_size]))
+        else:
+            while True:
+                for index in range(0, len(indices), batch_size):
+                    yield (safe_indexing(X, indices[index:index + batch_size]),
+                           safe_indexing(y, indices[index:index + batch_size]),
+                           safe_indexing(sample_weight,
+                                         indices[index:index + batch_size]))
+
+    return (generator(X, y, sample_weight, indices, batch_size),
+            int(indices.size // batch_size))
diff --git a/imblearn/keras/tests/test_keras.py b/imblearn/keras/tests/test_generator.py
similarity index 56%
rename from imblearn/keras/tests/test_keras.py
rename to imblearn/keras/tests/test_generator.py
index 2f789e8ec..ec1650b00 100644
--- a/imblearn/keras/tests/test_keras.py
+++ b/imblearn/keras/tests/test_generator.py
@@ -1,5 +1,7 @@
 import pytest

+import numpy as np
+
 keras = pytest.importorskip('keras')

 from keras.models import Sequential
 from keras.layers import Dense
 from keras.utils import to_categorical
@@ -9,10 +11,12 @@
 from sklearn.datasets import load_iris

 from imblearn.datasets import make_imbalance
-from imblearn.keras import BalancedBatchGenerator
 from imblearn.under_sampling import ClusterCentroids
 from imblearn.under_sampling import NearMiss

+from imblearn.keras import BalancedBatchGenerator
+from imblearn.keras import balanced_batch_generator
+
 iris = load_iris()
 X, y = make_imbalance(iris.data, iris.target, {0: 30, 1: 50, 2: 40})
 y = to_categorical(y, 3)
@@ -38,14 +42,42 @@ def test_balanced_batch_generator_class_no_return_indices():
 @pytest.mark.parametrize(
-    "sampler",
-    [None, NearMiss()]
+    "sampler, sample_weight",
+    [(None, None),
+     (NearMiss(), None),
+     (None, np.random.uniform(size=(y.shape[0])))]
 )
-def test_balanced_batch_generator_class(sampler):
+def test_balanced_batch_generator_class(sampler, sample_weight):
     model = _build_keras_model(y.shape[1], X.shape[1])
     training_generator = BalancedBatchGenerator(X, y,
+                                                sample_weight=sample_weight,
                                                 sampler=sampler,
                                                 batch_size=10,
                                                 random_state=42)
     model.fit_generator(generator=training_generator,
                         epochs=10)
+
+
+def test_balanced_batch_generator_function_no_return_indices():
+    model = _build_keras_model(y.shape[1], X.shape[1])
+    with pytest.raises(ValueError, match='needs to return the indices'):
+        training_generator, sample_per_epoch = balanced_batch_generator(
+            X, y, sampler=ClusterCentroids(), batch_size=10, random_state=42)
+        model.fit_generator(generator=training_generator,
+                            epochs=10)
+
+
+@pytest.mark.parametrize(
+    "sampler, sample_weight",
+    [(None, None),
+     (NearMiss(), None),
+     (None, np.random.uniform(size=(y.shape[0])))]
+)
+def test_balanced_batch_generator_function(sampler, sample_weight):
+    model = _build_keras_model(y.shape[1], X.shape[1])
+    training_generator, steps_per_epoch = balanced_batch_generator(
+        X, y, sample_weight=sample_weight, sampler=sampler, batch_size=10,
+        random_state=42)
+    model.fit_generator(generator=training_generator,
+                        steps_per_epoch=steps_per_epoch,
+                        epochs=10)

From c7b1c4842fa467bd06eb385eb8ff0cecc140243a Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Fri, 6 Apr 2018 11:36:46 +0200
Subject: [PATCH 14/50] upload windows branch coverage

---
 appveyor.yml | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/appveyor.yml b/appveyor.yml
index ef06ad0ba..ed88b09fb 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -47,3 +47,12 @@ test_script:
   - mkdir for_test
   - cd for_test
   - pytest --pyargs imblearn --cov-report term-missing --cov=imblearn
+
+after_test:
+  - if not exist dist mkdir dist
+  - if exist .coverage (cp .coverage dist\) else (echo no .coverage)
+  - codecov
+  - if exist coverage.xml (cp coverage.xml dist\) else (echo no coverage.xml)
+
+artifacts:
+  - path: dist\*

From e3bb2f74deb33f21cc8e17e98575e1aade658621 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Fri, 6 Apr 2018 11:43:13 +0200
Subject: [PATCH 15/50] add codecov on appveyor

---
 appveyor.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/appveyor.yml b/appveyor.yml
index ed88b09fb..caaa0e28e 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -40,6 +40,7 @@ install:
   - conda install pip scipy numpy scikit-learn=0.19 -y -q
   - "conda install %OPTIONAL_DEP% -y -q"
   - conda install pytest pytest-cov -y -q
+  - pip install codecov
   - conda install nose -y -q  # FIXME: remove this line when using sklearn > 0.19
   - pip install .
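For context between patches: the keras API assembled in PATCH 11-13 is driven as
sketched below. This snippet is illustrative only and is not part of the patch
series --- the class-imbalance ratios, the single-layer model, and the epoch
counts are assumptions chosen to mirror the doctests added above. Note that
passing the one-hot encoded target to the samplers relies on the one-vs-all
target support added in PATCH 01.

    import keras
    from sklearn.datasets import load_iris

    from imblearn.datasets import make_imbalance
    from imblearn.keras import BalancedBatchGenerator, balanced_batch_generator
    from imblearn.under_sampling import NearMiss

    # Build a small imbalanced problem; the one-hot target exercises the
    # one-vs-all handling added in PATCH 01.
    iris = load_iris()
    X, y = make_imbalance(iris.data, iris.target, {0: 30, 1: 50, 2: 40})
    y = keras.utils.to_categorical(y, 3)

    model = keras.models.Sequential()
    model.add(keras.layers.Dense(y.shape[1], input_dim=X.shape[1],
                                 activation='softmax'))
    model.compile(optimizer='sgd', loss='categorical_crossentropy',
                  metrics=['accuracy'])

    # keras ``Sequence`` flavour: the sampler balances the data once and the
    # batches are then served in shuffled order.
    training_generator = BalancedBatchGenerator(X, y, sampler=NearMiss(),
                                                batch_size=10, random_state=42)
    model.fit_generator(generator=training_generator, epochs=5, verbose=0)

    # Plain generator flavour: it additionally returns the number of steps per
    # epoch, which ``fit_generator`` needs since a generator has no length.
    training_generator, steps_per_epoch = balanced_batch_generator(
        X, y, sampler=NearMiss(), batch_size=10, random_state=42)
    model.fit_generator(generator=training_generator,
                        steps_per_epoch=steps_per_epoch, epochs=5, verbose=0)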
From 8afd2cd0a6bf92731c95f98bd63f6a3ea831d576 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Fri, 6 Apr 2018 12:07:03 +0200
Subject: [PATCH 16/50] simplify codecov

---
 appveyor.yml | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/appveyor.yml b/appveyor.yml
index caaa0e28e..6bb885553 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -50,10 +50,6 @@ test_script:
   - pytest --pyargs imblearn --cov-report term-missing --cov=imblearn

 after_test:
-  - if not exist dist mkdir dist
-  - if exist .coverage (cp .coverage dist\) else (echo no .coverage)
+  - cp .coverage %APPVEYOR_BUILD_FOLDER%
+  - cd %APPVEYOR_BUILD_FOLDER%
   - codecov
-  - if exist coverage.xml (cp coverage.xml dist\) else (echo no coverage.xml)
-
-artifacts:
-  - path: dist\*

From b2c560a078e06dff1491f1589da23487affc6878 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Fri, 6 Apr 2018 13:21:35 +0200
Subject: [PATCH 17/50] remove useless statement

---
 imblearn/keras/tests/test_generator.py | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

diff --git a/imblearn/keras/tests/test_generator.py b/imblearn/keras/tests/test_generator.py
index ec1650b00..31bdd1478 100644
--- a/imblearn/keras/tests/test_generator.py
+++ b/imblearn/keras/tests/test_generator.py
@@ -31,14 +31,8 @@ def _build_keras_model(n_classes, n_features):


 def test_balanced_batch_generator_class_no_return_indices():
-    model = _build_keras_model(y.shape[1], X.shape[1])
     with pytest.raises(ValueError, match='needs to return the indices'):
-        training_generator = BalancedBatchGenerator(X, y,
-                                                    sampler=ClusterCentroids(),
-                                                    batch_size=10,
-                                                    random_state=42)
-        model.fit_generator(generator=training_generator,
-                            epochs=10)
+        BalancedBatchGenerator(X, y, sampler=ClusterCentroids(), batch_size=10)


 @pytest.mark.parametrize(
@@ -59,12 +53,9 @@ def test_balanced_batch_generator_class(sampler, sample_weight):


 def test_balanced_batch_generator_function_no_return_indices():
-    model = _build_keras_model(y.shape[1], X.shape[1])
     with pytest.raises(ValueError, match='needs to return the indices'):
-        training_generator, sample_per_epoch = balanced_batch_generator(
+        balanced_batch_generator(
             X, y, sampler=ClusterCentroids(), batch_size=10, random_state=42)
-        model.fit_generator(generator=training_generator,
-                            epochs=10)


 @pytest.mark.parametrize(

From f7aa74538d6ce474d012b6b2fb8b084e7cb7d888 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Wed, 9 May 2018 19:15:44 +0200
Subject: [PATCH 18/50] fix

---
 conftest.py | 1 +
 imblearn/keras/generator.py | 15 ++++++---------
 imblearn/keras/tests/test_generator.py | 1 -
 3 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/conftest.py b/conftest.py
index 110fdd479..31da29707 100644
--- a/conftest.py
+++ b/conftest.py
@@ -8,6 +8,7 @@
 # Set numpy array str/repr to legacy behaviour on numpy > 1.13 to make
 # the doctests pass
 import numpy as np
+
 try:
     np.set_printoptions(legacy='1.13')
 except TypeError:
diff --git a/imblearn/keras/generator.py b/imblearn/keras/generator.py
index b49489ac3..0ac16941b 100644
--- a/imblearn/keras/generator.py
+++ b/imblearn/keras/generator.py
@@ -1,14 +1,7 @@
 """Implement generators for ``keras`` which will balance the data."""
 from __future__ import division

-try:
-    import keras
-except ImportError:
-    # Skip the tests for the examples
-    import pytest
-    keras = pytest.importorskip('keras')
-    raise ImportError("To use the imblearn.keras module, you need to install "
-                      "keras.")
+import pytest

 from sklearn.base import clone
 from sklearn.utils import safe_indexing
@@ -16,9 +9,13 @@
 from sklearn.utils import check_random_state
 from sklearn.utils.testing import set_random_state

 from ..under_sampling import RandomUnderSampler
+from ..utils import Substitution
+from ..utils._docstring import _random_state_docstring

+keras = pytest.importorskip("keras")

-# FIXME: add docstring for random_state using Substitution
+
+@Substitution(random_state=_random_state_docstring)
 class BalancedBatchGenerator(keras.utils.Sequence):
     """Create balanced batches when training a keras model.
diff --git a/imblearn/keras/tests/test_generator.py b/imblearn/keras/tests/test_generator.py
index 31bdd1478..d1107d2e5 100644
--- a/imblearn/keras/tests/test_generator.py
+++ b/imblearn/keras/tests/test_generator.py
@@ -3,7 +3,6 @@
 import numpy as np

 keras = pytest.importorskip('keras')
-
 from keras.models import Sequential
 from keras.layers import Dense
 from keras.utils import to_categorical

From b62fef818e8eef9be64a4f515c11a834a2b45a8f Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Wed, 9 May 2018 21:34:34 +0200
Subject: [PATCH 19/50] iter

---
 imblearn/keras/generator.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/imblearn/keras/generator.py b/imblearn/keras/generator.py
index 0ac16941b..31f124163 100644
--- a/imblearn/keras/generator.py
+++ b/imblearn/keras/generator.py
@@ -15,7 +15,6 @@
 keras = pytest.importorskip("keras")


-@Substitution(random_state=_random_state_docstring)
 class BalancedBatchGenerator(keras.utils.Sequence):
     """Create balanced batches when training a keras model.

@@ -40,7 +40,14 @@ class BalancedBatchGenerator(keras.utils.Sequence):
     batch_size : int, optional (default=32)
         Number of samples per gradient update.

-    {random_state}
+    random_state : int, RandomState instance or None, optional (default=None)
+        Control the randomization of the algorithm
+        - If int, ``random_state`` is the seed used by the random number
+          generator;
+        - If ``RandomState`` instance, random_state is the random number
+          generator;
+        - If ``None``, the random number generator is the ``RandomState``
+          instance used by ``np.random``.

     Attributes
     ----------

From 79a4a10c72801c483f109280f82bcf211cf0658f Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Wed, 9 May 2018 22:29:08 +0200
Subject: [PATCH 20/50] FIX modify docstring to accept substitution

---
 imblearn/keras/generator.py | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/imblearn/keras/generator.py b/imblearn/keras/generator.py
index 31f124163..9f5e841c3 100644
--- a/imblearn/keras/generator.py
+++ b/imblearn/keras/generator.py
@@ -15,6 +15,7 @@
 keras = pytest.importorskip("keras")


+@Substitution(random_state=_random_state_docstring)
 class BalancedBatchGenerator(keras.utils.Sequence):
     """Create balanced batches when training a keras model.

@@ -40,14 +41,7 @@ class BalancedBatchGenerator(keras.utils.Sequence):
     batch_size : int, optional (default=32)
         Number of samples per gradient update.
-    random_state : int, RandomState instance or None, optional (default=None)
-        Control the randomization of the algorithm
-        - If int, ``random_state`` is the seed used by the random number
-          generator;
-        - If ``RandomState`` instance, random_state is the random number
-          generator;
-        - If ``None``, the random number generator is the ``RandomState``
-          instance used by ``np.random``.
+    {random_state}

     Attributes
     ----------
@@ -62,7 +56,7 @@ class BalancedBatchGenerator(keras.utils.Sequence):
     >>> from sklearn.datasets import load_iris
     >>> iris = load_iris()
     >>> from imblearn.datasets import make_imbalance
-    >>> X, y = make_imbalance(iris.data, iris.target, {0: 30, 1: 50, 2: 40})
+    >>> X, y = make_imbalance(iris.data, iris.target, {{0: 30, 1: 50, 2: 40}})
     >>> y = keras.utils.to_categorical(y, 3)
     >>> import keras
@@ -177,7 +171,7 @@ def balanced_batch_generator(X, y, sample_weight=None, sampler=None,
     >>> from sklearn.datasets import load_iris
     >>> iris = load_iris()
     >>> from imblearn.datasets import make_imbalance
-    >>> X, y = make_imbalance(iris.data, iris.target, {0: 30, 1: 50, 2: 40})
+    >>> X, y = make_imbalance(iris.data, iris.target, {{0: 30, 1: 50, 2: 40}})
     >>> y = keras.utils.to_categorical(y, 3)
     >>> import keras

From 162cb95e1fa77e6cc723e8fb945fd2d4f633638c Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Wed, 9 May 2018 22:45:04 +0200
Subject: [PATCH 21/50] FIX do not substitute inside the class

---
 imblearn/keras/generator.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/imblearn/keras/generator.py b/imblearn/keras/generator.py
index 9f5e841c3..df5e2f4a8 100644
--- a/imblearn/keras/generator.py
+++ b/imblearn/keras/generator.py
@@ -15,7 +15,6 @@
 keras = pytest.importorskip("keras")


-@Substitution(random_state=_random_state_docstring)
 class BalancedBatchGenerator(keras.utils.Sequence):
     """Create balanced batches when training a keras model.

@@ -40,7 +40,14 @@ class BalancedBatchGenerator(keras.utils.Sequence):
     batch_size : int, optional (default=32)
         Number of samples per gradient update.

-    {random_state}
+    random_state : int, RandomState instance or None, optional (default=None)
+        Control the randomization of the algorithm
+        - If int, ``random_state`` is the seed used by the random number
+          generator;
+        - If ``RandomState`` instance, random_state is the random number
+          generator;
+        - If ``None``, the random number generator is the ``RandomState``
+          instance used by ``np.random``.

     Attributes
     ----------

From 06955e251c5ffebe7986cda275181696d7bf991a Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Wed, 9 May 2018 23:26:46 +0200
Subject: [PATCH 22/50] EHN add tensorflow sequence

---
 imblearn/__init__.py | 3 +
 imblearn/keras/__init__.py | 4 +-
 imblearn/keras/_generator.py | 111 ++++++++++
 imblearn/keras/generator.py | 230 --------------------
 imblearn/tensorflow/__init__.py | 6 +
 imblearn/tensorflow/_generator.py | 87 ++++++++
 imblearn/tensorflow/tests/test_generator.py | 14 ++
 7 files changed, 223 insertions(+), 232 deletions(-)
 create mode 100644 imblearn/keras/_generator.py
 delete mode 100644 imblearn/keras/generator.py
 create mode 100644 imblearn/tensorflow/__init__.py
 create mode 100644 imblearn/tensorflow/_generator.py
 create mode 100644 imblearn/tensorflow/tests/test_generator.py

diff --git a/imblearn/__init__.py b/imblearn/__init__.py
index 7803ca016..0cb3ca8fe 100644
--- a/imblearn/__init__.py
+++ b/imblearn/__init__.py
@@ -21,6 +21,9 @@ with imbalanced datasets.
 over_sampling
     Module which provides methods to over-sample a dataset.
+tensorflow
+    Module which provides custom generators and layers for deep learning
+    using tensorflow.
 under-sampling
     Module which provides methods to under-sample a dataset.
 utils
diff --git a/imblearn/keras/__init__.py b/imblearn/keras/__init__.py
index 8acdd2b03..99b91f77f 100644
--- a/imblearn/keras/__init__.py
+++ b/imblearn/keras/__init__.py
@@ -1,8 +1,8 @@
 """The :mod:`imblearn.keras` provides utilities to deal with imbalanced dataset
 in keras."""

-from .generator import BalancedBatchGenerator
-from .generator import balanced_batch_generator
+from ._generator import BalancedBatchGenerator
+from ..tensorflow._generator import balanced_batch_generator

 __all__ = ['BalancedBatchGenerator',
            'balanced_batch_generator']
diff --git a/imblearn/keras/_generator.py b/imblearn/keras/_generator.py
new file mode 100644
index 000000000..d655b926a
--- /dev/null
+++ b/imblearn/keras/_generator.py
@@ -0,0 +1,111 @@
+"""Implement generators for ``keras`` which will balance the data."""
+from __future__ import division
+
+import pytest
+
+from sklearn.base import clone
+from sklearn.utils import safe_indexing
+from sklearn.utils import check_random_state
+from sklearn.utils.testing import set_random_state
+
+from ..under_sampling import RandomUnderSampler
+
+keras = pytest.importorskip("keras")
+
+
+class BalancedBatchGenerator(keras.utils.Sequence):
+    """Create balanced batches when training a keras model.
+
+    Create a keras ``Sequence`` which is given to ``fit_generator``. The
+    sampler defines the sampling strategy used to balance the dataset ahead of
+    creating the batch. The sampler should have an attribute
+    ``return_indices``.
+
+    Parameters
+    ----------
+    X : ndarray, shape (n_samples, n_features)
+        Original imbalanced dataset.
+
+    y : ndarray, shape (n_samples,) or (n_samples, n_classes)
+        Associated targets.
+
+    sample_weight : ndarray, shape (n_samples,)
+        Sample weight.
+
+    sampler : object or None, optional (default=None)
+        A sampler instance which has an attribute ``return_indices``.
+
+    batch_size : int, optional (default=32)
+        Number of samples per gradient update.
+
+    random_state : int, RandomState instance or None, optional (default=None)
+        Control the randomization of the algorithm
+        - If int, ``random_state`` is the seed used by the random number
+          generator;
+        - If ``RandomState`` instance, random_state is the random number
+          generator;
+        - If ``None``, the random number generator is the ``RandomState``
+          instance used by ``np.random``.
+
+    Attributes
+    ----------
+    sampler_ : object
+        The sampler used to balance the dataset.
+
+    indices_ : ndarray, shape (n_samples, n_features)
+        The indices of the samples selected during sampling.
+
+    """
+    def __init__(self, X, y, sample_weight=None, sampler=None, batch_size=32,
+                 random_state=None):
+        self.X = X
+        self.y = y
+        self.sample_weight = sample_weight
+        self.sampler = sampler
+        self.batch_size = batch_size
+        self.random_state = random_state
+        self._sample()
+
+    def _sample(self):
+        random_state = check_random_state(self.random_state)
+        if self.sampler is None:
+            self.sampler_ = RandomUnderSampler(return_indices=True,
+                                               random_state=random_state)
+        else:
+            if not hasattr(self.sampler, 'return_indices'):
+                raise ValueError("'sampler' needs to return the indices of "
+                                 "the samples selected. Provide a sampler "
+                                 "which has an attribute 'return_indices'.")
+            self.sampler_ = clone(self.sampler)
+            self.sampler_.set_params(return_indices=True)
+            set_random_state(self.sampler_, random_state)
+
+        _, _, self.indices_ = self.sampler_.fit_sample(self.X, self.y)
+        # shuffle the indices since the sampler are packing them by class
+        random_state.shuffle(self.indices_)
+
+    def __len__(self):
+        return int(self.indices_.size // self.batch_size)
+
+    def __getitem__(self, index):
+        if self.sample_weight is None:
+            return (
+                safe_indexing(self.X,
+                              self.indices_[index * self.batch_size:
+                                            (index + 1) * self.batch_size]),
+                safe_indexing(self.y,
+                              self.indices_[index * self.batch_size:
+                                            (index + 1) * self.batch_size])
+            )
+        else:
+            return (
+                safe_indexing(self.X,
+                              self.indices_[index * self.batch_size:
+                                            (index + 1) * self.batch_size]),
+                safe_indexing(self.y,
+                              self.indices_[index * self.batch_size:
+                                            (index + 1) * self.batch_size]),
+                safe_indexing(self.sample_weight,
+                              self.indices_[index * self.batch_size:
+                                            (index + 1) * self.batch_size])
+            )
diff --git a/imblearn/keras/generator.py b/imblearn/keras/generator.py
deleted file mode 100644
index df5e2f4a8..000000000
--- a/imblearn/keras/generator.py
+++ /dev/null
@@ -1,230 +0,0 @@
-"""Implement generators for ``keras`` which will balance the data."""
-from __future__ import division
-
-import pytest
-
-from sklearn.base import clone
-from sklearn.utils import safe_indexing
-from sklearn.utils import check_random_state
-from sklearn.utils.testing import set_random_state
-
-from ..under_sampling import RandomUnderSampler
-from ..utils import Substitution
-from ..utils._docstring import _random_state_docstring
-
-keras = pytest.importorskip("keras")
-
-
-class BalancedBatchGenerator(keras.utils.Sequence):
-    """Create balanced batches when training a keras model.
-
-    Create a keras ``Sequence`` which is given to ``fit_generator``. The
-    sampler defines the sampling strategy used to balance the dataset ahead of
-    creating the batch. The sampler should have an attribute
-    ``return_indices``.
-
-    Parameters
-    ----------
-    X : ndarray, shape (n_samples, n_features)
-        Original imbalanced dataset.
-
-    y : ndarray, shape (n_samples,) or (n_samples, n_classes)
-        Associated targets.
-
-    sample_weight : ndarray, shape (n_samples,)
-        Sample weight.
-
-    sampler : object or None, optional (default=None)
-        A sampler instance which has an attribute ``return_indices``.
-
-    batch_size : int, optional (default=32)
-        Number of samples per gradient update.
-
-    random_state : int, RandomState instance or None, optional (default=None)
-        Control the randomization of the algorithm
-        - If int, ``random_state`` is the seed used by the random number
-          generator;
-        - If ``RandomState`` instance, random_state is the random number
-          generator;
-        - If ``None``, the random number generator is the ``RandomState``
-          instance used by ``np.random``.
-
-    Attributes
-    ----------
-    sampler_ : object
-        The sampler used to balance the dataset.
-
-    indices_ : ndarray, shape (n_samples, n_features)
-        The indices of the samples selected during sampling.
-
-    Examples
-    --------
-    >>> from sklearn.datasets import load_iris
-    >>> iris = load_iris()
-    >>> from imblearn.datasets import make_imbalance
-    >>> X, y = make_imbalance(iris.data, iris.target, {{0: 30, 1: 50, 2: 40}})
-    >>> y = keras.utils.to_categorical(y, 3)
-    >>> import keras
-    >>> model = keras.models.Sequential()
-    >>> model.add(keras.layers.Dense(y.shape[1], input_dim=X.shape[1],
-    ...           activation='softmax'))
-    >>> model.compile(optimizer='sgd', loss='categorical_crossentropy',
-    ...               metrics=['accuracy'])
-    >>> from imblearn.keras import BalancedBatchGenerator
-    >>> from imblearn.under_sampling import NearMiss
-    >>> training_generator = BalancedBatchGenerator(
-    ...     X, y, sampler=NearMiss(), batch_size=10, random_state=42)
-    >>> callback_history = model.fit_generator(generator=training_generator,
-    ...                                        epochs=10, verbose=0)
-
-    """
-    def __init__(self, X, y, sample_weight=None, sampler=None, batch_size=32,
-                 random_state=None):
-        self.X = X
-        self.y = y
-        self.sample_weight = sample_weight
-        self.sampler = sampler
-        self.batch_size = batch_size
-        self.random_state = random_state
-        self._sample()
-
-    def _sample(self):
-        random_state = check_random_state(self.random_state)
-        if self.sampler is None:
-            self.sampler_ = RandomUnderSampler(return_indices=True,
-                                               random_state=random_state)
-        else:
-            if not hasattr(self.sampler, 'return_indices'):
-                raise ValueError("'sampler' needs to return the indices of "
-                                 "the samples selected. Provide a sampler "
-                                 "which has an attribute 'return_indices'.")
-            self.sampler_ = clone(self.sampler)
-            self.sampler_.set_params(return_indices=True)
-            set_random_state(self.sampler_, random_state)
-
-        _, _, self.indices_ = self.sampler_.fit_sample(self.X, self.y)
-        # shuffle the indices since the sampler are packing them by class
-        random_state.shuffle(self.indices_)
-
-    def __len__(self):
-        return int(self.indices_.size // self.batch_size)
-
-    def __getitem__(self, index):
-        if self.sample_weight is None:
-            return (
-                safe_indexing(self.X,
-                              self.indices_[index * self.batch_size:
-                                            (index + 1) * self.batch_size]),
-                safe_indexing(self.y,
-                              self.indices_[index * self.batch_size:
-                                            (index + 1) * self.batch_size])
-            )
-        else:
-            return (
-                safe_indexing(self.X,
-                              self.indices_[index * self.batch_size:
-                                            (index + 1) * self.batch_size]),
-                safe_indexing(self.y,
-                              self.indices_[index * self.batch_size:
-                                            (index + 1) * self.batch_size]),
-                safe_indexing(self.sample_weight,
-                              self.indices_[index * self.batch_size:
-                                            (index + 1) * self.batch_size])
-            )
-
-
-@Substitution(random_state=_random_state_docstring)
-def balanced_batch_generator(X, y, sample_weight=None, sampler=None,
-                             batch_size=32, random_state=None):
-    """Create a balanced batch generator to train a keras model.
-
-    Returns a generator --- as well as the number of steps per epoch --- which
-    is given to ``fit_generator``. The sampler defines the sampling strategy
-    used to balance the dataset ahead of creating the batch. The sampler should
-    have an attribute ``return_indices``.
-
-    Parameters
-    ----------
-    X : ndarray, shape (n_samples, n_features)
-        Original imbalanced dataset.
-
-    y : ndarray, shape (n_samples,) or (n_samples, n_classes)
-        Associated targets.
-
-    sample_weight : ndarray, shape (n_samples,)
-        Sample weight.
-
-    sampler : object or None, optional (default=None)
-        A sampler instance which has an attribute ``return_indices``.
-
-    batch_size : int, optional (default=32)
-        Number of samples per gradient update.
-
-    {random_state}
-
-    Returns
-    -------
-    generator : generator of tuple
-        Generate batch of data. The tuples generated are either (X_batch,
-        y_batch) or (X_batch, y_batch, sample_weight_batch).
-
-    steps_per_epoch : int
-        The number of steps per epoch. Required by ``fit_generator`` in
-        keras.
- - Examples - -------- - >>> from sklearn.datasets import load_iris - >>> iris = load_iris() - >>> from imblearn.datasets import make_imbalance - >>> X, y = make_imbalance(iris.data, iris.target, {{0: 30, 1: 50, 2: 40}}) - >>> y = keras.utils.to_categorical(y, 3) - >>> import keras - >>> model = keras.models.Sequential() - >>> model.add(keras.layers.Dense(y.shape[1], input_dim=X.shape[1], - ... activation='softmax')) - >>> model.compile(optimizer='sgd', loss='categorical_crossentropy', - ... metrics=['accuracy']) - >>> from imblearn.keras import balanced_batch_generator - >>> from imblearn.under_sampling import NearMiss - >>> training_generator, steps_per_epoch = balanced_batch_generator( - ... X, y, sampler=NearMiss(), batch_size=10, random_state=42) - >>> callback_history = model.fit_generator(generator=training_generator, - ... steps_per_epoch=steps_per_epoch, - ... epochs=10, verbose=0) - - - """ - random_state = check_random_state(random_state) - if sampler is None: - sampler_ = RandomUnderSampler(return_indices=True, - random_state=random_state) - else: - if not hasattr(sampler, 'return_indices'): - raise ValueError("'sampler' needs to return the indices of " - "the samples selected. Provide a sampler " - "which has an attribute 'return_indices'.") - sampler_ = clone(sampler) - sampler_.set_params(return_indices=True) - set_random_state(sampler_, random_state) - - _, _, indices = sampler_.fit_sample(X, y) - # shuffle the indices since the sampler are packing them by class - random_state.shuffle(indices) - - def generator(X, y, sample_weight, indices, batch_size): - if sample_weight is None: - while True: - for index in range(0, len(indices), batch_size): - yield (safe_indexing(X, indices[index:index + batch_size]), - safe_indexing(y, indices[index:index + batch_size])) - else: - while True: - for index in range(0, len(indices), batch_size): - yield (safe_indexing(X, indices[index:index + batch_size]), - safe_indexing(y, indices[index:index + batch_size]), - safe_indexing(sample_weight, - indices[index:index + batch_size])) - - return (generator(X, y, sample_weight, indices, batch_size), - int(indices.size // batch_size)) diff --git a/imblearn/tensorflow/__init__.py b/imblearn/tensorflow/__init__.py new file mode 100644 index 000000000..3224a7db1 --- /dev/null +++ b/imblearn/tensorflow/__init__.py @@ -0,0 +1,6 @@ +"""The :mod:`imblearn.tensorflow` provides utilities to deal with imbalanced +dataset in tensorflow.""" + +from ._generator import balanced_batch_generator + +__all__ = ['balanced_batch_generator'] diff --git a/imblearn/tensorflow/_generator.py b/imblearn/tensorflow/_generator.py new file mode 100644 index 000000000..f1704bbbb --- /dev/null +++ b/imblearn/tensorflow/_generator.py @@ -0,0 +1,87 @@ +"""Implement generators for ``tensorflow`` which will balance the data.""" + +from __future__ import division + +from sklearn.base import clone +from sklearn.utils import safe_indexing +from sklearn.utils import check_random_state +from sklearn.utils.testing import set_random_state + +from ..under_sampling import RandomUnderSampler +from ..utils import Substitution +from ..utils._docstring import _random_state_docstring + + +@Substitution(random_state=_random_state_docstring) +def balanced_batch_generator(X, y, sample_weight=None, sampler=None, + batch_size=32, random_state=None): + """Create a balanced batch generator to train keras model. + + Returns a generator --- as well as the number of step per epoch --- which + is given to ``fit_generator``. 
The sampler defines the sampling strategy
+    used to balance the dataset ahead of creating the batch. The sampler should
+    have an attribute ``return_indices``.
+
+    Parameters
+    ----------
+    X : ndarray, shape (n_samples, n_features)
+        Original imbalanced dataset.
+
+    y : ndarray, shape (n_samples,) or (n_samples, n_classes)
+        Associated targets.
+
+    sample_weight : ndarray, shape (n_samples,)
+        Sample weight.
+
+    sampler : object or None, optional (default=None)
+        A sampler instance which has an attribute ``return_indices``.
+
+    batch_size : int, optional (default=32)
+        Number of samples per gradient update.
+
+    {random_state}
+
+    Returns
+    -------
+    generator : generator of tuple
+        Generate batch of data. The tuple generated are either (X_batch,
+        y_batch) or (X_batch, y_batch, sampler_weight_batch).
+
+    steps_per_epoch : int
+        The number of samples per epoch. Required by ``fit_generator`` in
+        keras.
+
+    """
+    random_state = check_random_state(random_state)
+    if sampler is None:
+        sampler_ = RandomUnderSampler(return_indices=True,
+                                      random_state=random_state)
+    else:
+        if not hasattr(sampler, 'return_indices'):
+            raise ValueError("'sampler' needs to return the indices of "
+                             "the samples selected. Provide a sampler "
+                             "which has an attribute 'return_indices'.")
+        sampler_ = clone(sampler)
+        sampler_.set_params(return_indices=True)
+        set_random_state(sampler_, random_state)
+
+    _, _, indices = sampler_.fit_sample(X, y)
+    # shuffle the indices since the sampler packs them by class
+    random_state.shuffle(indices)
+
+    def generator(X, y, sample_weight, indices, batch_size):
+        if sample_weight is None:
+            while True:
+                for index in range(0, len(indices), batch_size):
+                    yield (safe_indexing(X, indices[index:index + batch_size]),
+                           safe_indexing(y, indices[index:index + batch_size]))
+        else:
+            while True:
+                for index in range(0, len(indices), batch_size):
+                    yield (safe_indexing(X, indices[index:index + batch_size]),
+                           safe_indexing(y, indices[index:index + batch_size]),
+                           safe_indexing(sample_weight,
+                                         indices[index:index + batch_size]))
+
+    return (generator(X, y, sample_weight, indices, batch_size),
+            int(indices.size // batch_size))
diff --git a/imblearn/tensorflow/tests/test_generator.py b/imblearn/tensorflow/tests/test_generator.py
new file mode 100644
index 000000000..166fbb870
--- /dev/null
+++ b/imblearn/tensorflow/tests/test_generator.py
@@ -0,0 +1,14 @@
+import pytest
+
+tf = pytest.importorskip('tensforflow')
+
+from sklearn.datasets import load_iris
+
+from imblearn.datasets import make_imbalance
+from imblearn.under_sampling import ClusterCentroids
+from imblearn.under_sampling import NearMiss
+
+from imblearn.tensforflow import balanced_batch_generator
+
+iris = load_iris()
+X, y = make_imbalance(iris.data, iris.target, {0: 30, 1: 50, 2: 40})
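The new test module above stops at building the imbalanced iris dataset. As a
sketch of a minimal smoke test, assuming only the API introduced by this patch
(a generator plus a step count, with ``RandomUnderSampler`` as the default
sampler), one could iterate a full epoch and check the batch shapes::

    from sklearn.datasets import load_iris

    from imblearn.datasets import make_imbalance
    from imblearn.tensorflow import balanced_batch_generator

    iris = load_iris()
    X, y = make_imbalance(iris.data, iris.target, {0: 30, 1: 50, 2: 40})

    batch_size = 10
    training_generator, steps_per_epoch = balanced_batch_generator(
        X, y, batch_size=batch_size, random_state=42)
    # the generator cycles indefinitely; one epoch is steps_per_epoch batches
    for _ in range(steps_per_epoch):
        X_batch, y_batch = next(training_generator)
        assert X_batch.shape == (batch_size, X.shape[1])
        assert y_batch.shape == (batch_size,)

From 0ce4a5c5621027181d77157242bb41a3f1415ec3 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Fri, 11 May 2018 13:39:59 +0200
Subject: [PATCH 23/50] TST generator for tensorflow

---
 imblearn/keras/tests/test_generator.py      | 10 +--
 imblearn/tensorflow/tests/test_generator.py | 72 +++++++++++++++++++--
 2 files changed, 71 insertions(+), 11 deletions(-)

diff --git a/imblearn/keras/tests/test_generator.py b/imblearn/keras/tests/test_generator.py
index d1107d2e5..d94dd73da 100644
--- a/imblearn/keras/tests/test_generator.py
+++ b/imblearn/keras/tests/test_generator.py
@@ -2,11 +2,6 @@
 
 import numpy as np
 
-keras = pytest.importorskip('keras')
-from keras.models import Sequential
-from keras.layers import Dense
-from keras.utils import 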
to_categorical - from sklearn.datasets import load_iris from imblearn.datasets import make_imbalance @@ -16,6 +11,11 @@ from imblearn.keras import BalancedBatchGenerator from imblearn.keras import balanced_batch_generator +keras = pytest.importorskip('keras') +from keras.models import Sequential +from keras.layers import Dense +from keras.utils import to_categorical + iris = load_iris() X, y = make_imbalance(iris.data, iris.target, {0: 30, 1: 50, 2: 40}) y = to_categorical(y, 3) diff --git a/imblearn/tensorflow/tests/test_generator.py b/imblearn/tensorflow/tests/test_generator.py index 166fbb870..2045690e4 100644 --- a/imblearn/tensorflow/tests/test_generator.py +++ b/imblearn/tensorflow/tests/test_generator.py @@ -1,14 +1,74 @@ -import pytest +from __future__ import division -tf = pytest.importorskip('tensforflow') +import pytest +import numpy as np from sklearn.datasets import load_iris from imblearn.datasets import make_imbalance -from imblearn.under_sampling import ClusterCentroids from imblearn.under_sampling import NearMiss -from imblearn.tensforflow import balanced_batch_generator +from imblearn.tensorflow import balanced_batch_generator + +tf = pytest.importorskip('tensorflow') + + +@pytest.mark.parametrize( + "sampler", + [None, NearMiss()] +) +def test_balanced_batch_generator(sampler): + X, y = load_iris(return_X_y=True) + X, y = make_imbalance(X, y, {0: 30, 1: 50, 2: 40}) + X = X.astype(np.float32) + + batch_size = 10 + training_generator, steps_per_epoch = balanced_batch_generator( + X, y, sample_weight=None, sampler=sampler, + batch_size=batch_size, random_state=42) + + learning_rate = 0.01 + epochs = 10 + input_size = X.shape[1] + output_size = 3 + + # helper functions + def init_weights(shape): + return tf.Variable(tf.random_normal(shape, stddev=0.01)) + + def accuracy(y_true, y_pred): + return np.mean(np.argmax(y_pred, axis=1) == y_true) + + # input and output + data = tf.placeholder("float32", shape=[None, input_size]) + targets = tf.placeholder("int32", shape=[None]) + + # build the model and weights + W = init_weights([input_size, output_size]) + b = init_weights([output_size]) + out_act = tf.nn.sigmoid(tf.matmul(data, W) + b) + + # build the loss, predict, and train operator + cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits( + logits=out_act, labels=targets) + loss = tf.reduce_sum(cross_entropy) + optimizer = tf.train.GradientDescentOptimizer(learning_rate) + train_op = optimizer.minimize(loss) + predict = tf.nn.softmax(out_act) + + # Initialization of all variables in the graph + init = tf.global_variables_initializer() + + with tf.Session() as sess: + sess.run(init) + + for e in range(epochs): + for i in range(steps_per_epoch): + X_batch, y_batch = next(training_generator) + sess.run([train_op, loss], + feed_dict={data: X_batch, targets: y_batch}) -iris = load_iris() -X, y = make_imbalance(iris.data, iris.target, {0: 30, 1: 50, 2: 40}) + # For each epoch, run accuracy on train and test + predicts_train = sess.run(predict, feed_dict={data: X}) + print("epoch: {} train accuracy: {:.3f}" + .format(e, accuracy(y, predicts_train))) From d2bc9ca9afc6a1c73d51df6711912bfcbf47005d Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 14 May 2018 01:03:11 +0200 Subject: [PATCH 24/50] DOCAdd simple examples --- imblearn/keras/__init__.py | 67 +++++++++++++++++++++++++++++++ imblearn/keras/_generator.py | 25 ++++++++++++ imblearn/tensorflow/_generator.py | 57 ++++++++++++++++++++++++++ 3 files changed, 149 insertions(+) diff --git 
a/imblearn/keras/__init__.py b/imblearn/keras/__init__.py index 99b91f77f..449a2193b 100644 --- a/imblearn/keras/__init__.py +++ b/imblearn/keras/__init__.py @@ -4,5 +4,72 @@ from ._generator import BalancedBatchGenerator from ..tensorflow._generator import balanced_batch_generator +balanced_batch_generator.__doc__ = \ + """Create a balanced batch generator to train keras model. + + Returns a generator --- as well as the number of step per epoch --- which + is given to ``fit_generator``. The sampler defines the sampling strategy + used to balance the dataset ahead of creating the batch. The sampler should + have an attribute ``return_indices``. + + Parameters + ---------- + X : ndarray, shape (n_samples, n_features) + Original imbalanced dataset. + + y : ndarray, shape (n_samples,) or (n_samples, n_classes) + Associated targets. + + sample_weight : ndarray, shape (n_samples,) + Sample weight. + + sampler : object or None, optional (default=None) + A sampler instance which has an attribute ``return_indices``. + + batch_size : int, optional (default=32) + Number of samples per gradient update. + + random_state : int, RandomState instance or None, optional (default=None) + Control the randomization of the algorithm + - If int, ``random_state`` is the seed used by the random number + generator; + - If ``RandomState`` instance, random_state is the random number + generator; + - If ``None``, the random number generator is the ``RandomState`` + instance used by ``np.random``. + + Returns + ------- + generator : generator of tuple + Generate batch of data. The tuple generated are either (X_batch, + y_batch) or (X_batch, y_batch, sampler_weight_batch). + + steps_per_epoch : int + The number of samples per epoch. Required by ``fit_generator`` in + keras. + + Examples + -------- + >>> from sklearn.datasets import load_iris + >>> iris = load_iris() + >>> from imblearn.datasets import make_imbalance + >>> X, y = make_imbalance(iris.data, iris.target, {{0: 30, 1: 50, 2: 40}}) + >>> y = keras.utils.to_categorical(y, 3) + >>> import keras + >>> model = keras.models.Sequential() + >>> model.add(keras.layers.Dense(y.shape[1], input_dim=X.shape[1], + ... activation='softmax')) + >>> model.compile(optimizer='sgd', loss='categorical_crossentropy', + ... metrics=['accuracy']) + >>> from imblearn.keras import balanced_batch_generator + >>> from imblearn.under_sampling import NearMiss + >>> training_generator, steps_per_epoch = balanced_batch_generator( + ... X, y, sampler=NearMiss(), batch_size=10, random_state=42) + >>> callback_history = model.fit_generator(generator=training_generator, + ... steps_per_epoch=steps_per_epoch, + ... epochs=10, verbose=0) + + """ + __all__ = ['BalancedBatchGenerator', 'balanced_batch_generator'] diff --git a/imblearn/keras/_generator.py b/imblearn/keras/_generator.py index d655b926a..51609cd1f 100644 --- a/imblearn/keras/_generator.py +++ b/imblearn/keras/_generator.py @@ -9,10 +9,13 @@ from sklearn.utils.testing import set_random_state from ..under_sampling import RandomUnderSampler +from ..utils import Substitution +from ..utils._docstring import _random_state_docstring keras = pytest.importorskip("keras") +@Substitution(random_state=_random_state_docstring) class BalancedBatchGenerator(keras.utils.Sequence): """Create balanced batches when training a keras model. @@ -55,6 +58,28 @@ class BalancedBatchGenerator(keras.utils.Sequence): indices_ : ndarray, shape (n_samples, n_features) The indices of the samples selected during sampling. 
+ Examples + -------- + >>> from sklearn.datasets import load_iris + >>> iris = load_iris() + >>> from imblearn.datasets import make_imbalance + >>> class_dict = dict() + >>> class_dict[0] = 30; class_dict[1] = 50; class_dict[2] = 40 + >>> X, y = make_imbalance(iris.data, iris.target, class_dict) + >>> y = keras.utils.to_categorical(y, 3) + >>> import keras + >>> model = keras.models.Sequential() + >>> model.add(keras.layers.Dense(y.shape[1], input_dim=X.shape[1], + ... activation='softmax')) + >>> model.compile(optimizer='sgd', loss='categorical_crossentropy', + ... metrics=['accuracy']) + >>> from imblearn.keras import BalancedBatchGenerator + >>> from imblearn.under_sampling import NearMiss + >>> training_generator = BalancedBatchGenerator( + ... X, y, sampler=NearMiss(), batch_size=10, random_state=42) + >>> callback_history = model.fit_generator(generator=training_generator, + ... epochs=10, verbose=0) + """ def __init__(self, X, y, sample_weight=None, sampler=None, batch_size=32, random_state=None): diff --git a/imblearn/tensorflow/_generator.py b/imblearn/tensorflow/_generator.py index f1704bbbb..e5ee1dca1 100644 --- a/imblearn/tensorflow/_generator.py +++ b/imblearn/tensorflow/_generator.py @@ -51,7 +51,64 @@ def balanced_batch_generator(X, y, sample_weight=None, sampler=None, The number of samples per epoch. Required by ``fit_generator`` in keras. + Examples + -------- + >>> import numpy as np + >>> from sklearn.datasets import load_iris + >>> X, y = load_iris(return_X_y=True) + >>> class_dict = dict() + >>> class_dict[0] = 30; class_dict[1] = 50; class_dict[2] = 40 + >>> from imblearn.datasets import make_imbalance + >>> X, y = make_imbalance(X, y, class_dict) + >>> X = X.astype(np.float32) + >>> batch_size, learning_rate, epochs = 10, 0.01, 10 + >>> training_generator, steps_per_epoch = balanced_batch_generator( + ... X, y, sample_weight=None, sampler=None, + ... batch_size=batch_size, random_state=42) + >>> input_size, output_size = X.shape[1], 3 + >>> import tensorflow as tf + >>> def init_weights(shape): + ... return tf.Variable(tf.random_normal(shape, stddev=0.01)) + >>> def accuracy(y_true, y_pred): + ... return np.mean(np.argmax(y_pred, axis=1) == y_true) + >>> # input and output + >>> data = tf.placeholder("float32", shape=[None, input_size]) + >>> targets = tf.placeholder("int32", shape=[None]) + >>> # build the model and weights + >>> W = init_weights([input_size, output_size]) + >>> b = init_weights([output_size]) + >>> out_act = tf.nn.sigmoid(tf.matmul(data, W) + b) + >>> # build the loss, predict, and train operator + >>> cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits( + ... logits=out_act, labels=targets) + >>> loss = tf.reduce_sum(cross_entropy) + >>> optimizer = tf.train.GradientDescentOptimizer(learning_rate) + >>> train_op = optimizer.minimize(loss) + >>> predict = tf.nn.softmax(out_act) + >>> # Initialization of all variables in the graph + >>> init = tf.global_variables_initializer() + >>> with tf.Session() as sess: + ... print('Starting training') + ... sess.run(init) + ... for e in range(epochs): + ... for i in range(steps_per_epoch): + ... X_batch, y_batch = next(training_generator) + ... feed_dict = dict() + ... feed_dict[data] = X_batch; feed_dict[targets] = y_batch + ... sess.run([train_op, loss], feed_dict=feed_dict) + ... # For each epoch, run accuracy on train and test + ... feed_dict = dict() + ... feed_dict[data] = X + ... predicts_train = sess.run(predict, feed_dict=feed_dict) + ... print("epoch: {{}} train accuracy: {{:.3f}}" + ... 
.format(e, accuracy(y, predicts_train))) + ... # doctest: +ELLIPSIS + Starting training + [... + """ + + random_state = check_random_state(random_state) if sampler is None: sampler_ = RandomUnderSampler(return_indices=True, From 731622e23eaf3d8c03abc49da8577144cecad0bc Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 14 May 2018 01:07:13 +0200 Subject: [PATCH 25/50] DOC add to api documentation --- doc/api.rst | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/doc/api.rst b/doc/api.rst index f9566146f..3af9fe59e 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -111,6 +111,41 @@ Prototype selection ensemble.BalancedBaggingClassifier ensemble.EasyEnsemble +.. _keras_ref: + +:mod:`imblearn.keras`: Batch generator for Keras +================================================ + +.. automodule:: imblearn.keras + :no-members: + :no-inherited-members: + +.. currentmodule:: imblearn + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + keras.BalancedBatchGenerator + keras.balanced_batch_generator + +.. _tensorflow_ref: + +:mod:`imblearn.tensorflow`: Batch generator for TensorFlow +========================================================== + +.. automodule:: imblearn.tensorflow + :no-members: + :no-inherited-members: + +.. currentmodule:: imblearn + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + tensorflow.balanced_batch_generator + .. _misc_ref: Miscellaneous From 0e8eafe3e4dba1177379adc985fadcd303334e70 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 14 May 2018 01:10:16 +0200 Subject: [PATCH 26/50] FIX add function summary --- doc/api.rst | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/doc/api.rst b/doc/api.rst index 3af9fe59e..4abc49d33 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -127,6 +127,11 @@ Prototype selection :template: class.rst keras.BalancedBatchGenerator + +.. autosummary:: + :toctree: generated/ + :template: function.rst + keras.balanced_batch_generator .. _tensorflow_ref: @@ -142,8 +147,8 @@ Prototype selection .. autosummary:: :toctree: generated/ - :template: class.rst - + :template: function.rst + tensorflow.balanced_batch_generator .. _misc_ref: From b64e04f65d0fa19e8701cb170b6576c99e019617 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 14 May 2018 16:56:10 +0200 Subject: [PATCH 27/50] iter --- imblearn/keras/__init__.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/imblearn/keras/__init__.py b/imblearn/keras/__init__.py index 449a2193b..c4e81f4e3 100644 --- a/imblearn/keras/__init__.py +++ b/imblearn/keras/__init__.py @@ -51,11 +51,14 @@ Examples -------- >>> from sklearn.datasets import load_iris - >>> iris = load_iris() + >>> X, y = load_iris(return_X_y=True) >>> from imblearn.datasets import make_imbalance - >>> X, y = make_imbalance(iris.data, iris.target, {{0: 30, 1: 50, 2: 40}}) - >>> y = keras.utils.to_categorical(y, 3) + >>> class_dict = dict() + >>> class_dict[0] = 30; class_dict[1] = 50; class_dict[2] = 40 + >>> from imblearn.datasets import make_imbalance + >>> X, y = make_imbalance(X, y, class_dict) >>> import keras + >>> y = keras.utils.to_categorical(y, 3) >>> model = keras.models.Sequential() >>> model.add(keras.layers.Dense(y.shape[1], input_dim=X.shape[1], ... 
activation='softmax')) From c59fb8b702966e7973ef8297e38efa3056a35b3e Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 14 May 2018 21:39:44 +0200 Subject: [PATCH 28/50] Update docstring --- imblearn/keras/__init__.py | 72 +---------------------------- imblearn/keras/_generator.py | 75 ++++++++++++++++++++++++++++++- imblearn/tensorflow/_generator.py | 1 - imblearn/utils/_docstring.py | 3 +- 4 files changed, 76 insertions(+), 75 deletions(-) diff --git a/imblearn/keras/__init__.py b/imblearn/keras/__init__.py index c4e81f4e3..407e0c7dd 100644 --- a/imblearn/keras/__init__.py +++ b/imblearn/keras/__init__.py @@ -2,77 +2,7 @@ in keras.""" from ._generator import BalancedBatchGenerator -from ..tensorflow._generator import balanced_batch_generator - -balanced_batch_generator.__doc__ = \ - """Create a balanced batch generator to train keras model. - - Returns a generator --- as well as the number of step per epoch --- which - is given to ``fit_generator``. The sampler defines the sampling strategy - used to balance the dataset ahead of creating the batch. The sampler should - have an attribute ``return_indices``. - - Parameters - ---------- - X : ndarray, shape (n_samples, n_features) - Original imbalanced dataset. - - y : ndarray, shape (n_samples,) or (n_samples, n_classes) - Associated targets. - - sample_weight : ndarray, shape (n_samples,) - Sample weight. - - sampler : object or None, optional (default=None) - A sampler instance which has an attribute ``return_indices``. - - batch_size : int, optional (default=32) - Number of samples per gradient update. - - random_state : int, RandomState instance or None, optional (default=None) - Control the randomization of the algorithm - - If int, ``random_state`` is the seed used by the random number - generator; - - If ``RandomState`` instance, random_state is the random number - generator; - - If ``None``, the random number generator is the ``RandomState`` - instance used by ``np.random``. - - Returns - ------- - generator : generator of tuple - Generate batch of data. The tuple generated are either (X_batch, - y_batch) or (X_batch, y_batch, sampler_weight_batch). - - steps_per_epoch : int - The number of samples per epoch. Required by ``fit_generator`` in - keras. - - Examples - -------- - >>> from sklearn.datasets import load_iris - >>> X, y = load_iris(return_X_y=True) - >>> from imblearn.datasets import make_imbalance - >>> class_dict = dict() - >>> class_dict[0] = 30; class_dict[1] = 50; class_dict[2] = 40 - >>> from imblearn.datasets import make_imbalance - >>> X, y = make_imbalance(X, y, class_dict) - >>> import keras - >>> y = keras.utils.to_categorical(y, 3) - >>> model = keras.models.Sequential() - >>> model.add(keras.layers.Dense(y.shape[1], input_dim=X.shape[1], - ... activation='softmax')) - >>> model.compile(optimizer='sgd', loss='categorical_crossentropy', - ... metrics=['accuracy']) - >>> from imblearn.keras import balanced_batch_generator - >>> from imblearn.under_sampling import NearMiss - >>> training_generator, steps_per_epoch = balanced_batch_generator( - ... X, y, sampler=NearMiss(), batch_size=10, random_state=42) - >>> callback_history = model.fit_generator(generator=training_generator, - ... steps_per_epoch=steps_per_epoch, - ... 
epochs=10, verbose=0) - - """ +from ._generator import balanced_batch_generator __all__ = ['BalancedBatchGenerator', 'balanced_batch_generator'] diff --git a/imblearn/keras/_generator.py b/imblearn/keras/_generator.py index 51609cd1f..e46df80bd 100644 --- a/imblearn/keras/_generator.py +++ b/imblearn/keras/_generator.py @@ -8,11 +8,12 @@ from sklearn.utils import check_random_state from sklearn.utils.testing import set_random_state +keras = pytest.importorskip("keras") + from ..under_sampling import RandomUnderSampler from ..utils import Substitution from ..utils._docstring import _random_state_docstring - -keras = pytest.importorskip("keras") +from ..tensorflow import balanced_batch_generator as keras_bbg @Substitution(random_state=_random_state_docstring) @@ -134,3 +135,73 @@ def __getitem__(self, index): self.indices_[index * self.batch_size: (index + 1) * self.batch_size]) ) + + +@Substitution(random_state=_random_state_docstring) +def balanced_batch_generator(X, y, sample_weight=None, sampler=None, + batch_size=32, random_state=None): + """Create a balanced batch generator to train keras model. + + Returns a generator --- as well as the number of step per epoch --- which + is given to ``fit_generator``. The sampler defines the sampling strategy + used to balance the dataset ahead of creating the batch. The sampler should + have an attribute ``return_indices``. + + Parameters + ---------- + X : ndarray, shape (n_samples, n_features) + Original imbalanced dataset. + + y : ndarray, shape (n_samples,) or (n_samples, n_classes) + Associated targets. + + sample_weight : ndarray, shape (n_samples,) + Sample weight. + + sampler : object or None, optional (default=None) + A sampler instance which has an attribute ``return_indices``. + + batch_size : int, optional (default=32) + Number of samples per gradient update. + + {random_state} + + Returns + ------- + generator : generator of tuple + Generate batch of data. The tuple generated are either (X_batch, + y_batch) or (X_batch, y_batch, sampler_weight_batch). + + steps_per_epoch : int + The number of samples per epoch. Required by ``fit_generator`` in + keras. + + Examples + -------- + >>> from sklearn.datasets import load_iris + >>> X, y = load_iris(return_X_y=True) + >>> from imblearn.datasets import make_imbalance + >>> class_dict = dict() + >>> class_dict[0] = 30; class_dict[1] = 50; class_dict[2] = 40 + >>> from imblearn.datasets import make_imbalance + >>> X, y = make_imbalance(X, y, class_dict) + >>> import keras + >>> y = keras.utils.to_categorical(y, 3) + >>> model = keras.models.Sequential() + >>> model.add(keras.layers.Dense(y.shape[1], input_dim=X.shape[1], + ... activation='softmax')) + >>> model.compile(optimizer='sgd', loss='categorical_crossentropy', + ... metrics=['accuracy']) + >>> from imblearn.keras import balanced_batch_generator + >>> from imblearn.under_sampling import NearMiss + >>> training_generator, steps_per_epoch = balanced_batch_generator( + ... X, y, sampler=NearMiss(), batch_size=10, random_state=42) + >>> callback_history = model.fit_generator(generator=training_generator, + ... steps_per_epoch=steps_per_epoch, + ... 
epochs=10, verbose=0)
+
+    """
+
+    return keras_bbg(X=X, y=y, sample_weight=sample_weight,
+                     sampler=sampler, batch_size=batch_size,
+                     random_state=random_state)
diff --git a/imblearn/tensorflow/_generator.py b/imblearn/tensorflow/_generator.py
index e5ee1dca1..868dd1a89 100644
--- a/imblearn/tensorflow/_generator.py
+++ b/imblearn/tensorflow/_generator.py
@@ -108,7 +108,6 @@ def balanced_batch_generator(X, y, sample_weight=None, sampler=None,
 
     """
 
-
     random_state = check_random_state(random_state)
     if sampler is None:
         sampler_ = RandomUnderSampler(return_indices=True,
diff --git a/imblearn/utils/_docstring.py b/imblearn/utils/_docstring.py
index 56ae44106..f036f31da 100644
--- a/imblearn/utils/_docstring.py
+++ b/imblearn/utils/_docstring.py
@@ -25,7 +25,8 @@ def __call__(self, obj):
 
 _random_state_docstring = \
     """random_state : int, RandomState instance or None, optional (default=None)
-    Control the randomization of the algorithm
+    Control the randomization of the algorithm.
+
     - If int, ``random_state`` is the seed used by the random number
       generator;
     - If ``RandomState`` instance, random_state is the random number

From 155fe0f8b0407636da50463ab33326f9c6859ff8 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Mon, 14 May 2018 23:52:36 +0200
Subject: [PATCH 29/50] DOC fix warning (#425)

---
 doc/whats_new.rst                      |  10 --
 doc/whats_new/v0.0.1.rst               |   4 +-
 doc/whats_new/v0.0.2.rst               | 152 +++++++++++++----
 doc/whats_new/v0.0.3.rst               |  69 ++++----
 doc/whats_new/v0.0.4.rst               |   2 +-
 examples/plot_sampling_target_usage.py | 220 -------------------------
 imblearn/metrics/classification.py     |   4 +-
 imblearn/utils/_docstring.py           |   1 +
 imblearn/utils/validation.py           |   2 +-
 9 files changed, 167 insertions(+), 297 deletions(-)
 delete mode 100644 examples/plot_sampling_target_usage.py

diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index 6a5912887..ba3d4d584 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -11,13 +11,3 @@ Release history
 .. include:: whats_new/v0.0.2.rst
 
 .. include:: whats_new/v0.0.1.rst
-
-.. _Guillaume Lemaitre: https://github.com/glemaitre
-.. _Christos Aridas: https://github.com/chkoar
-.. _Fernando Nogueira: https://github.com/fmfn
-.. _Dayvid Oliveira: https://github.com/dvro
-.. _Francois Magimel: https://github.com/Linkid
-.. _Aliaksei Halachkin: https://github.com/honeyext
-.. _Aleksandr Loskutov: https://github.com/loskutyan
-.. _Rafael Wampfler: https://github.com/Eichhof
-.. _Joan Massich: https://github.com/massich
diff --git a/doc/whats_new/v0.0.1.rst b/doc/whats_new/v0.0.1.rst
index f450c8d28..22da3a468 100644
--- a/doc/whats_new/v0.0.1.rst
+++ b/doc/whats_new/v0.0.1.rst
@@ -9,7 +9,9 @@ Changelog
 API
 ~~~
 
-- First release of the stable API. By `Fernando Nogueira`_, `Guillaume Lemaitre`_, `Christos Aridas`_, and `Dayvid Oliveira`_.
+- First release of the stable API. By :user:`Fernando Nogueira <fmfn>`,
+  :user:`Guillaume Lemaitre <glemaitre>`, :user:`Christos Aridas <chkoar>`,
+  and :user:`Dayvid Oliveira <dvro>`.
 
 New methods
 ~~~~~~~~~~~
diff --git a/doc/whats_new/v0.0.2.rst b/doc/whats_new/v0.0.2.rst
index cc3c7d1e3..60f64e92c 100644
--- a/doc/whats_new/v0.0.2.rst
+++ b/doc/whats_new/v0.0.2.rst
@@ -9,51 +9,141 @@ Changelog
 Bug fixes
 ~~~~~~~~~
 
-- Fixed a bug in :class:`under_sampling.NearMiss` which was not picking the right samples during under sampling for the method 3. By `Guillaume Lemaitre`_.
-- Fixed a bug in :class:`ensemble.EasyEnsemble`, correction of the `random_state` generation. By `Guillaume Lemaitre`_ and `Christos Aridas`_.
-- Fixed a bug in :class:`under_sampling.RepeatedEditedNearestNeighbours`, add additional stopping criterion to avoid that the minority class become a majority class or that a class disappear. By `Guillaume Lemaitre`_.
-- Fixed a bug in :class:`under_sampling.AllKNN`, add stopping criteria to avoid that the minority class become a majority class or that a class disappear. By `Guillaume Lemaitre`_.
-- Fixed a bug in :class:`under_sampling.CondensedNeareastNeigbour`, correction of the list of indices returned. By `Guillaume Lemaitre`_.
-- Fixed a bug in :class:`ensemble.BalanceCascade`, solve the issue to obtain a single array if desired. By `Guillaume Lemaitre`_.
-- Fixed a bug in :class:`pipeline.Pipeline`, solve to embed `Pipeline` in other `Pipeline`. :issue:`231` by `Christos Aridas`_ .
-- Fixed a bug in :class:`pipeline.Pipeline`, solve the issue to put to sampler in the same `Pipeline`. :issue:`188` by `Christos Aridas`_ .
-- Fixed a bug in :class:`under_sampling.CondensedNeareastNeigbour`, correction of the shape of `sel_x` when only one sample is selected. By `Aliaksei Halachkin`_.
-- Fixed a bug in :class:`under_sampling.NeighbourhoodCleaningRule`, selecting neighbours instead of minority class misclassified samples. :issue:`230` by `Aleksandr Loskutov`_.
-- Fixed a bug in :class:`over_sampling.ADASYN`, correction of the creation of a new sample so that the new sample lies between the minority sample and the nearest neighbour. :issue:`235` by `Rafael Wampfler`_.
+- Fixed a bug in :class:`under_sampling.NearMiss` which was not picking the
+  right samples during under sampling for the method 3. By :user:`Guillaume
+  Lemaitre <glemaitre>`.
+
+- Fixed a bug in :class:`ensemble.EasyEnsemble`, correction of the
+  `random_state` generation. By :user:`Guillaume Lemaitre <glemaitre>` and
+  :user:`Christos Aridas <chkoar>`.
+
+- Fixed a bug in :class:`under_sampling.RepeatedEditedNearestNeighbours`, add
+  additional stopping criterion to avoid that the minority class become a
+  majority class or that a class disappear. By :user:`Guillaume Lemaitre
+  <glemaitre>`.
+
+- Fixed a bug in :class:`under_sampling.AllKNN`, add stopping criteria to avoid
+  that the minority class become a majority class or that a class disappear. By
+  :user:`Guillaume Lemaitre <glemaitre>`.
+
+- Fixed a bug in :class:`under_sampling.CondensedNeareastNeigbour`, correction
+  of the list of indices returned. By :user:`Guillaume Lemaitre <glemaitre>`.
+
+- Fixed a bug in :class:`ensemble.BalanceCascade`, solve the issue to obtain a
+  single array if desired. By :user:`Guillaume Lemaitre <glemaitre>`.
+
+- Fixed a bug in :class:`pipeline.Pipeline`, solve to embed `Pipeline` in other
+  `Pipeline`. :issue:`231` by :user:`Christos Aridas <chkoar>`.
+
+- Fixed a bug in :class:`pipeline.Pipeline`, solve the issue to put to sampler
+  in the same `Pipeline`. :issue:`188` by :user:`Christos Aridas <chkoar>`.
+
+- Fixed a bug in :class:`under_sampling.CondensedNeareastNeigbour`, correction
+  of the shape of `sel_x` when only one sample is selected. By
+  :user:`Aliaksei Halachkin <honeyext>`.
+
+- Fixed a bug in :class:`under_sampling.NeighbourhoodCleaningRule`, selecting
+  neighbours instead of minority class misclassified samples. :issue:`230` by
+  :user:`Aleksandr Loskutov <loskutyan>`.
+
+- Fixed a bug in :class:`over_sampling.ADASYN`, correction of the creation of a
+  new sample so that the new sample lies between the minority sample and the
+  nearest neighbour. :issue:`235` by :user:`Rafael Wampfler <Eichhof>`.
 
 New features
 ~~~~~~~~~~~~
 
-- Added AllKNN under sampling technique. By `Dayvid Oliveira`_.
-- Added a module `metrics` implementing some specific scoring function for the problem of balancing. :issue:`204` by `Guillaume Lemaitre`_ and `Christos Aridas`_.
+- Added AllKNN under sampling technique. By :user:`Dayvid Oliveira <dvro>`.
+
+- Added a module `metrics` implementing some specific scoring function for the
+  problem of balancing. :issue:`204` by :user:`Guillaume Lemaitre <glemaitre>`
+  and :user:`Christos Aridas <chkoar>`.
 
 Enhancement
 ~~~~~~~~~~~
 
-- Added support for bumpversion. By `Guillaume Lemaitre`_.
-- Validate the type of target in binary samplers. A warning is raised for the moment. By `Guillaume Lemaitre`_ and `Christos Aridas`_.
+- Added support for bumpversion. By :user:`Guillaume Lemaitre <glemaitre>`.
+
+- Validate the type of target in binary samplers. A warning is raised for the
+  moment. By :user:`Guillaume Lemaitre <glemaitre>` and :user:`Christos Aridas
+  <chkoar>`.
+
 - Change from `cross_validation` module to `model_selection` module for
-  `sklearn` deprecation cycle. By `Dayvid Oliveira`_ and `Christos Aridas`_.
+  `sklearn` deprecation cycle. By :user:`Dayvid Oliveira <dvro>` and
+  :user:`Christos Aridas <chkoar>`.
 
 API changes summary
 ~~~~~~~~~~~~~~~~~~~
 
-- `size_ngh` has been deprecated in :class:`combine.SMOTEENN`. Use `n_neighbors` instead. By `Guillaume Lemaitre`_, `Christos Aridas`_, and `Dayvid Oliveira` .
-- `size_ngh` has been deprecated in :class:`under_sampling.EditedNearestNeighbors`. Use `n_neighbors` instead. By `Guillaume Lemaitre`_, `Christos Aridas`_, and `Dayvid Oliveira`_.
-- `size_ngh` has been deprecated in :class:`under_sampling.CondensedNeareastNeigbour`. Use `n_neighbors` instead. By `Guillaume Lemaitre`_, `Christos Aridas`_, and `Dayvid Oliveira`_.
-- `size_ngh` has been deprecated in :class:`under_sampling.OneSidedSelection`. Use `n_neighbors` instead. By `Guillaume Lemaitre`_, `Christos Aridas`_, and `Dayvid Oliveira`_.
-- `size_ngh` has been deprecated in :class:`under_sampling.NeighbourhoodCleaningRule`. Use `n_neighbors` instead. By `Guillaume Lemaitre`_, `Christos Aridas`_, and `Dayvid Oliveira`_.
-- `size_ngh` has been deprecated in :class:`under_sampling.RepeatedEditedNearestNeighbours`. Use `n_neighbors` instead. By `Guillaume Lemaitre`_, `Christos Aridas`_, and `Dayvid Oliveira`_.
-- `size_ngh` has been deprecated in :class:`under_sampling.AllKNN`. Use `n_neighbors` instead. By `Guillaume Lemaitre`_, `Christos Aridas`_, and `Dayvid Oliveira`_.
-- Two base classes :class:`BaseBinaryclassSampler` and :class:`BaseMulticlassSampler` have been created to handle the target type and raise warning in case of abnormality. By `Guillaume Lemaitre`_ and `Christos Aridas`_.
-- Move `random_state` to be assigned in the :class:`SamplerMixin` initialization. By `Guillaume Lemaitre`_.
-- Provide estimators instead of parameters in :class:`combine.SMOTEENN` and :class:`combine.SMOTETomek`. Therefore, the list of parameters have been deprecated. By `Guillaume Lemaitre`_ and `Christos Aridas`_.
-- `k` has been deprecated in :class:`over_sampling.ADASYN`. Use `n_neighbors` instead. :issue:`183` by `Guillaume Lemaitre`_.
-- `k` and `m` have been deprecated in :class:`over_sampling.SMOTE`. Use `k_neighbors` and `m_neighbors` instead. :issue:`182` by `Guillaume Lemaitre`_.
-- `n_neighbors` accept `KNeighborsMixin` based object for :class:`under_sampling.EditedNearestNeighbors`, :class:`under_sampling.CondensedNeareastNeigbour`, :class:`under_sampling.NeighbourhoodCleaningRule`, :class:`under_sampling.RepeatedEditedNearestNeighbours`, and :class:`under_sampling.AllKNN`. :issue:`109` by `Guillaume Lemaitre`_.
+- `size_ngh` has been deprecated in :class:`combine.SMOTEENN`. Use
+  `n_neighbors` instead. By :user:`Guillaume Lemaitre <glemaitre>`,
+  :user:`Christos Aridas <chkoar>`, and :user:`Dayvid Oliveira <dvro>`.
+
+- `size_ngh` has been deprecated in
+  :class:`under_sampling.EditedNearestNeighbors`. Use `n_neighbors` instead. By
+  :user:`Guillaume Lemaitre <glemaitre>`, :user:`Christos Aridas <chkoar>`,
+  and :user:`Dayvid Oliveira <dvro>`.
+
+- `size_ngh` has been deprecated in
+  :class:`under_sampling.CondensedNeareastNeigbour`. Use `n_neighbors`
+  instead. By :user:`Guillaume Lemaitre <glemaitre>`,
+  :user:`Christos Aridas <chkoar>`, and
+  :user:`Dayvid Oliveira <dvro>`.
+
+- `size_ngh` has been deprecated in
+  :class:`under_sampling.OneSidedSelection`. Use `n_neighbors` instead. By
+  :user:`Guillaume Lemaitre <glemaitre>`, :user:`Christos Aridas <chkoar>`,
+  and :user:`Dayvid Oliveira <dvro>`.
+
+- `size_ngh` has been deprecated in
+  :class:`under_sampling.NeighbourhoodCleaningRule`. Use `n_neighbors`
+  instead. By :user:`Guillaume Lemaitre <glemaitre>`,
+  :user:`Christos Aridas <chkoar>`, and
+  :user:`Dayvid Oliveira <dvro>`.
+
+- `size_ngh` has been deprecated in
+  :class:`under_sampling.RepeatedEditedNearestNeighbours`. Use `n_neighbors`
+  instead. By :user:`Guillaume Lemaitre <glemaitre>`,
+  :user:`Christos Aridas <chkoar>`, and
+  :user:`Dayvid Oliveira <dvro>`.
+
+- `size_ngh` has been deprecated in :class:`under_sampling.AllKNN`. Use
+  `n_neighbors` instead. By :user:`Guillaume Lemaitre <glemaitre>`,
+  :user:`Christos Aridas <chkoar>`, and :user:`Dayvid Oliveira <dvro>`.
+
+- Two base classes :class:`BaseBinaryclassSampler` and
+  :class:`BaseMulticlassSampler` have been created to handle the target type
+  and raise warning in case of abnormality.
+  By :user:`Guillaume Lemaitre <glemaitre>` and :user:`Christos Aridas <chkoar>`.
+
+- Move `random_state` to be assigned in the :class:`SamplerMixin`
+  initialization. By :user:`Guillaume Lemaitre <glemaitre>`.
+
+- Provide estimators instead of parameters in :class:`combine.SMOTEENN` and
+  :class:`combine.SMOTETomek`. Therefore, the list of parameters have been
+  deprecated. By :user:`Guillaume Lemaitre <glemaitre>` and
+  :user:`Christos Aridas <chkoar>`.
+
+- `k` has been deprecated in :class:`over_sampling.ADASYN`. Use `n_neighbors`
+  instead. :issue:`183` by :user:`Guillaume Lemaitre <glemaitre>`.
+
+- `k` and `m` have been deprecated in :class:`over_sampling.SMOTE`. Use
+  `k_neighbors` and `m_neighbors` instead. :issue:`182` by :user:`Guillaume
+  Lemaitre <glemaitre>`.
+
+- `n_neighbors` accept `KNeighborsMixin` based object for
+  :class:`under_sampling.EditedNearestNeighbors`,
+  :class:`under_sampling.CondensedNeareastNeigbour`,
+  :class:`under_sampling.NeighbourhoodCleaningRule`,
+  :class:`under_sampling.RepeatedEditedNearestNeighbours`, and
+  :class:`under_sampling.AllKNN`. :issue:`109` by :user:`Guillaume Lemaitre
+  <glemaitre>`.
 
 Documentation changes
 ~~~~~~~~~~~~~~~~~~~~~
 
-- Replace some remaining `UnbalancedDataset` occurences. By `Francois Magimel`_.
-- Added doctest in the documentation. By `Guillaume Lemaitre`_.
+- Replace some remaining `UnbalancedDataset` occurrences.
+  By :user:`Francois Magimel <Linkid>`.
+
+- Added doctest in the documentation. By :user:`Guillaume Lemaitre
+  <glemaitre>`.
diff --git a/doc/whats_new/v0.0.3.rst b/doc/whats_new/v0.0.3.rst
index c9b69bc28..9232db6a7 100644
--- a/doc/whats_new/v0.0.3.rst
+++ b/doc/whats_new/v0.0.3.rst
@@ -8,85 +8,92 @@ Changelog
 Testing
 ~~~~~~~
 
-- Pytest is used instead of nosetests. :issue:`321` by `Joan Massich`_.
+- Pytest is used instead of nosetests. :issue:`321` by :user:`Joan Massich
+  <massich>`.
 
 Documentation
 ~~~~~~~~~~~~~
 
-- Added a User Guide and extended some examples. :issue:`295` by `Guillaume Lemaitre`_.
+- Added a User Guide and extended some examples. :issue:`295` by
+  :user:`Guillaume Lemaitre <glemaitre>`.
 
 Bug fixes
 ~~~~~~~~~
 
 - Fixed a bug in :func:`utils.check_ratio` such that an error is raised when
-  the number of samples required is negative. :issue:`312` by `Guillaume Lemaitre`_.
+  the number of samples required is negative. :issue:`312` by :user:`Guillaume
+  Lemaitre <glemaitre>`.
 
-- Fixed a bug in :class:`under_sampling.NearMiss` version 3. The
-  indices returned were wrong. :issue:`312` by `Guillaume Lemaitre`_.
+- Fixed a bug in :class:`under_sampling.NearMiss` version 3. The indices
+  returned were wrong. :issue:`312` by :user:`Guillaume Lemaitre <glemaitre>`.
 
 - Fixed bug for :class:`ensemble.BalanceCascade` and :class:`combine.SMOTEENN`
-  and :class:`SMOTETomek`. :issue:`295` by `Guillaume Lemaitre`_.`
+  and :class:`SMOTETomek`. :issue:`295` by :user:`Guillaume Lemaitre
+  <glemaitre>`.
 
 - Fixed bug for `check_ratio` to be able to pass arguments when `ratio` is a
-  callable. :issue:`307` by `Guillaume Lemaitre`_.`
+  callable. :issue:`307` by :user:`Guillaume Lemaitre <glemaitre>`.
 
 New features
 ~~~~~~~~~~~~
 
 - Turn off steps in :class:`pipeline.Pipeline` using the `None`
-  object. By `Christos Aridas`_.
+  object. By :user:`Christos Aridas <chkoar>`.
 
 - Add a fetching function :func:`datasets.fetch_datasets` in order to get some
-  imbalanced datasets useful for benchmarking. :issue:`249` by `Guillaume Lemaitre`_.
+  imbalanced datasets useful for benchmarking. :issue:`249` by :user:`Guillaume
+  Lemaitre <glemaitre>`.
 
 Enhancement
 ~~~~~~~~~~~
 
-- All samplers accepts sparse matrices with defaulting on CSR type. :issue:`316` by
-  `Guillaume Lemaitre`_.
+- All samplers accept sparse matrices, defaulting to CSR
+  type. :issue:`316` by :user:`Guillaume Lemaitre <glemaitre>`.
 
 - :func:`datasets.make_imbalance` takes a ratio similarly to other samplers. It
-  supports multiclass. :issue:`312` by `Guillaume Lemaitre`_.
+  supports multiclass. :issue:`312` by :user:`Guillaume Lemaitre <glemaitre>`.
 
 - All the unit tests have been factorized and a :func:`utils.check_estimators`
-  has been derived from scikit-learn. By `Guillaume Lemaitre`_.
+  has been derived from scikit-learn. By :user:`Guillaume Lemaitre
+  <glemaitre>`.
 
 - Script for automatic build of conda packages and uploading. :issue:`242` by
-  `Guillaume Lemaitre`_
+  :user:`Guillaume Lemaitre <glemaitre>`
 
-- Remove seaborn dependence and improve the examples. :issue:`264` by `Guillaume
-  Lemaitre`_.
+- Remove seaborn dependence and improve the examples. :issue:`264` by
+  :user:`Guillaume Lemaitre <glemaitre>`.
 
-- adapt all classes to multi-class resampling. :issue:`290` by `Guillaume Lemaitre`_
+- adapt all classes to multi-class resampling. :issue:`290` by :user:`Guillaume
+  Lemaitre <glemaitre>`
 
 API changes summary
 ~~~~~~~~~~~~~~~~~~~
 
-- `__init__` has been removed from the :class:`base.SamplerMixin` to
-  create a real mixin class. :issue:`242` by `Guillaume Lemaitre`_.
+- `__init__` has been removed from the :class:`base.SamplerMixin` to create a
+  real mixin class. :issue:`242` by :user:`Guillaume Lemaitre <glemaitre>`.
 
 - creation of a module :mod:`exceptions` to handle consistant raising of
-  errors. :issue:`242` by `Guillaume Lemaitre`_.
+  errors. :issue:`242` by :user:`Guillaume Lemaitre <glemaitre>`.
 
-- creation of a module ``utils.validation`` to make checking of
-  recurrent patterns. :issue:`242` by `Guillaume Lemaitre`_.
+- creation of a module ``utils.validation`` to make checking of recurrent
+  patterns. :issue:`242` by :user:`Guillaume Lemaitre <glemaitre>`.
 
 - move the under-sampling methods in ``prototype_selection`` and
-  ``prototype_generation`` submodule to make a clearer dinstinction. 
:issue:`277` by
-  `Guillaume Lemaitre`_.
+  ``prototype_generation`` submodule to make a clearer
+  distinction. :issue:`277` by :user:`Guillaume Lemaitre <glemaitre>`.
 
-- change ``ratio`` such that it can adapt to multiple class problems. :issue:`290` by
-  `Guillaume Lemaitre`_.
+- change ``ratio`` such that it can adapt to multiple class
+  problems. :issue:`290` by :user:`Guillaume Lemaitre <glemaitre>`.
 
 Deprecation
 ~~~~~~~~~~~
 
-- Deprecation of the use of ``min_c_`` in :func:`datasets.make_imbalance`. :issue:`312` by
-  `Guillaume Lemaitre`_
+- Deprecation of the use of ``min_c_`` in
+  :func:`datasets.make_imbalance`. :issue:`312` by :user:`Guillaume Lemaitre
+  <glemaitre>`
 
 - Deprecation of the use of float in :func:`datasets.make_imbalance` for the
-  ratio parameter. :issue:`290` by `Guillaume Lemaitre`_.
+  ratio parameter. :issue:`290` by :user:`Guillaume Lemaitre <glemaitre>`.
 
 - deprecate the use of float as ratio in favor of dictionary, string, or
-  callable. :issue:`290` by `Guillaume Lemaitre`_.
-
+  callable. :issue:`290` by :user:`Guillaume Lemaitre <glemaitre>`.
diff --git a/doc/whats_new/v0.0.4.rst b/doc/whats_new/v0.0.4.rst
index 41a34338d..6546a57a0 100644
--- a/doc/whats_new/v0.0.4.rst
+++ b/doc/whats_new/v0.0.4.rst
@@ -1,4 +1,4 @@
-.. _changes_0_3:
+.. _changes_0_4:
 
 Version 0.4 (under development)
 ===============================
diff --git a/examples/plot_sampling_target_usage.py b/examples/plot_sampling_target_usage.py
deleted file mode 100644
index f4339572d..000000000
--- a/examples/plot_sampling_target_usage.py
+++ /dev/null
@@ -1,220 +0,0 @@
-"""
-======================================================================
-Usage of the ``sampling_strategy`` parameter for the different algorithms
-=======================================================================
-
-This example shows the different usage of the parameter ``sampling_strategy`` for
-the different family of samplers (i.e. over-sampling, under-sampling. or
-cleaning methods).
-
-"""
-
-# Authors: Guillaume Lemaitre
-# License: MIT
-
-from collections import Counter
-
-import numpy as np
-import matplotlib.pyplot as plt
-
-from sklearn.datasets import load_iris
-
-from imblearn.datasets import make_imbalance
-
-from imblearn.over_sampling import RandomOverSampler
-from imblearn.under_sampling import RandomUnderSampler
-from imblearn.under_sampling import TomekLinks
-
-print(__doc__)
-
-
-def plot_pie(y):
-    target_stats = Counter(y)
-    labels = list(target_stats.keys())
-    sizes = list(target_stats.values())
-    explode = tuple([0.1] * len(target_stats))
-
-    def make_autopct(values):
-        def my_autopct(pct):
-            total = sum(values)
-            val = int(round(pct * total / 100.0))
-            return '{p:.2f}% ({v:d})'.format(p=pct, v=val)
-        return my_autopct
-
-    fig, ax = plt.subplots()
-    ax.pie(sizes, explode=explode, labels=labels, shadow=True,
-           autopct=make_autopct(sizes))
-    ax.axis('equal')
-
-
-###############################################################################
-# First, we will create an imbalanced data set from a the iris data set. 
- -iris = load_iris() - -print('Information of the original iris data set: \n {}'.format( - Counter(iris.target))) -plot_pie(iris.target) - -sampling_strategy = {0: 10, 1: 20, 2: 47} -X, y = make_imbalance(iris.data, iris.target, sampling_strategy=sampling_strategy) - -print('Information of the iris data set after making it' - ' imbalanced using a dict: \n sampling_strategy={} \n y: {}' - .format(sampling_strategy, Counter(y))) -plot_pie(y) - -############################################################################### -# Using ``sampling_strategy`` in resampling algorithms -############################################################################### - -############################################################################### -# ``sampling_strategy`` as a ``float`` -# ................................... -# -# ``sampling_strategy`` can be given a ``float``. For **under-sampling -# methods**, it corresponds to the ratio :math:`\\alpha_{us}` defined by -# :math:`N_{rM} = \\alpha_{us} \\times N_{m}` where :math:`N_{rM}` and -# :math:`N_{m}` are the number of samples in the majority class after -# resampling and the number of samples in the minority class, respectively. - -# select only 2 classes since the ratio make sense in this case -binary_mask = np.bitwise_or(y == 0, y == 2) -binary_y = y[binary_mask] -binary_X = X[binary_mask] - -sampling_strategy = 0.8 - -rus = RandomUnderSampler(sampling_strategy=sampling_strategy) -X_res, y_res = rus.fit_sample(binary_X, binary_y) -print('Information of the iris data set after making it ' - 'balanced using a float and an under-sampling method: \n ' - 'sampling_strategy={} \n y: {}' - .format(sampling_strategy, Counter(y_res))) -plot_pie(y_res) - -############################################################################### -# For **over-sampling methods**, it correspond to the ratio -# :math:`\\alpha_{os}` defined by :math:`N_{rm} = \\alpha_{os} \\times N_{m}` -# where :math:`N_{rm}` and :math:`N_{M}` are the number of samples in the -# minority class after resampling and the number of samples in the majority -# class, respectively. - -ros = RandomOverSampler(sampling_strategy=sampling_strategy) -X_res, y_res = ros.fit_sample(binary_X, binary_y) -print('Information of the iris data set after making it ' - 'balanced using a float and an over-sampling method: \n ' - 'sampling_strategy={} \n y: {}' - .format(sampling_strategy, Counter(y_res))) -plot_pie(y_res) - -############################################################################### -# ``sampling_strategy`` has a ``str`` -# ................................. -# -# ``sampling_strategy`` can be given as a string which specify the class targeted -# by the resampling. With under- and over-sampling, the number of samples will -# be equalized. -# -# Note that we are using multiple classes from now on. 
- -sampling_strategy = 'not minority' - -rus = RandomUnderSampler(sampling_strategy=sampling_strategy) -X_res, y_res = rus.fit_sample(X, y) -print('Information of the iris data set after making it ' - 'balanced by under-sampling: \n sampling_strategy={} \n y: {}' - .format(sampling_strategy, Counter(y_res))) -plot_pie(y_res) - -sampling_strategy = 'not majority' - -ros = RandomOverSampler(sampling_strategy=sampling_strategy) -X_res, y_res = ros.fit_sample(X, y) -print('Information of the iris data set after making it ' - 'balanced by over-sampling: \n sampling_strategy={} \n y: {}' - .format(sampling_strategy, Counter(y_res))) -plot_pie(y_res) - -############################################################################### -# With **cleaning method**, the number of samples in each class will not be -# equalized even if targeted. - -sampling_strategy = 'not minority' -tl = TomekLinks(sampling_strategy) -X_res, y_res = tl.fit_sample(X, y) -print('Information of the iris data set after making it ' - 'balanced by cleaning sampling: \n sampling_strategy={} \n y: {}' - .format(sampling_strategy, Counter(y_res))) -plot_pie(y_res) - -############################################################################### -# ``sampling_strategy`` as a ``dict`` -# .................................. -# -# When ``sampling_strategy`` is a ``dict``, the keys correspond to the targeted -# classes. The values correspond to the desired number of samples for each -# targeted class. This is working for both **under- and over-sampling** -# algorithms but not for the **cleaning algorithms**. Use a ``list`` instead. - - -sampling_strategy = {0: 10, 1: 15, 2: 20} - -rus = RandomUnderSampler(sampling_strategy=sampling_strategy) -X_res, y_res = rus.fit_sample(X, y) -print('Information of the iris data set after making it ' - 'balanced by under-sampling: \n sampling_strategy={} \n y: {}' - .format(sampling_strategy, Counter(y_res))) -plot_pie(y_res) - -sampling_strategy = {0: 25, 1: 35, 2: 47} - -ros = RandomOverSampler(sampling_strategy=sampling_strategy) -X_res, y_res = ros.fit_sample(X, y) -print('Information of the iris data set after making it ' - 'balanced by over-sampling: \n sampling_strategy={} \n y: {}' - .format(sampling_strategy, Counter(y_res))) -plot_pie(y_res) - -############################################################################### -# ``sampling_strategy`` as a ``list`` -# .................................. -# -# When ``sampling_strategy`` is a ``list``, the list contains the targeted -# classes. It is used only for **cleaning methods** and raise an error -# otherwise. - -sampling_strategy = [0, 1, 2] -tl = TomekLinks(sampling_strategy=sampling_strategy) -X_res, y_res = tl.fit_sample(X, y) -print('Information of the iris data set after making it ' - 'balanced by cleaning sampling: \n sampling_strategy={} \n y: {}' - .format(sampling_strategy, Counter(y_res))) -plot_pie(y_res) - -############################################################################### -# ``sampling_strategy`` as a callable -# .................................. -# -# When callable, function taking ``y`` and returns a ``dict``. The keys -# correspond to the targeted classes. The values correspond to the desired -# number of samples for each class. 
- - -def ratio_multiplier(y): - multiplier = {1: 0.7, 2: 0.95} - target_stats = Counter(y) - for key, value in target_stats.items(): - if key in multiplier: - target_stats[key] = int(value * multiplier[key]) - return target_stats - - -X_res, y_res = (RandomUnderSampler(sampling_strategy=ratio_multiplier) - .fit_sample(X, y)) - -print('Information of the iris data set after balancing using a callable' - ' mode:\n ratio={} \n y: {}'.format(ratio_multiplier, Counter(y_res))) -plot_pie(y_res) - -plt.show() diff --git a/imblearn/metrics/classification.py b/imblearn/metrics/classification.py index 68c6e762b..c79739e94 100644 --- a/imblearn/metrics/classification.py +++ b/imblearn/metrics/classification.py @@ -473,7 +473,7 @@ def geometric_mean_score(y_true, average='multiclass', sample_weight=None, correction=0.0): - """Compute the geometric mean + """Compute the geometric mean. The geometric mean (G-mean) is the root of the product of class-wise sensitivity. This measure tries to maximize the accuracy on each of the @@ -515,7 +515,7 @@ class is unrecognized by the classifier, G-mean resolves to zero. To setting ``labels=[pos_label]`` and ``average != 'binary'`` will report scores for that label only. - average : str or None, optional (default=``'multiclass'``) + average : str or None, optional (default='multiclass') If ``None``, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: diff --git a/imblearn/utils/_docstring.py b/imblearn/utils/_docstring.py index 56ae44106..a47bef8af 100644 --- a/imblearn/utils/_docstring.py +++ b/imblearn/utils/_docstring.py @@ -26,6 +26,7 @@ def __call__(self, obj): _random_state_docstring = \ """random_state : int, RandomState instance or None, optional (default=None) Control the randomization of the algorithm + - If int, ``random_state`` is the seed used by the random number generator; - If ``RandomState`` instance, random_state is the random number diff --git a/imblearn/utils/validation.py b/imblearn/utils/validation.py index 7d4ad4495..311b50703 100644 --- a/imblearn/utils/validation.py +++ b/imblearn/utils/validation.py @@ -417,7 +417,7 @@ def check_sampling_strategy(sampling_strategy, y, sampling_type, **kwargs): ``list`` instead. - When ``list``, the list contains the targeted classes. It used only - for **cleaning methods``. + for **cleaning methods**. .. warning:: ``list`` is available for **cleaning methods**. An error is raised From 05ff979e9ef57aec07384557866fe2515552eab8 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 15 May 2018 00:41:46 +0200 Subject: [PATCH 30/50] DOC added tensorflow user guide --- doc/miscellaneous.rst | 79 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/doc/miscellaneous.rst b/doc/miscellaneous.rst index ef263a21b..2bbad5ca0 100644 --- a/doc/miscellaneous.rst +++ b/doc/miscellaneous.rst @@ -38,3 +38,82 @@ We illustrate the use of such sampler to implement an outlier rejection estimator which can be easily used within a :class:`imblearn.pipeline.Pipeline`: :ref:`sphx_glr_auto_examples_plot_outlier_rejections.py` + +.. _generators: + +Custom generators +----------------- + +Imbalanced-learn provides specific generators for TensorFlow and Keras which +will generate balanced mini-batches. + +.. 
_tensorflow_generator: + +TensorFlow generator +~~~~~~~~~~~~~~~~~~~~ + +The :func:`tensorflow.balanced_batch_generator` allow to generate balanced +mini-batches using an imbalanced-learn sampler which returns indices:: + + >>> X = X.astype(np.float32) + >>> from imblearn.under_sampling import RandomUnderSampler + >>> from imblearn.tensorflow import balanced_batch_generator + >>> training_generator, steps_per_epoch = balanced_batch_generator( + ... X, y, sample_weight=None, sampler=RandomUnderSampler(), + ... batch_size=10, random_state=42) + +The ``generator`` and ``steps_per_epoch`` can be used during the training of +the Tensorflow model. We will illustrate how to use this generator. First, we +can define a logistic regression model which will be optimized by a gradient +descent:: + + >>> learning_rate, epochs = 0.01, 10 + >>> input_size, output_size = X.shape[1], 3 + >>> import tensorflow as tf + >>> def init_weights(shape): + ... return tf.Variable(tf.random_normal(shape, stddev=0.01)) + >>> def accuracy(y_true, y_pred): + ... return np.mean(np.argmax(y_pred, axis=1) == y_true) + >>> # input and output + >>> data = tf.placeholder("float32", shape=[None, input_size]) + >>> targets = tf.placeholder("int32", shape=[None]) + >>> # build the model and weights + >>> W = init_weights([input_size, output_size]) + >>> b = init_weights([output_size]) + >>> out_act = tf.nn.sigmoid(tf.matmul(data, W) + b) + >>> # build the loss, predict, and train operator + >>> cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits( + ... logits=out_act, labels=targets) + >>> loss = tf.reduce_sum(cross_entropy) + >>> optimizer = tf.train.GradientDescentOptimizer(learning_rate) + >>> train_op = optimizer.minimize(loss) + >>> predict = tf.nn.softmax(out_act) + >>> # Initialization of all variables in the graph + >>> init = tf.global_variables_initializer() + +Once the model initialize, we train the model by iterating on balanced +mini-batches of data and minizing the loss previously defined:: + + >>> with tf.Session() as sess: + ... print('Starting training') + ... sess.run(init) + ... for e in range(epochs): + ... for i in range(steps_per_epoch): + ... X_batch, y_batch = next(training_generator) + ... feed_dict = dict() + ... feed_dict[data] = X_batch; feed_dict[targets] = y_batch + ... sess.run([train_op, loss], feed_dict=feed_dict) + ... # For each epoch, run accuracy on train and test + ... feed_dict = dict() + ... feed_dict[data] = X + ... predicts_train = sess.run(predict, feed_dict=feed_dict) + ... print("epoch: {} train accuracy: {:.3f}" + ... .format(e, accuracy(y, predicts_train))) + ... # doctest: +ELLIPSIS + Starting training + [... + +.. _keras_generator: + +Keras generator +~~~~~~~~~~~~~~~ From f291b68437a693d3e00cb0a91b502ae995abad6e Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 17 May 2018 23:45:13 +0200 Subject: [PATCH 31/50] DOC update the user guide --- doc/miscellaneous.rst | 56 +++++++++++++++++++++++++++++++++---------- 1 file changed, 44 insertions(+), 12 deletions(-) diff --git a/doc/miscellaneous.rst b/doc/miscellaneous.rst index 2bbad5ca0..5734f5c66 100644 --- a/doc/miscellaneous.rst +++ b/doc/miscellaneous.rst @@ -52,8 +52,8 @@ will generate balanced mini-batches. 
TensorFlow generator
~~~~~~~~~~~~~~~~~~~~
 
-The :func:`tensorflow.balanced_batch_generator` allow to generate balanced
-mini-batches using an imbalanced-learn sampler which returns indices::
+The :func:`imblearn.tensorflow.balanced_batch_generator` allows generating
+balanced mini-batches using an imbalanced-learn sampler which returns indices::
 
     >>> X = X.astype(np.float32)
     >>> from imblearn.under_sampling import RandomUnderSampler
@@ -62,9 +62,9 @@ mini-batches using an imbalanced-learn sampler which returns indices::
     >>> from imblearn.tensorflow import balanced_batch_generator
     >>> training_generator, steps_per_epoch = balanced_batch_generator(
     ...     X, y, sample_weight=None, sampler=RandomUnderSampler(),
     ...     batch_size=10, random_state=42)
 
-The ``generator`` and ``steps_per_epoch`` can be used during the training of
-the Tensorflow model. We will illustrate how to use this generator. First, we
-can define a logistic regression model which will be optimized by a gradient
+The ``generator`` and ``steps_per_epoch`` are used during the training of the
+TensorFlow model. We will illustrate how to use this generator. First, we can
+define a logistic regression model which will be optimized by a gradient
 descent::
 
     >>> learning_rate, epochs = 0.01, 10
@@ -91,8 +91,8 @@ descent::
     >>> # Initialization of all variables in the graph
     >>> init = tf.global_variables_initializer()
 
-Once the model initialize, we train the model by iterating on balanced
-mini-batches of data and minizing the loss previously defined::
+Once initialized, the model is trained by iterating on balanced mini-batches of
+data and minimizing the loss previously defined::
 
     >>> with tf.Session() as sess:
     ...     print('Starting training')
@@ -100,13 +100,10 @@ mini-batches of data and minizing the loss previously defined::
     ...     sess.run(init)
     ...     for e in range(epochs):
     ...         for i in range(steps_per_epoch):
     ...             X_batch, y_batch = next(training_generator)
-    ...             feed_dict = dict()
-    ...             feed_dict[data] = X_batch; feed_dict[targets] = y_batch
-    ...             sess.run([train_op, loss], feed_dict=feed_dict)
+    ...             sess.run([train_op, loss], feed_dict={data: X_batch, targets: y_batch})
     ...         # For each epoch, run accuracy on train and test
     ...         feed_dict = dict()
-    ...         feed_dict[data] = X
-    ...         predicts_train = sess.run(predict, feed_dict=feed_dict)
+    ...         predicts_train = sess.run(predict, feed_dict={data: X})
     ...         print("epoch: {} train accuracy: {:.3f}"
     ...               .format(e, accuracy(y, predicts_train)))
     ...  # doctest: +ELLIPSIS
@@ -117,3 +114,38 @@ mini-batches of data and minizing the loss previously defined::
 
 Keras generator
 ~~~~~~~~~~~~~~~
+
+Keras provides a higher-level API in which a model can be defined and trained
+by calling the ``fit_generator`` method. To illustrate, we will define a
+logistic regression model::
+
+    >>> import keras
+    >>> y = keras.utils.to_categorical(y, 3)
+    >>> model = keras.Sequential()
+    >>> model.add(keras.layers.Dense(y.shape[1], input_dim=X.shape[1],
+    ...                              activation='softmax'))
+    >>> model.compile(optimizer='sgd', loss='categorical_crossentropy',
+    ...               metrics=['accuracy'])
+
+:func:`imblearn.keras.balanced_batch_generator` creates a generator of
+balanced mini-batches, together with the number of mini-batches which will be
+generated at each epoch::
+
+    >>> from imblearn.keras import balanced_batch_generator
+    >>> training_generator, steps_per_epoch = balanced_batch_generator(
+    ...     X, y, sampler=RandomUnderSampler(), batch_size=10, random_state=42)
+
+Then, ``fit_generator`` can be called, passing the generator and the number of
+steps per epoch::
+
+    >>> callback_history = model.fit_generator(generator=training_generator,
+    ...                                        steps_per_epoch=steps_per_epoch,
+    ...
epochs=10, verbose=0)
+
+The second possibility is to use
+:class:`imblearn.keras.BalancedBatchGenerator`. In this case, only an instance
+of this class needs to be passed to ``fit_generator``::
+
+    >>> from imblearn.keras import BalancedBatchGenerator
+    >>> training_generator = BalancedBatchGenerator(
+    ...     X, y, sampler=RandomUnderSampler(), batch_size=10, random_state=42)
+    >>> callback_history = model.fit_generator(generator=training_generator,
+    ...                                        epochs=10, verbose=0)
From 5b95e2634eb9278efe1dd849ecd38154bff88e02 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre 
Date: Thu, 17 May 2018 23:45:39 +0200
Subject: [PATCH 32/50] FIX rename function

---
 imblearn/keras/_generator.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/imblearn/keras/_generator.py b/imblearn/keras/_generator.py
index e46df80bd..dec122943 100644
--- a/imblearn/keras/_generator.py
+++ b/imblearn/keras/_generator.py
@@ -13,7 +13,7 @@
 from ..under_sampling import RandomUnderSampler
 from ..utils import Substitution
 from ..utils._docstring import _random_state_docstring
-from ..tensorflow import balanced_batch_generator as keras_bbg
+from ..tensorflow import balanced_batch_generator as tf_bbg
 
 
 @Substitution(random_state=_random_state_docstring)
@@ -202,6 +202,6 @@ def balanced_batch_generator(X, y, sample_weight=None, sampler=None,
 
     """
 
-    return keras_bbg(X=X, y=y, sample_weight=sample_weight,
-                     sampler=sampler, batch_size=batch_size,
-                     random_state=random_state)
+    return tf_bbg(X=X, y=y, sample_weight=sample_weight,
+                  sampler=sampler, batch_size=batch_size,
+                  random_state=random_state)
From 1077e9c3241e6df08aa6ad002c6a80fd5b1a5fef Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre 
Date: Thu, 17 May 2018 23:53:14 +0200
Subject: [PATCH 33/50] MAINT add optional dependencies

---
 requirements.optional.txt | 2 ++
 1 file changed, 2 insertions(+)
 create mode 100644 requirements.optional.txt

diff --git a/requirements.optional.txt b/requirements.optional.txt
new file mode 100644
index 000000000..826277d5e
--- /dev/null
+++ b/requirements.optional.txt
@@ -0,0 +1,2 @@
+keras
+tensorflow
From f95753197c1ff39f76e5bc4a9bc2f80ccbb94619 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre 
Date: Fri, 18 May 2018 00:08:07 +0200
Subject: [PATCH 34/50] DOC fix python 2 doc override

---
 doc/whats_new/v0.0.4.rst     | 6 ++++++
 imblearn/keras/_generator.py | 1 -
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/doc/whats_new/v0.0.4.rst b/doc/whats_new/v0.0.4.rst
index 6546a57a0..005d5ea97 100644
--- a/doc/whats_new/v0.0.4.rst
+++ b/doc/whats_new/v0.0.4.rst
@@ -18,6 +18,12 @@ API
 - Enable to use a ``list`` for the cleaning methods to specify the class to
   sample. :issue:`411` by :user:`Guillaume Lemaitre `.
 
+New features
+............
+
+- Add ``keras`` and ``tensorflow`` modules to create balanced mini-batch
+  generators. :issue:`409` by :user:`Guillaume Lemaitre `.
+
 Enhancement
 ...........
 
diff --git a/imblearn/keras/_generator.py b/imblearn/keras/_generator.py
index dec122943..940ca0369 100644
--- a/imblearn/keras/_generator.py
+++ b/imblearn/keras/_generator.py
@@ -16,7 +16,6 @@
 from ..tensorflow import balanced_batch_generator as tf_bbg
 
 
-@Substitution(random_state=_random_state_docstring)
 class BalancedBatchGenerator(keras.utils.Sequence):
     """Create balanced batches when training a keras model.
From 00d2a05e0335c32a0038b542407d0d87390fb127 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre 
Date: Fri, 27 Jul 2018 22:35:01 +0200
Subject: [PATCH 35/50] iter

---
 imblearn/keras/_generator.py      | 2 +-
 imblearn/tensorflow/_generator.py | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/imblearn/keras/_generator.py b/imblearn/keras/_generator.py
index 940ca0369..8debae10b 100644
--- a/imblearn/keras/_generator.py
+++ b/imblearn/keras/_generator.py
@@ -66,8 +66,8 @@ class BalancedBatchGenerator(keras.utils.Sequence):
     >>> class_dict = dict()
     >>> class_dict[0] = 30; class_dict[1] = 50; class_dict[2] = 40
     >>> X, y = make_imbalance(iris.data, iris.target, class_dict)
-    >>> y = keras.utils.to_categorical(y, 3)
     >>> import keras
+    >>> y = keras.utils.to_categorical(y, 3)
     >>> model = keras.models.Sequential()
     >>> model.add(keras.layers.Dense(y.shape[1], input_dim=X.shape[1],
     ...                              activation='softmax'))
diff --git a/imblearn/tensorflow/_generator.py b/imblearn/tensorflow/_generator.py
index 868dd1a89..75e8fa57e 100644
--- a/imblearn/tensorflow/_generator.py
+++ b/imblearn/tensorflow/_generator.py
@@ -2,11 +2,15 @@
 
 from __future__ import division
 
+import pytest
+
 from sklearn.base import clone
 from sklearn.utils import safe_indexing
 from sklearn.utils import check_random_state
 from sklearn.utils.testing import set_random_state
 
+tf = pytest.importorskip("tensorflow")
+
 from ..under_sampling import RandomUnderSampler
 from ..utils import Substitution
 from ..utils._docstring import _random_state_docstring
From f677e6b3ecc877c594664edeb678035c85ce378d Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre 
Date: Sat, 28 Jul 2018 21:40:03 +0200
Subject: [PATCH 36/50] EHN add parameter to preserve sparsity

---
 imblearn/keras/_generator.py                | 60 +++++++++++++--------
 imblearn/keras/tests/test_generator.py      | 20 +++++++
 imblearn/tensorflow/_generator.py           | 36 +++++++++----
 imblearn/tensorflow/tests/test_generator.py | 14 +++++
 4 files changed, 99 insertions(+), 31 deletions(-)

diff --git a/imblearn/keras/_generator.py b/imblearn/keras/_generator.py
index 8debae10b..d634a2ba0 100644
--- a/imblearn/keras/_generator.py
+++ b/imblearn/keras/_generator.py
@@ -3,6 +3,8 @@
 
 import pytest
 
+from scipy.sparse import issparse
+
 from sklearn.base import clone
 from sklearn.utils import safe_indexing
 from sklearn.utils import check_random_state
@@ -41,6 +43,11 @@ class BalancedBatchGenerator(keras.utils.Sequence):
     batch_size : int, optional (default=32)
         Number of samples per gradient update.
 
+    sparse : bool, optional (default=False)
+        Whether or not to conserve the sparsity of the input (i.e. ``X``,
+        ``y``, ``sample_weight``). By default, the returned batches will be
+        dense.
+ random_state : int, RandomState instance or None, optional (default=None) Control the randomization of the algorithm - If int, ``random_state`` is the seed used by the random number @@ -82,12 +89,13 @@ class BalancedBatchGenerator(keras.utils.Sequence): """ def __init__(self, X, y, sample_weight=None, sampler=None, batch_size=32, - random_state=None): + sparse=False, random_state=None): self.X = X self.y = y self.sample_weight = sample_weight self.sampler = sampler self.batch_size = batch_size + self.sparse = sparse self.random_state = random_state self._sample() @@ -113,32 +121,35 @@ def __len__(self): return int(self.indices_.size // self.batch_size) def __getitem__(self, index): + X_resampled = safe_indexing( + self.X, self.indices_[index * self.batch_size: + (index + 1) * self.batch_size]) + if issparse(X_resampled) and not self.sparse: + X_resampled = X_resampled.toarray() + + y_resampled = safe_indexing( + self.y, self.indices_[index * self.batch_size: + (index + 1) * self.batch_size]) + if issparse(y_resampled) and not self.sparse: + y_resampled = y_resampled.toarray() + + if self.sample_weight is not None: + sample_weight_resampled = safe_indexing( + self.sample_weight, + self.indices_[index * self.batch_size: + (index + 1) * self.batch_size]) + if issparse(sample_weight_resampled) and not self.sparse: + sample_weight = sample_weight.toarray() + if self.sample_weight is None: - return ( - safe_indexing(self.X, - self.indices_[index * self.batch_size: - (index + 1) * self.batch_size]), - safe_indexing(self.y, - self.indices_[index * self.batch_size: - (index + 1) * self.batch_size]) - ) + return X_resampled, y_resampled else: - return ( - safe_indexing(self.X, - self.indices_[index * self.batch_size: - (index + 1) * self.batch_size]), - safe_indexing(self.y, - self.indices_[index * self.batch_size: - (index + 1) * self.batch_size]), - safe_indexing(self.sample_weight, - self.indices_[index * self.batch_size: - (index + 1) * self.batch_size]) - ) + return X_resampled, y_resampled, sample_weight_resampled @Substitution(random_state=_random_state_docstring) def balanced_batch_generator(X, y, sample_weight=None, sampler=None, - batch_size=32, random_state=None): + batch_size=32, sparse=False, random_state=None): """Create a balanced batch generator to train keras model. Returns a generator --- as well as the number of step per epoch --- which @@ -163,6 +174,11 @@ def balanced_batch_generator(X, y, sample_weight=None, sampler=None, batch_size : int, optional (default=32) Number of samples per gradient update. + sparse : bool, optional (default=False) + Either or not to conserve or not the sparsity of the input (i.e. ``X``, + ``y``, ``sample_weight``). By default, the returned batches will be + dense. 
+ {random_state} Returns @@ -203,4 +219,4 @@ def balanced_batch_generator(X, y, sample_weight=None, sampler=None, return tf_bbg(X=X, y=y, sample_weight=sample_weight, sampler=sampler, batch_size=batch_size, - random_state=random_state) + sparse=sparse, random_state=random_state) diff --git a/imblearn/keras/tests/test_generator.py b/imblearn/keras/tests/test_generator.py index d94dd73da..8f880029c 100644 --- a/imblearn/keras/tests/test_generator.py +++ b/imblearn/keras/tests/test_generator.py @@ -1,6 +1,7 @@ import pytest import numpy as np +from scipy import sparse from sklearn.datasets import load_iris @@ -51,6 +52,16 @@ def test_balanced_batch_generator_class(sampler, sample_weight): epochs=10) +def test_balanced_batch_generator_class_sparse(): + training_generator = BalancedBatchGenerator(sparse.csr_matrix(X), y, + batch_size=100, + sparse=True, + random_state=42) + for idx in range(len(training_generator)): + X_batch, y_batch = training_generator.__getitem__(idx) + assert sparse.issparse(X_batch) + + def test_balanced_batch_generator_function_no_return_indices(): with pytest.raises(ValueError, match='needs to return the indices'): balanced_batch_generator( @@ -71,3 +82,12 @@ def test_balanced_batch_generator_function(sampler, sample_weight): model.fit_generator(generator=training_generator, steps_per_epoch=steps_per_epoch, epochs=10) + + +def test_balanced_batch_generator_function_sparse(): + training_generator, steps_per_epoch = balanced_batch_generator( + sparse.csr_matrix(X), y, sparse=True, batch_size=10, + random_state=42) + for idx in range(steps_per_epoch): + X_batch, y_batch = next(training_generator) + assert sparse.issparse(X_batch) diff --git a/imblearn/tensorflow/_generator.py b/imblearn/tensorflow/_generator.py index 75e8fa57e..3f904cc90 100644 --- a/imblearn/tensorflow/_generator.py +++ b/imblearn/tensorflow/_generator.py @@ -4,6 +4,8 @@ import pytest +from scipy.sparse import issparse + from sklearn.base import clone from sklearn.utils import safe_indexing from sklearn.utils import check_random_state @@ -18,7 +20,7 @@ @Substitution(random_state=_random_state_docstring) def balanced_batch_generator(X, y, sample_weight=None, sampler=None, - batch_size=32, random_state=None): + batch_size=32, sparse=False, random_state=None): """Create a balanced batch generator to train keras model. Returns a generator --- as well as the number of step per epoch --- which @@ -43,6 +45,11 @@ def balanced_batch_generator(X, y, sample_weight=None, sampler=None, batch_size : int, optional (default=32) Number of samples per gradient update. + sparse : bool, optional (default=False) + Either or not to conserve or not the sparsity of the input (i.e. ``X``, + ``y``, ``sample_weight``). By default, the returned batches will be + dense. + {random_state} Returns @@ -52,8 +59,7 @@ def balanced_batch_generator(X, y, sample_weight=None, sampler=None, y_batch) or (X_batch, y_batch, sampler_weight_batch). steps_per_epoch : int - The number of samples per epoch. Required by ``fit_generator`` in - keras. + The number of samples per epoch. 
Examples -------- @@ -133,15 +139,27 @@ def generator(X, y, sample_weight, indices, batch_size): if sample_weight is None: while True: for index in range(0, len(indices), batch_size): - yield (safe_indexing(X, indices[index:index + batch_size]), - safe_indexing(y, indices[index:index + batch_size])) + X_res = safe_indexing(X, indices[index:index + batch_size]) + y_res = safe_indexing(y, indices[index:index + batch_size]) + if issparse(X_res) and not sparse: + X_res = X_res.toarray() + if issparse(y_res) and not sparse: + y_res = y_res.toarray() + yield X_res, y_res else: while True: for index in range(0, len(indices), batch_size): - yield (safe_indexing(X, indices[index:index + batch_size]), - safe_indexing(y, indices[index:index + batch_size]), - safe_indexing(sample_weight, - indices[index:index + batch_size])) + X_res = safe_indexing(X, indices[index:index + batch_size]) + y_res = safe_indexing(y, indices[index:index + batch_size]) + sw_res = safe_indexing(sample_weight, + indices[index:index + batch_size]) + if issparse(X_res) and not sparse: + X_res = X_res.toarray() + if issparse(y_res) and not sparse: + y_res = y_res.toarray() + if issparse(sw_res) and not sparse: + sw_res = sw_res.toarray() + yield X_res, y_res, sw_res return (generator(X, y, sample_weight, indices, batch_size), int(indices.size // batch_size)) diff --git a/imblearn/tensorflow/tests/test_generator.py b/imblearn/tensorflow/tests/test_generator.py index 2045690e4..22ca16500 100644 --- a/imblearn/tensorflow/tests/test_generator.py +++ b/imblearn/tensorflow/tests/test_generator.py @@ -2,6 +2,7 @@ import pytest import numpy as np +from scipy import sparse from sklearn.datasets import load_iris @@ -72,3 +73,16 @@ def accuracy(y_true, y_pred): predicts_train = sess.run(predict, feed_dict={data: X}) print("epoch: {} train accuracy: {:.3f}" .format(e, accuracy(y, predicts_train))) + + +def test_balanced_batch_generator_function_sparse(): + X, y = load_iris(return_X_y=True) + X, y = make_imbalance(X, y, {0: 30, 1: 50, 2: 40}) + X = X.astype(np.float32) + + training_generator, steps_per_epoch = balanced_batch_generator( + sparse.csr_matrix(X), y, sparse=True, batch_size=10, + random_state=42) + for idx in range(steps_per_epoch): + X_batch, y_batch = next(training_generator) + assert sparse.issparse(X_batch) From ccef644af7eb73adfc66271ad5e56f1abdbaca50 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 31 Jul 2018 05:09:14 +0200 Subject: [PATCH 37/50] DOC mention default sampler --- imblearn/keras/_generator.py | 8 ++++++-- imblearn/tensorflow/_generator.py | 4 +++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/imblearn/keras/_generator.py b/imblearn/keras/_generator.py index d634a2ba0..dd0f92ed7 100644 --- a/imblearn/keras/_generator.py +++ b/imblearn/keras/_generator.py @@ -37,8 +37,10 @@ class BalancedBatchGenerator(keras.utils.Sequence): sample_weight : ndarray, shape (n_samples,) Sample weight. - sampler : object or None, optional (default=None) + sampler : object or None, optional (default=RandomUnderSampler) A sampler instance which has an attribute ``return_indices``. + By default, the sampler used is a + :class:`imblearn.under_sampling.RandomUnderSampler`. batch_size : int, optional (default=32) Number of samples per gradient update. @@ -168,8 +170,10 @@ def balanced_batch_generator(X, y, sample_weight=None, sampler=None, sample_weight : ndarray, shape (n_samples,) Sample weight. 
- sampler : object or None, optional (default=None) + sampler : object or None, optional (default=RandomUnderSampler) A sampler instance which has an attribute ``return_indices``. + By default, the sampler used is a + :class:`imblearn.under_sampling.RandomUnderSampler`. batch_size : int, optional (default=32) Number of samples per gradient update. diff --git a/imblearn/tensorflow/_generator.py b/imblearn/tensorflow/_generator.py index 3f904cc90..262ece894 100644 --- a/imblearn/tensorflow/_generator.py +++ b/imblearn/tensorflow/_generator.py @@ -39,8 +39,10 @@ def balanced_batch_generator(X, y, sample_weight=None, sampler=None, sample_weight : ndarray, shape (n_samples,) Sample weight. - sampler : object or None, optional (default=None) + sampler : object or None, optional (default=RandomUnderSampler) A sampler instance which has an attribute ``return_indices``. + By default, the sampler used is a + :class:`imblearn.under_sampling.RandomUnderSampler`. batch_size : int, optional (default=32) Number of samples per gradient update. From 032c79101b3efef9c6b7ab3b1cd9b8f43edebeaf Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 21 Aug 2018 16:19:39 +0200 Subject: [PATCH 38/50] use doctest ignore import errors --- imblearn/keras/_generator.py | 6 ++---- imblearn/tensorflow/_generator.py | 4 ---- setup.cfg | 2 +- 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/imblearn/keras/_generator.py b/imblearn/keras/_generator.py index dd0f92ed7..98edcf7ef 100644 --- a/imblearn/keras/_generator.py +++ b/imblearn/keras/_generator.py @@ -1,7 +1,7 @@ """Implement generators for ``keras`` which will balance the data.""" from __future__ import division -import pytest +import keras from scipy.sparse import issparse @@ -10,8 +10,6 @@ from sklearn.utils import check_random_state from sklearn.utils.testing import set_random_state -keras = pytest.importorskip("keras") - from ..under_sampling import RandomUnderSampler from ..utils import Substitution from ..utils._docstring import _random_state_docstring @@ -141,7 +139,7 @@ def __getitem__(self, index): self.indices_[index * self.batch_size: (index + 1) * self.batch_size]) if issparse(sample_weight_resampled) and not self.sparse: - sample_weight = sample_weight.toarray() + sample_weight_resampled = sample_weight_resampled.toarray() if self.sample_weight is None: return X_resampled, y_resampled diff --git a/imblearn/tensorflow/_generator.py b/imblearn/tensorflow/_generator.py index 262ece894..c1da5ae58 100644 --- a/imblearn/tensorflow/_generator.py +++ b/imblearn/tensorflow/_generator.py @@ -2,8 +2,6 @@ from __future__ import division -import pytest - from scipy.sparse import issparse from sklearn.base import clone @@ -11,8 +9,6 @@ from sklearn.utils import check_random_state from sklearn.utils.testing import set_random_state -tf = pytest.importorskip("tensorflow") - from ..under_sampling import RandomUnderSampler from ..utils import Substitution from ..utils._docstring import _random_state_docstring diff --git a/setup.cfg b/setup.cfg index 56cfb932a..b39529093 100644 --- a/setup.cfg +++ b/setup.cfg @@ -35,4 +35,4 @@ doctest-fixtures = _fixture [tool:pytest] addopts = --doctest-modules - + --doctest-ignore-import-errors From c056567da212fba639ce7b3f80dccf92fbd11e69 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 21 Aug 2018 17:32:27 +0200 Subject: [PATCH 39/50] iter --- build_tools/travis/install.sh | 7 +++++-- imblearn/keras/_generator.py | 12 ++++++++++-- imblearn/keras/tests/test_generator.py | 10 +++++----- 3 files changed, 20 
insertions(+), 9 deletions(-)

diff --git a/build_tools/travis/install.sh b/build_tools/travis/install.sh
index e094117cc..be06095db 100755
--- a/build_tools/travis/install.sh
+++ b/build_tools/travis/install.sh
@@ -39,7 +39,10 @@ if [[ "$DISTRIB" == "conda" ]]; then
     conda create -n testenv --yes python=$PYTHON_VERSION pip
     source activate testenv
     conda install --yes numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION
-    conda install --yes pandas keras
+    # only install optional dependency in python 3.6
+    if [[ $PYTHON_VERSION == "3.6" ]]; then
+        conda install --yes pandas keras
+    fi
 
     if [[ "$SKLEARN_VERSION" == "master" ]]; then
         conda install --yes cython
@@ -70,7 +73,7 @@ python --version
 python -c "import numpy; print('numpy %s' % numpy.__version__)"
 python -c "import scipy; print('scipy %s' % scipy.__version__)"
 
-python setup.py develop
+pip install -e .
 
 ccache --show-stats # Useful for debugging how ccache is used
 # cat $CCACHE_LOGFILE
diff --git a/imblearn/keras/_generator.py b/imblearn/keras/_generator.py
index 98edcf7ef..251e1eb59 100644
--- a/imblearn/keras/_generator.py
+++ b/imblearn/keras/_generator.py
@@ -1,7 +1,13 @@
 """Implement generators for ``keras`` which will balance the data."""
 from __future__ import division
 
-import keras
+try:
+    import keras
+    ParentClass = keras.utils.Sequence
+    HAS_KERAS = True
+except ImportError:
+    ParentClass = object
+    HAS_KERAS = False
 
 from scipy.sparse import issparse
 
@@ -16,7 +22,7 @@
 from ..tensorflow import balanced_batch_generator as tf_bbg
 
 
-class BalancedBatchGenerator(keras.utils.Sequence):
+class BalancedBatchGenerator(ParentClass):
     """Create balanced batches when training a keras model.
 
     Create a keras ``Sequence`` which is given to ``fit_generator``. The
@@ -90,6 +96,8 @@ class BalancedBatchGenerator(keras.utils.Sequence):
     """
     def __init__(self, X, y, sample_weight=None, sampler=None, batch_size=32,
                  sparse=False, random_state=None):
+        if not HAS_KERAS:
+            raise ImportError("No module named 'keras'")
         self.X = X
         self.y = y
         self.sample_weight = sample_weight
diff --git a/imblearn/keras/tests/test_generator.py b/imblearn/keras/tests/test_generator.py
index 8f880029c..00c138d30 100644
--- a/imblearn/keras/tests/test_generator.py
+++ b/imblearn/keras/tests/test_generator.py
@@ -5,6 +5,11 @@
 
 from sklearn.datasets import load_iris
 
+keras = pytest.importorskip('keras')
+from keras.models import Sequential
+from keras.layers import Dense
+from keras.utils import to_categorical
+
 from imblearn.datasets import make_imbalance
 from imblearn.under_sampling import ClusterCentroids
 from imblearn.under_sampling import NearMiss
@@ -12,11 +17,6 @@
 from imblearn.keras import BalancedBatchGenerator
 from imblearn.keras import balanced_batch_generator
 
-keras = pytest.importorskip('keras')
-from keras.models import Sequential
-from keras.layers import Dense
-from keras.utils import to_categorical
-
 iris = load_iris()
 X, y = make_imbalance(iris.data, iris.target, {0: 30, 1: 50, 2: 40})
 y = to_categorical(y, 3)
From 83b5fea09946574459736b8341c99f0341a9310a Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre 
Date: Tue, 21 Aug 2018 17:58:18 +0200
Subject: [PATCH 40/50] iter

---
 conftest.py                  | 15 +++++++++++++++
 imblearn/keras/_generator.py |  3 +++
 setup.cfg                    |  4 +---
 3 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/conftest.py b/conftest.py
index 31da29707..86ec47899 100644
--- a/conftest.py
+++ b/conftest.py
@@ -7,9 +7,24 @@
 # Set numpy array str/repr to legacy behaviour on numpy > 1.13 to make
 # the doctests pass
+import pytest
 import numpy as np
 
 try:
np.set_printoptions(legacy='1.13')
 except TypeError:
     pass
+
+
+def pytest_runtest_setup(item):
+    fname = item.fspath.strpath
+    if fname.endswith('keras/_generator.py'):
+        try:
+            import keras
+        except ImportError:
+            pytest.skip('The keras package is not installed.')
+    elif fname.endswith('tensorflow/_generator.py'):
+        try:
+            import tensorflow
+        except ImportError:
+            pytest.skip('The tensorflow package is not installed.')
diff --git a/imblearn/keras/_generator.py b/imblearn/keras/_generator.py
index 251e1eb59..c4a40fa3f 100644
--- a/imblearn/keras/_generator.py
+++ b/imblearn/keras/_generator.py
@@ -1,6 +1,9 @@
 """Implement generators for ``keras`` which will balance the data."""
 from __future__ import division
 
+# This is a trick to avoid an error during tests collection with pytest. We
+# avoid the error when importing the package and raise it at the moment of
+# creating the instance.
 try:
     import keras
     ParentClass = keras.utils.Sequence
diff --git a/setup.cfg b/setup.cfg
index b39529093..50f9c583a 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -33,6 +33,4 @@ doctest-extension = rst
 doctest-fixtures = _fixture
 
 [tool:pytest]
-addopts =
-    --doctest-modules
-    --doctest-ignore-import-errors
+addopts = --doctest-modules
From c7b4a0a00c4efd20f123f9b739929301b6aac1dd Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre 
Date: Tue, 21 Aug 2018 18:10:01 +0200
Subject: [PATCH 41/50] iter

---
 conftest.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/conftest.py b/conftest.py
index 86ec47899..455b0f00a 100644
--- a/conftest.py
+++ b/conftest.py
@@ -18,12 +18,14 @@ def pytest_runtest_setup(item):
     fname = item.fspath.strpath
-    if fname.endswith('keras/_generator.py'):
+    if (fname.endswith('keras/_generator.py') or
+            fname.endswith('miscellaneous.rst')):
         try:
             import keras
         except ImportError:
             pytest.skip('The keras package is not installed.')
-    elif fname.endswith('tensorflow/_generator.py'):
+    elif (fname.endswith('tensorflow/_generator.py') or
+            fname.endswith('miscellaneous.rst')):
         try:
             import tensorflow
         except ImportError:
             pytest.skip('The tensorflow package is not installed.')
From cd99f9a5f8a15183b8c887623cbc78e5744f39a9 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre 
Date: Tue, 21 Aug 2018 18:13:31 +0200
Subject: [PATCH 42/50] join path for appveyor

---
 conftest.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/conftest.py b/conftest.py
index 455b0f00a..d3ff91025 100644
--- a/conftest.py
+++ b/conftest.py
@@ -7,6 +7,7 @@
 # Set numpy array str/repr to legacy behaviour on numpy > 1.13 to make
 # the doctests pass
+import os
 import pytest
 import numpy as np
 
@@ -18,13 +19,13 @@ def pytest_runtest_setup(item):
     fname = item.fspath.strpath
-    if (fname.endswith('keras/_generator.py') or
+    if (fname.endswith(os.path.join('keras', '_generator.py')) or
         fname.endswith('miscellaneous.rst')):
         try:
             import keras
         except ImportError:
             pytest.skip('The keras package is not installed.')
-    elif (fname.endswith('tensorflow/_generator.py') or
+    elif (fname.endswith(os.path.join('tensorflow', '_generator.py')) or
         fname.endswith('miscellaneous.rst')):
         try:
             import tensorflow
         except ImportError:
             pytest.skip('The tensorflow package is not installed.')
From b174c7f4982e5d902d5e80c0ef77f88fc8798c34 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre 
Date: Tue, 21 Aug 2018 22:26:57 +0200
Subject: [PATCH 43/50] Update the sparse array tests

---
 imblearn/keras/_generator.py                | 10 ++-----
 imblearn/keras/tests/test_generator.py      | 20 +++++++++----
 imblearn/tensorflow/_generator.py           | 33 +++++++--------------
 imblearn/tensorflow/tests/test_generator.py | 15 +++++-----
 4 files changed, 34 insertions(+), 44 deletions(-)

diff
--git a/imblearn/keras/_generator.py b/imblearn/keras/_generator.py
index c4a40fa3f..c3f5a4af1 100644
--- a/imblearn/keras/_generator.py
+++ b/imblearn/keras/_generator.py
@@ -135,22 +135,16 @@ def __getitem__(self, index):
         X_resampled = safe_indexing(
             self.X, self.indices_[index * self.batch_size:
                                   (index + 1) * self.batch_size])
-        if issparse(X_resampled) and not self.sparse:
-            X_resampled = X_resampled.toarray()
-
         y_resampled = safe_indexing(
             self.y, self.indices_[index * self.batch_size:
                                   (index + 1) * self.batch_size])
-        if issparse(y_resampled) and not self.sparse:
-            y_resampled = y_resampled.toarray()
-
+        if issparse(X_resampled) and not self.sparse:
+            X_resampled = X_resampled.toarray()
         if self.sample_weight is not None:
             sample_weight_resampled = safe_indexing(
                 self.sample_weight,
                 self.indices_[index * self.batch_size:
                               (index + 1) * self.batch_size])
-            if issparse(sample_weight_resampled) and not self.sparse:
-                sample_weight_resampled = sample_weight_resampled.toarray()
 
         if self.sample_weight is None:
             return X_resampled, y_resampled
diff --git a/imblearn/keras/tests/test_generator.py b/imblearn/keras/tests/test_generator.py
index 00c138d30..ecef97979 100644
--- a/imblearn/keras/tests/test_generator.py
+++ b/imblearn/keras/tests/test_generator.py
@@ -52,14 +52,18 @@ def test_balanced_batch_generator_class(sampler, sample_weight):
                         epochs=10)
 
 
-def test_balanced_batch_generator_class_sparse():
+@pytest.mark.parametrize("is_sparse", [True, False])
+def test_balanced_batch_generator_class_sparse(is_sparse):
     training_generator = BalancedBatchGenerator(sparse.csr_matrix(X), y,
                                                 batch_size=100,
-                                                sparse=True,
+                                                sparse=is_sparse,
                                                 random_state=42)
     for idx in range(len(training_generator)):
         X_batch, y_batch = training_generator.__getitem__(idx)
-        assert sparse.issparse(X_batch)
+        if is_sparse:
+            assert sparse.issparse(X_batch)
+        else:
+            assert not sparse.issparse(X_batch)
 
 
 def test_balanced_batch_generator_function_no_return_indices():
@@ -84,10 +88,14 @@ def test_balanced_batch_generator_function(sampler, sample_weight):
                         epochs=10)
 
 
-def test_balanced_batch_generator_function_sparse():
+@pytest.mark.parametrize("is_sparse", [True, False])
+def test_balanced_batch_generator_function_sparse(is_sparse):
     training_generator, steps_per_epoch = balanced_batch_generator(
-        sparse.csr_matrix(X), y, sparse=True, batch_size=10,
+        sparse.csr_matrix(X), y, sparse=is_sparse, batch_size=10,
         random_state=42)
     for idx in range(steps_per_epoch):
         X_batch, y_batch = next(training_generator)
-        assert sparse.issparse(X_batch)
+        if is_sparse:
+            assert sparse.issparse(X_batch)
+        else:
+            assert not sparse.issparse(X_batch)
diff --git a/imblearn/tensorflow/_generator.py b/imblearn/tensorflow/_generator.py
index c1da5ae58..9b0cb06d5 100644
--- a/imblearn/tensorflow/_generator.py
+++ b/imblearn/tensorflow/_generator.py
@@ -44,9 +44,8 @@ def balanced_batch_generator(X, y, sample_weight=None, sampler=None,
         Number of samples per gradient update.
 
     sparse : bool, optional (default=False)
-        Either or not to conserve or not the sparsity of the input (i.e. ``X``,
-        ``y``, ``sample_weight``). By default, the returned batches will be
-        dense.
+        Whether or not to conserve the sparsity of the input ``X``. By
+        default, the returned batches will be dense.
{random_state} @@ -134,29 +133,17 @@ def balanced_batch_generator(X, y, sample_weight=None, sampler=None, random_state.shuffle(indices) def generator(X, y, sample_weight, indices, batch_size): - if sample_weight is None: - while True: - for index in range(0, len(indices), batch_size): - X_res = safe_indexing(X, indices[index:index + batch_size]) - y_res = safe_indexing(y, indices[index:index + batch_size]) - if issparse(X_res) and not sparse: - X_res = X_res.toarray() - if issparse(y_res) and not sparse: - y_res = y_res.toarray() + while True: + for index in range(0, len(indices), batch_size): + X_res = safe_indexing(X, indices[index:index + batch_size]) + y_res = safe_indexing(y, indices[index:index + batch_size]) + if issparse(X_res) and not sparse: + X_res = X_res.toarray() + if sample_weight is None: yield X_res, y_res - else: - while True: - for index in range(0, len(indices), batch_size): - X_res = safe_indexing(X, indices[index:index + batch_size]) - y_res = safe_indexing(y, indices[index:index + batch_size]) + else: sw_res = safe_indexing(sample_weight, indices[index:index + batch_size]) - if issparse(X_res) and not sparse: - X_res = X_res.toarray() - if issparse(y_res) and not sparse: - y_res = y_res.toarray() - if issparse(sw_res) and not sparse: - sw_res = sw_res.toarray() yield X_res, y_res, sw_res return (generator(X, y, sample_weight, indices, batch_size), diff --git a/imblearn/tensorflow/tests/test_generator.py b/imblearn/tensorflow/tests/test_generator.py index 22ca16500..48bce2af6 100644 --- a/imblearn/tensorflow/tests/test_generator.py +++ b/imblearn/tensorflow/tests/test_generator.py @@ -14,10 +14,7 @@ tf = pytest.importorskip('tensorflow') -@pytest.mark.parametrize( - "sampler", - [None, NearMiss()] -) +@pytest.mark.parametrize("sampler", [None, NearMiss()]) def test_balanced_batch_generator(sampler): X, y = load_iris(return_X_y=True) X, y = make_imbalance(X, y, {0: 30, 1: 50, 2: 40}) @@ -75,14 +72,18 @@ def accuracy(y_true, y_pred): .format(e, accuracy(y, predicts_train))) -def test_balanced_batch_generator_function_sparse(): +@pytest.mark.parametrize("is_sparse", [True, False]) +def test_balanced_batch_generator_function_sparse(is_sparse): X, y = load_iris(return_X_y=True) X, y = make_imbalance(X, y, {0: 30, 1: 50, 2: 40}) X = X.astype(np.float32) training_generator, steps_per_epoch = balanced_batch_generator( - sparse.csr_matrix(X), y, sparse=True, batch_size=10, + sparse.csr_matrix(X), y, sparse=is_sparse, batch_size=10, random_state=42) for idx in range(steps_per_epoch): X_batch, y_batch = next(training_generator) - assert sparse.issparse(X_batch) + if is_sparse: + assert sparse.issparse(X_batch) + else: + assert not sparse.issparse(X_batch) From 8c9ae944160a39a3a5f7fdf70512c94499c773c5 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 22 Aug 2018 00:15:11 +0200 Subject: [PATCH 44/50] add example --- .../porto_seguro_keras_under_sampling.py | 235 ++++++++++++++++++ imblearn/keras/tests/test_generator.py | 2 +- 2 files changed, 236 insertions(+), 1 deletion(-) create mode 100644 examples/applications/porto_seguro_keras_under_sampling.py diff --git a/examples/applications/porto_seguro_keras_under_sampling.py b/examples/applications/porto_seguro_keras_under_sampling.py new file mode 100644 index 000000000..1477efd89 --- /dev/null +++ b/examples/applications/porto_seguro_keras_under_sampling.py @@ -0,0 +1,235 @@ +""" +========================================================== +Porto Seguro: balancing samples in mini-batches with Keras 
+==========================================================
+
+This example compares two strategies to train a neural-network on the Porto
+Seguro Kaggle data set [1]_. The data set is imbalanced and we show that
+balancing each mini-batch allows us to improve the performance and reduce the
+training time.
+
+References
+----------
+
+.. [1] https://www.kaggle.com/c/porto-seguro-safe-driver-prediction/data
+
+"""
+
+# Authors: Guillaume Lemaitre
+# License: MIT
+
+print(__doc__)
+
+###############################################################################
+# Data loading
+###############################################################################
+
+from collections import Counter
+import pandas as pd
+import numpy as np
+
+###############################################################################
+# First, you should download the Porto Seguro data set from Kaggle. See the
+# link in the introduction.
+
+training_data = pd.read_csv('./input/train.csv')
+testing_data = pd.read_csv('./input/test.csv')
+
+y_train = training_data[['id', 'target']].set_index('id')
+X_train = training_data.drop(['target'], axis=1).set_index('id')
+X_test = testing_data.set_index('id')
+
+###############################################################################
+# The data set is imbalanced and it will have an effect on the fitting.
+
+print('The data set is imbalanced: {}'.format(Counter(y_train['target'])))
+
+###############################################################################
+# Define the pre-processing pipeline
+###############################################################################
+
+from sklearn.compose import ColumnTransformer
+from sklearn.pipeline import Pipeline, make_pipeline
+from sklearn.preprocessing import OneHotEncoder
+from sklearn.preprocessing import StandardScaler
+from sklearn.preprocessing import FunctionTransformer
+from sklearn.impute import SimpleImputer
+
+
+def convert_float64(X):
+    return X.astype(np.float64)
+
+
+###############################################################################
+# We want to standard-scale the numerical features and one-hot encode the
+# categorical features. In this regard, we make use of the
+# :class:`sklearn.compose.ColumnTransformer`.
+
+numerical_columns = [name for name in X_train.columns
+                     if '_calc_' in name and '_bin' not in name]
+numerical_pipeline = make_pipeline(
+    FunctionTransformer(func=convert_float64, validate=False),
+    StandardScaler())
+
+categorical_columns = [name for name in X_train.columns
+                       if '_cat' in name]
+categorical_pipeline = make_pipeline(
+    SimpleImputer(missing_values=-1, strategy='most_frequent'),
+    OneHotEncoder(categories='auto'))
+
+preprocessor = ColumnTransformer([('num', numerical_pipeline, num_col),
+                                  ('cat', categorical_pipeline, cat_col)],
+                                 remainder='drop')
+
+# Create an environment variable to avoid using the GPU. This can be changed.
+import os
+os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
+
+###############################################################################
+# Create a neural-network
+###############################################################################
+
+from keras.models import Sequential
+from keras.layers import Activation, Dense, Dropout, BatchNormalization
+
+
+def make_model(n_features):
+    model = Sequential()
+    model.add(Dense(200, input_shape=(n_features,),
+                    kernel_initializer='glorot_normal'))
+    model.add(Activation('relu'))
+    model.add(BatchNormalization())
+    model.add(Dropout(0.5))
+    model.add(Dense(100, kernel_initializer='glorot_normal'))
+    model.add(Activation('relu'))
+    model.add(BatchNormalization())
+    model.add(Dropout(0.25))
+    model.add(Dense(50, kernel_initializer='glorot_normal'))
+    model.add(Activation('relu'))
+    model.add(BatchNormalization())
+    model.add(Dropout(0.15))
+    model.add(Dense(25, kernel_initializer='glorot_normal'))
+    model.add(Activation('relu'))
+    model.add(BatchNormalization())
+    model.add(Dropout(0.1))
+    model.add(Dense(1, activation='sigmoid'))
+
+    model.compile(loss='binary_crossentropy',
+                  optimizer='adam',
+                  metrics=['accuracy'])
+
+    return model
+
+
+###############################################################################
+# We create a decorator to report the computation time
+
+import time
+from functools import wraps
+
+
+def timeit(f):
+    @wraps(f)
+    def wrapper(*args, **kwds):
+        start_time = time.time()
+        result = f(*args, **kwds)
+        elapsed_time = time.time() - start_time
+        print('Elapsed computation time: {:.3f} secs'
+              .format(elapsed_time))
+        return (elapsed_time, result)
+    return wrapper
+
+
+###############################################################################
+# The first model will be trained using the ``fit`` method and with imbalanced
+# mini-batches.
+
+from sklearn.metrics import roc_auc_score
+
+
+@timeit
+def fit_predict_imbalanced_model(X_train, y_train, X_test, y_test):
+    model = make_model(X_train.shape[1])
+    model.fit(X_train, y_train, epochs=2, verbose=0, batch_size=1000)
+    y_pred = model.predict_proba(X_test, batch_size=1000)
+    return roc_auc_score(y_test, y_pred)
+
+
+###############################################################################
+# By contrast, we will use imbalanced-learn to create a generator which will
+# yield balanced mini-batches.
+
+from imblearn.keras import BalancedBatchGenerator
+
+
+@timeit
+def fit_predict_balanced_model(X_train, y_train, X_test, y_test):
+    model = make_model(X_train.shape[1])
+    training_generator = BalancedBatchGenerator(X_train, y_train,
+                                                batch_size=1000,
+                                                random_state=42)
+    model.fit_generator(generator=training_generator, epochs=5, verbose=0)
+    y_pred = model.predict_proba(X_test, batch_size=1000)
+    return roc_auc_score(y_test, y_pred)
+
+
+###############################################################################
+# Classification loop
+###############################################################################
+
+###############################################################################
+# We will perform a 10-fold cross-validation and train the neural-network with
+# the two different strategies previously presented.
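+
+###############################################################################
+# A note on the calling convention: because of the ``timeit`` decorator
+# defined above, both helpers return an ``(elapsed_time, roc_auc)`` tuple,
+# which the cross-validation loop below unpacks, e.g.::
+#
+#     elapsed_time, roc_auc = fit_predict_imbalanced_model(
+#         X_local_train, y_local_train, X_local_test, y_local_test)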
+ +from sklearn.model_selection import StratifiedKFold + +skf = StratifiedKFold(n_splits=10) + +cv_results_imbalanced = [] +cv_time_imbalanced = [] +cv_results_balanced = [] +cv_time_balanced = [] +for train_idx, valid_idx in skf.split(X_train, y_train): + X_local_train = preprocessor.fit_transform(X_train.iloc[train_idx]) + y_local_train = y_train.iloc[train_idx].values.ravel() + X_local_test = preprocessor.transform(X_train.iloc[valid_idx]) + y_local_test = y_train.iloc[valid_idx].values.ravel() + + elapsed_time, roc_auc = fit_predict_imbalanced_model( + X_local_train, y_local_train, X_local_test, y_local_test) + cv_time_imbalanced.append(elapsed_time) + cv_results_imbalanced.append(roc_auc) + + elapsed_time, roc_auc = fit_predict_balanced_model( + X_local_train, y_local_train, X_local_test, y_local_test) + cv_time_balanced.append(elapsed_time) + cv_results_balanced.append(roc_auc) + +############################################################################### +# Plot of the results and computation time +############################################################################### + +df_results = (pd.DataFrame({'Balanced model': cv_results_balanced, + 'Imbalanced model': cv_results_imbalanced}) + .unstack().reset_index()) +df_time = (pd.DataFrame({'Balanced model': cv_time_balanced, + 'Imbalanced model': cv_time_imbalanced}) + .unstack().reset_index()) + +import seaborn as sns +import matplotlib.pyplot as plt + +sns.boxplot(y='level_0', x=0, data=df_results, whis=10.0) +sns.despine(top=True, right=True, left=True) +ax = plt.gca() +ax.xaxis.set_major_formatter( + plt.FuncFormatter(lambda x, pos: "%i%%" % (100 * x))) +plt.xlabel('ROC-AUC') +plt.ylabel('') +plt.title('Difference in terms of ROC-AUC using a random under-sampling') + +sns.boxplot(y='level_0', x=0, data=df_time) +sns.despine(top=True, right=True, left=True) +plt.xlabel('time [s]') +plt.ylabel('') +plt.title('Computation time difference using a random under-sampling') diff --git a/imblearn/keras/tests/test_generator.py b/imblearn/keras/tests/test_generator.py index ecef97979..7b0491146 100644 --- a/imblearn/keras/tests/test_generator.py +++ b/imblearn/keras/tests/test_generator.py @@ -55,7 +55,7 @@ def test_balanced_batch_generator_class(sampler, sample_weight): @pytest.mark.parametrize("is_sparse", [True, False]) def test_balanced_batch_generator_class_sparse(is_sparse): training_generator = BalancedBatchGenerator(sparse.csr_matrix(X), y, - batch_size=100, + batch_size=10, sparse=is_sparse, random_state=42) for idx in range(len(training_generator)): From baa56ad1725321a973eee7c1917040c2c2bf7e59 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 22 Aug 2018 00:22:01 +0200 Subject: [PATCH 45/50] iter --- .../applications/porto_seguro_keras_under_sampling.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/examples/applications/porto_seguro_keras_under_sampling.py b/examples/applications/porto_seguro_keras_under_sampling.py index 1477efd89..d06669433 100644 --- a/examples/applications/porto_seguro_keras_under_sampling.py +++ b/examples/applications/porto_seguro_keras_under_sampling.py @@ -77,9 +77,10 @@ def convert_float64(X): SimpleImputer(missing_values=-1, strategy='most_frequent'), OneHotEncoder(categories='auto')) -preprocessor = ColumnTransformer([('num', numerical_pipeline, num_col), - ('cat', categorical_pipeline, cat_col)], - remainder='drop') +preprocessor = ColumnTransformer( + [('numerical_preprocessing', numerical_pipeline, numerical_columns), + 
('categorical_preprocessing', categorical_pipeline, categorical_columns)], + remainder='drop') # Create an environment variable to avoid using the GPU. This can be changed. import os @@ -150,7 +151,7 @@ def wrapper(*args, **kwds): @timeit def fit_predict_imbalanced_model(X_train, y_train, X_test, y_test): model = make_model(X_train.shape[1]) - model.fit(X_train, y_train, epochs=2, verbose=0, batch_size=1000) + model.fit(X_train, y_train, epochs=2, verbose=1, batch_size=1000) y_pred = model.predict_proba(X_test, batch_size=1000) return roc_auc_score(y_test, y_pred) @@ -168,7 +169,7 @@ def fit_predict_balanced_model(X_train, y_train, X_test, y_test): training_generator = BalancedBatchGenerator(X_train, y_train, batch_size=1000, random_state=42) - model.fit_generator(generator=training_generator, epochs=5, verbose=0) + model.fit_generator(generator=training_generator, epochs=5, verbose=1) y_pred = model.predict_proba(X_test, batch_size=1000) return roc_auc_score(y_test, y_pred) From 679f30ca906ede4a2cb63288c6dc095bb0139d28 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 22 Aug 2018 00:53:33 +0200 Subject: [PATCH 46/50] iter example --- .../porto_seguro_keras_under_sampling.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/applications/porto_seguro_keras_under_sampling.py b/examples/applications/porto_seguro_keras_under_sampling.py index d06669433..7bc7c655c 100644 --- a/examples/applications/porto_seguro_keras_under_sampling.py +++ b/examples/applications/porto_seguro_keras_under_sampling.py @@ -220,6 +220,12 @@ def fit_predict_balanced_model(X_train, y_train, X_test, y_test): import seaborn as sns import matplotlib.pyplot as plt +sns.boxplot(y='level_0', x=0, data=df_time) +sns.despine(top=True, right=True, left=True) +plt.xlabel('time [s]') +plt.ylabel('') +plt.title('Computation time difference using a random under-sampling') + sns.boxplot(y='level_0', x=0, data=df_results, whis=10.0) sns.despine(top=True, right=True, left=True) ax = plt.gca() @@ -228,9 +234,3 @@ def fit_predict_balanced_model(X_train, y_train, X_test, y_test): plt.xlabel('ROC-AUC') plt.ylabel('') plt.title('Difference in terms of ROC-AUC using a random under-sampling') - -sns.boxplot(y='level_0', x=0, data=df_time) -sns.despine(top=True, right=True, left=True) -plt.xlabel('time [s]') -plt.ylabel('') -plt.title('Computation time difference using a random under-sampling') From 324b90c9c637d7edb693e682a7de8b5ae0569299 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 22 Aug 2018 11:22:03 +0200 Subject: [PATCH 47/50] tests --- build_tools/travis/install.sh | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/build_tools/travis/install.sh b/build_tools/travis/install.sh index be06095db..df4d23378 100755 --- a/build_tools/travis/install.sh +++ b/build_tools/travis/install.sh @@ -40,9 +40,6 @@ if [[ "$DISTRIB" == "conda" ]]; then source activate testenv conda install --yes numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION # only install optional dependency in python 3.6 - if [[ $PYTHON_VERSION == "3.6" ]]; then - conda install --yes pandas keras - fi if [[ "$SKLEARN_VERSION" == "master" ]]; then conda install --yes cython @@ -51,6 +48,13 @@ if [[ "$DISTRIB" == "conda" ]]; then conda install --yes scikit-learn=$SKLEARN_VERSION fi + if [[ $PYTHON_VERSION == "3.6" ]]; then + conda install --yes pandas keras + KERAS_BACKEND=tensorflow + python -c "import keras.backend" + sed -i -e 's/"backend":[[:space:]]*"[^"]*/"backend":\ "'$KERAS_BACKEND'/g' 
~/.keras/keras.json; + fi + conda install --yes nose pytest pytest-cov # Install nose-timer via pip pip install nose-timer codecov From 22fb0a1afc08fd4ad58a9d06633fa0e411d9cd5a Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 22 Aug 2018 11:40:46 +0200 Subject: [PATCH 48/50] iter --- .travis.yml | 6 +++--- build_tools/travis/install.sh | 6 +++--- examples/applications/porto_seguro_keras_under_sampling.py | 2 ++ 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/.travis.yml b/.travis.yml index 36b502320..ba5b8f33e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -38,11 +38,11 @@ matrix: NUMPY_VERSION="1.13.1" SCIPY_VERSION="0.19.1" SKLEARN_VERSION="0.19.0" - env: DISTRIB="conda" PYTHON_VERSION="3.6" NUMPY_VERSION="1.13.1" SCIPY_VERSION="0.19.1" SKLEARN_VERSION="0.19.0" - - env: DISTRIB="conda" PYTHON_VERSION="3.6" + - env: DISTRIB="conda" PYTHON_VERSION="3.7" NUMPY_VERSION="1.13.1" SCIPY_VERSION="0.19.1" SKLEARN_VERSION="master" allow_failures: - - env: DISTRIB="conda" PYTHON_VERSION="3.6" - NUMPY_VERSION="1.13.1" SCIPY_VERSION="0.19.1" SKLEARN_VERSION="master" + - env: DISTRIB="conda" PYTHON_VERSION="3.7" + NUMPY_VERSION="*" SCIPY_VERSION="*" SKLEARN_VERSION="master" install: source build_tools/travis/install.sh script: bash build_tools/travis/test_script.sh diff --git a/build_tools/travis/install.sh b/build_tools/travis/install.sh index df4d23378..82dc10a97 100755 --- a/build_tools/travis/install.sh +++ b/build_tools/travis/install.sh @@ -39,7 +39,6 @@ if [[ "$DISTRIB" == "conda" ]]; then conda create -n testenv --yes python=$PYTHON_VERSION pip source activate testenv conda install --yes numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION - # only install optional dependency in python 3.6 if [[ "$SKLEARN_VERSION" == "master" ]]; then conda install --yes cython @@ -48,8 +47,9 @@ if [[ "$DISTRIB" == "conda" ]]; then conda install --yes scikit-learn=$SKLEARN_VERSION fi - if [[ $PYTHON_VERSION == "3.6" ]]; then - conda install --yes pandas keras + if [[ $PYTHON_VERSION == "3.6" ]] || [[ $PYTHON_VERSION == "3.7" ]]; then + conda install --yes pandas + conda install --yes -c conda-forge keras KERAS_BACKEND=tensorflow python -c "import keras.backend" sed -i -e 's/"backend":[[:space:]]*"[^"]*/"backend":\ "'$KERAS_BACKEND'/g' ~/.keras/keras.json; diff --git a/examples/applications/porto_seguro_keras_under_sampling.py b/examples/applications/porto_seguro_keras_under_sampling.py index 7bc7c655c..c154362d9 100644 --- a/examples/applications/porto_seguro_keras_under_sampling.py +++ b/examples/applications/porto_seguro_keras_under_sampling.py @@ -220,12 +220,14 @@ def fit_predict_balanced_model(X_train, y_train, X_test, y_test): import seaborn as sns import matplotlib.pyplot as plt +plt.figure() sns.boxplot(y='level_0', x=0, data=df_time) sns.despine(top=True, right=True, left=True) plt.xlabel('time [s]') plt.ylabel('') plt.title('Computation time difference using a random under-sampling') +plt.figure() sns.boxplot(y='level_0', x=0, data=df_results, whis=10.0) sns.despine(top=True, right=True, left=True) ax = plt.gca() From 85f6d5b530aa29e7ef2a5f672a8006ec52a49af9 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 22 Aug 2018 11:43:21 +0200 Subject: [PATCH 49/50] iter --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index ba5b8f33e..650c14bb1 100644 --- a/.travis.yml +++ b/.travis.yml @@ -39,7 +39,7 @@ matrix: - env: DISTRIB="conda" PYTHON_VERSION="3.6" NUMPY_VERSION="1.13.1" SCIPY_VERSION="0.19.1" SKLEARN_VERSION="0.19.0" - 
env: DISTRIB="conda" PYTHON_VERSION="3.7" - NUMPY_VERSION="1.13.1" SCIPY_VERSION="0.19.1" SKLEARN_VERSION="master" + NUMPY_VERSION="*" SCIPY_VERSION="*" SKLEARN_VERSION="master" allow_failures: - env: DISTRIB="conda" PYTHON_VERSION="3.7" NUMPY_VERSION="*" SCIPY_VERSION="*" SKLEARN_VERSION="master" From abeb0114680f92ff6ffefda90c9e2807c7a3fec8 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 22 Aug 2018 12:58:18 +0200 Subject: [PATCH 50/50] iter --- build_tools/travis/install.sh | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/build_tools/travis/install.sh b/build_tools/travis/install.sh index 82dc10a97..3a56bac81 100755 --- a/build_tools/travis/install.sh +++ b/build_tools/travis/install.sh @@ -40,14 +40,7 @@ if [[ "$DISTRIB" == "conda" ]]; then source activate testenv conda install --yes numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION - if [[ "$SKLEARN_VERSION" == "master" ]]; then - conda install --yes cython - pip install -U git+https://github.com/scikit-learn/scikit-learn.git - else - conda install --yes scikit-learn=$SKLEARN_VERSION - fi - - if [[ $PYTHON_VERSION == "3.6" ]] || [[ $PYTHON_VERSION == "3.7" ]]; then + if [[ $PYTHON_VERSION == "3.6" ]]; then conda install --yes pandas conda install --yes -c conda-forge keras KERAS_BACKEND=tensorflow @@ -55,6 +48,13 @@ if [[ "$DISTRIB" == "conda" ]]; then sed -i -e 's/"backend":[[:space:]]*"[^"]*/"backend":\ "'$KERAS_BACKEND'/g' ~/.keras/keras.json; fi + if [[ "$SKLEARN_VERSION" == "master" ]]; then + conda install --yes cython + pip install -U git+https://github.com/scikit-learn/scikit-learn.git + else + conda install --yes scikit-learn=$SKLEARN_VERSION + fi + conda install --yes nose pytest pytest-cov # Install nose-timer via pip pip install nose-timer codecov