Commit d2328df

EHN: Create the EasyEnsembleClassifier (#455)
1 parent bcea35b

File tree: 7 files changed (+399, -7 lines)

doc/api.rst

Lines changed: 1 addition & 0 deletions
@@ -110,6 +110,7 @@ Prototype selection
    ensemble.BalanceCascade
    ensemble.BalancedBaggingClassifier
    ensemble.EasyEnsemble
+   ensemble.EasyEnsembleClassifier

 .. _keras_ref:

doc/ensemble.rst

Lines changed: 23 additions & 2 deletions
@@ -11,6 +11,11 @@ Ensemble of samplers
 Samplers
 --------

+.. warning::
+   Note that :class:`EasyEnsemble` is deprecated and you should use
+   :class:`EasyEnsembleClassifier` instead. :class:`EasyEnsembleClassifier`
+   is presented in the next section.
+
 An imbalanced data set can be balanced by creating several balanced
 subsets. The module :mod:`imblearn.ensemble` allows creating such sets.

@@ -92,8 +97,8 @@ output of an :class:`EasyEnsemble` sampler with an ensemble of classifiers
 (i.e. ``BaggingClassifier``). Therefore, :class:`BalancedBaggingClassifier`
 takes the same parameters as the scikit-learn
 ``BaggingClassifier``. In addition, there are two extra parameters,
-``sampling_strategy`` and ``replacement``, as in the :class:`EasyEnsemble`
-sampler::
+``sampling_strategy`` and ``replacement``, to control the behaviour of the
+random under-sampler::

 >>> from imblearn.ensemble import BalancedBaggingClassifier

@@ -127,3 +132,19 @@ each tree::

 See
 :ref:`sphx_glr_auto_examples_ensemble_plot_comparison_bagging_classifier.py`.
+
+A specific method which uses ``AdaBoost`` learners in the bagging classifier
+is called "EasyEnsemble". The :class:`EasyEnsembleClassifier` allows bagging
+AdaBoost learners which are trained on balanced bootstrap samples. Similarly
+to the :class:`BalancedBaggingClassifier` API, one can construct the
+ensemble as::
+
+    >>> from imblearn.ensemble import EasyEnsembleClassifier
+    >>> eec = EasyEnsembleClassifier(random_state=0)
+    >>> eec.fit(X_train, y_train) # doctest: +ELLIPSIS
+    EasyEnsembleClassifier(...)
+    >>> y_pred = eec.predict(X_test)
+    >>> confusion_matrix(y_test, y_pred)
+    array([[  9,   1,   2],
+           [  5,  52,   2],
+           [252,  45, 882]])
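
Note that the number of balanced bags and the strength of each inner booster
are controlled independently. A minimal sketch (assuming only the API shown
in this commit; the parameter values are illustrative, not recommendations)
of tuning the inner AdaBoost by passing your own instance, as the new
docstring suggests:

    from sklearn.ensemble import AdaBoostClassifier
    from imblearn.ensemble import EasyEnsembleClassifier

    eec = EasyEnsembleClassifier(
        n_estimators=10,           # number of balanced bags
        base_estimator=AdaBoostClassifier(n_estimators=20),  # boosting rounds per bag
        sampling_strategy='auto',  # balance all classes
        replacement=False,         # under-sample without replacement
        random_state=0)
    # Then fit/predict exactly as in the doctest above.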

doc/whats_new/v0.0.4.rst

Lines changed: 9 additions & 0 deletions
@@ -24,6 +24,10 @@ New features
 - Add ``keras`` and ``tensorflow`` modules to create balanced mini-batch
   generators. :issue:`409` by :user:`Guillaume Lemaitre <glemaitre>`.

+- Add :class:`imblearn.ensemble.EasyEnsembleClassifier` which creates a bag
+  of AdaBoost classifiers trained on balanced bootstrap samples.
+  :issue:`455` by :user:`Guillaume Lemaitre <glemaitre>`.
+
 Enhancement
 ...........

@@ -109,3 +113,8 @@ Deprecation
    :class:`imblearn.over_sampling.SVMSMOTE` and
    :class:`imblearn.over_sampling.BorderlineSMOTE`.
    :issue:`440` by :user:`Guillaume Lemaitre <glemaitre>`.
+
+- Deprecate :class:`imblearn.ensemble.EasyEnsemble` in favor of the
+  meta-estimator :class:`imblearn.ensemble.EasyEnsembleClassifier`, which
+  follows the exact algorithm described in the literature.
+  :issue:`455` by :user:`Guillaume Lemaitre <glemaitre>`.
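
Because the deprecation is implemented with scikit-learn's ``deprecated``
decorator (see the ``_easy_ensemble.py`` diff below), instantiating the old
class now emits a warning. A short sketch, assuming default construction is
enough to trigger it:

    import warnings
    from imblearn.ensemble import EasyEnsemble

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter('always')
        EasyEnsemble()  # the decorator warns at instantiation time
    print(caught[0].category.__name__)  # DeprecationWarning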

imblearn/ensemble/__init__.py

Lines changed: 3 additions & 1 deletion
@@ -4,8 +4,10 @@
 """

 from ._easy_ensemble import EasyEnsemble
+from ._easy_ensemble import EasyEnsembleClassifier
 from ._balance_cascade import BalanceCascade

 from ._classifier import BalancedBaggingClassifier

-__all__ = ['EasyEnsemble', 'BalancedBaggingClassifier', 'BalanceCascade']
+__all__ = ['EasyEnsemble', 'EasyEnsembleClassifier',
+           'BalancedBaggingClassifier', 'BalanceCascade']
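
A quick sanity check (a sketch, not part of the commit) that the new export
works; ``n_estimators`` defaults to 10 per the class definition below:

    from imblearn.ensemble import EasyEnsembleClassifier

    print(EasyEnsembleClassifier().get_params()['n_estimators'])  # 10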

imblearn/ensemble/_classifier.py

Lines changed: 3 additions & 3 deletions
@@ -194,7 +194,7 @@ def __init__(self,
                  verbose=0,
                  ratio=None):

-        super(BaggingClassifier, self).__init__(
+        super(BalancedBaggingClassifier, self).__init__(
             base_estimator,
             n_estimators=n_estimators,
             max_samples=max_samples,
@@ -237,10 +237,10 @@ def fit(self, X, y):

         Parameters
         ----------
-        X : array-like of shape = [n_samples, n_features]
+        X : {array-like, sparse matrix}, shape (n_samples, n_features)
             The training input samples.

-        y : array-like, shape = [n_samples]
+        y : array-like, shape (n_samples,)
             The target values.

         Returns
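
The first hunk fixes a subtle bug: in ``super(Cls, self)``, the MRO lookup
starts *after* ``Cls``, so naming the parent class (``BaggingClassifier``)
instead of the class being defined skips the parent's own method. A
self-contained sketch with hypothetical classes (not from this repository):

    class Base:
        def __init__(self):
            print("Base.__init__")

    class Parent(Base):
        def __init__(self):
            print("Parent.__init__")
            super(Parent, self).__init__()

    class Child(Parent):
        def __init__(self):
            # Correct: the lookup starts after Child, so Parent.__init__ runs.
            super(Child, self).__init__()
            # Buggy variant: super(Parent, self).__init__() starts the lookup
            # after Parent and jumps straight to Base.__init__, silently
            # skipping Parent.__init__ -- the shape of the bug fixed here.

    Child()  # prints Parent.__init__ then Base.__init__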

imblearn/ensemble/_easy_ensemble.py

Lines changed: 171 additions & 0 deletions
@@ -4,28 +4,41 @@
 # Christos Aridas
 # License: MIT

+import numbers
+
 import numpy as np

+from sklearn.base import clone
 from sklearn.utils import check_random_state
+from sklearn.ensemble import AdaBoostClassifier
+from sklearn.ensemble.bagging import BaggingClassifier
+from sklearn.utils.deprecation import deprecated

 from .base import BaseEnsembleSampler
 from ..under_sampling import RandomUnderSampler
 from ..under_sampling.base import BaseUnderSampler
 from ..utils import Substitution
 from ..utils._docstring import _random_state_docstring
+from ..pipeline import Pipeline

 MAX_INT = np.iinfo(np.int32).max


 @Substitution(
     sampling_strategy=BaseUnderSampler._sampling_strategy_docstring,
     random_state=_random_state_docstring)
+@deprecated('EasyEnsemble is deprecated in 0.4 and will be removed in 0.6. '
+            'Use EasyEnsembleClassifier instead.')
 class EasyEnsemble(BaseEnsembleSampler):
     """Create ensemble sets by iteratively applying random under-sampling.

     This method iteratively selects a random subset and makes an ensemble of
     the different sets.

+    .. deprecated:: 0.4
+       ``EasyEnsemble`` is deprecated in 0.4 and will be removed in 0.6. Use
+       ``EasyEnsembleClassifier`` instead.
+
     Read more in the :ref:`User Guide <ensemble_samplers>`.

     Parameters
@@ -126,3 +139,161 @@ def _sample(self, X, y):
                 np.array(idx_under))
         else:
             return np.array(X_resampled), np.array(y_resampled)
+
+
+@Substitution(
+    sampling_strategy=BaseUnderSampler._sampling_strategy_docstring,
+    random_state=_random_state_docstring)
+class EasyEnsembleClassifier(BaggingClassifier):
+    """Bag of balanced boosted learners, also known as EasyEnsemble.
+
+    This algorithm is known as EasyEnsemble [1]_. The classifier is an
+    ensemble of AdaBoost learners trained on different balanced bootstrap
+    samples. The balancing is achieved by random under-sampling.
+
+    Read more in the :ref:`User Guide <ensemble_samplers>`.
+
+    Parameters
+    ----------
+    n_estimators : int, optional (default=10)
+        Number of AdaBoost learners in the ensemble.
+
+    base_estimator : object, optional (default=AdaBoostClassifier())
+        The base AdaBoost classifier used in the inner ensemble. Note that
+        you can set the number of inner learners by passing your own
+        instance.
+
+    warm_start : bool, optional (default=False)
+        When set to True, reuse the solution of the previous call to fit
+        and add more estimators to the ensemble, otherwise, just fit
+        a whole new ensemble.
+
+    {sampling_strategy}
+
+    replacement : bool, optional (default=False)
+        Whether or not to sample with replacement.
+
+    n_jobs : int, optional (default=1)
+        The number of jobs to run in parallel for both `fit` and `predict`.
+        If -1, then the number of jobs is set to the number of cores.
+
+    {random_state}
+
+    verbose : int, optional (default=0)
+        Controls the verbosity of the building process.
+
+    Attributes
+    ----------
+    base_estimator_ : estimator
+        The base estimator from which the ensemble is grown.
+
+    estimators_ : list of estimators
+        The collection of fitted base estimators.
+
+    classes_ : array, shape (n_classes,)
+        The class labels.
+
+    n_classes_ : int or list
+        The number of classes.
+
+    Notes
+    -----
+    The method is described in [1]_.
+
+    Supports multi-class resampling by sampling each class independently.
+
+    See also
+    --------
+    BalanceCascade, BalancedBaggingClassifier
+
+    References
+    ----------
+    .. [1] X. Y. Liu, J. Wu and Z. H. Zhou, "Exploratory Undersampling for
+       Class-Imbalance Learning," in IEEE Transactions on Systems, Man, and
+       Cybernetics, Part B (Cybernetics), vol. 39, no. 2, pp. 539-550,
+       April 2009.
+
+    Examples
+    --------
+
+    >>> from collections import Counter
+    >>> from sklearn.datasets import make_classification
+    >>> from sklearn.model_selection import train_test_split
+    >>> from sklearn.metrics import confusion_matrix
+    >>> from imblearn.ensemble import \
+EasyEnsembleClassifier # doctest: +NORMALIZE_WHITESPACE
+    >>> X, y = make_classification(n_classes=2, class_sep=2,
+    ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0,
+    ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)
+    >>> print('Original dataset shape %s' % Counter(y))
+    Original dataset shape Counter({{1: 900, 0: 100}})
+    >>> X_train, X_test, y_train, y_test = train_test_split(X, y,
+    ...                                                     random_state=0)
+    >>> eec = EasyEnsembleClassifier(random_state=42)
+    >>> eec.fit(X_train, y_train) # doctest: +ELLIPSIS
+    EasyEnsembleClassifier(...)
+    >>> y_pred = eec.predict(X_test)
+    >>> print(confusion_matrix(y_test, y_pred))
+    [[ 23   0]
+     [  2 225]]
+
+    """
+    def __init__(self, n_estimators=10, base_estimator=None, warm_start=False,
+                 sampling_strategy='auto', replacement=False, n_jobs=1,
+                 random_state=None, verbose=0):
+        super(EasyEnsembleClassifier, self).__init__(
+            base_estimator,
+            n_estimators=n_estimators,
+            max_samples=1.0,
+            max_features=1.0,
+            bootstrap=False,
+            bootstrap_features=False,
+            oob_score=False,
+            warm_start=warm_start,
+            n_jobs=n_jobs,
+            random_state=random_state,
+            verbose=verbose)
+        self.sampling_strategy = sampling_strategy
+        self.replacement = replacement
+
+    def _validate_estimator(self, default=AdaBoostClassifier()):
+        """Check the estimator and the n_estimators attribute, and set the
+        `base_estimator_` attribute."""
+        if not isinstance(self.n_estimators, (numbers.Integral, np.integer)):
+            raise ValueError("n_estimators must be an integer, "
+                             "got {0}.".format(type(self.n_estimators)))
+
+        if self.n_estimators <= 0:
+            raise ValueError("n_estimators must be greater than zero, "
+                             "got {0}.".format(self.n_estimators))
+
+        if self.base_estimator is not None:
+            base_estimator = clone(self.base_estimator)
+        else:
+            base_estimator = clone(default)
+
+        # Each member of the bag chains a random under-sampler with the
+        # (cloned) AdaBoost learner, so every member is trained on its own
+        # balanced subset.
+        self.base_estimator_ = Pipeline(
+            [('sampler', RandomUnderSampler(
+                sampling_strategy=self.sampling_strategy,
+                replacement=self.replacement)),
+             ('classifier', base_estimator)])
+
+    def fit(self, X, y):
+        """Build a bagging ensemble of AdaBoost classifiers using balanced
+        bootstrap samples obtained by random under-sampling.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape (n_samples, n_features)
+            The training input samples.
+
+        y : array-like, shape (n_samples,)
+            The target values.
+
+        Returns
+        -------
+        self : object
+            Returns self.
+        """
+        # RandomUnderSampler does not support sample_weight, so we need to
+        # pass None.
+        return self._fit(X, y, self.max_samples, sample_weight=None)
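
Conceptually, the new meta-estimator is a thin wrapper around scikit-learn's
bagging. A rough hand-built equivalent of what ``_validate_estimator``
assembles (a sketch: it omits the ``n_estimators`` validation and uses only
public imports):

    from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier
    from imblearn.pipeline import Pipeline
    from imblearn.under_sampling import RandomUnderSampler

    balanced_adaboost = Pipeline([
        ('sampler', RandomUnderSampler(sampling_strategy='auto',
                                       replacement=False)),
        ('classifier', AdaBoostClassifier()),
    ])
    # With the settings hard-coded in __init__ above (bootstrap=False,
    # max_samples=1.0), every member sees the full training set and the
    # random under-sampler provides each member's balanced random subset.
    ensemble = BaggingClassifier(base_estimator=balanced_adaboost,
                                 n_estimators=10, bootstrap=False,
                                 random_state=0)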
