 # Christos Aridas
 # License: MIT

+import numbers
+
 import numpy as np

+from sklearn.ensemble import BaggingClassifier
+from sklearn.tree import DecisionTreeClassifier
 from sklearn.utils import check_random_state

 from .base import BaseEnsembleSampler
+from ..pipeline import Pipeline
 from ..under_sampling import RandomUnderSampler

 MAX_INT = np.iinfo(np.int32).max
@@ -147,3 +152,178 @@ def _sample(self, X, y):
                     np.array(idx_under))
         else:
             return np.array(X_resampled), np.array(y_resampled)
+
+
+class BalancedBaggingClassifier(BaggingClassifier):
+    """A Bagging classifier with additional balancing.
+
+    This implementation of Bagging is similar to the scikit-learn
+    implementation. It includes an additional step to balance the training
+    set at fit time using a ``RandomUnderSampler``.
+
+    Read more in the :ref:`User Guide <bagging>`.
+
+    Parameters
+    ----------
+    base_estimator : object or None, optional (default=None)
+        The base estimator to fit on random subsets of the dataset.
+        If None, then the base estimator is a decision tree.
+
+    n_estimators : int, optional (default=10)
+        The number of base estimators in the ensemble.
+
+    max_samples : int or float, optional (default=1.0)
+        The number of samples to draw from X to train each base estimator.
+
+        - If int, then draw ``max_samples`` samples.
+        - If float, then draw ``max_samples * X.shape[0]`` samples.
+
+    max_features : int or float, optional (default=1.0)
+        The number of features to draw from X to train each base estimator.
+
+        - If int, then draw ``max_features`` features.
+        - If float, then draw ``max_features * X.shape[1]`` features.
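+
+        For instance, with 20 features in ``X``, both ``max_features=10``
+        and ``max_features=0.5`` would draw 10 features for each base
+        estimator (the numbers are illustrative).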
+
+    bootstrap : boolean, optional (default=True)
+        Whether samples are drawn with replacement.
+
+    bootstrap_features : boolean, optional (default=False)
+        Whether features are drawn with replacement.
+
+    oob_score : bool, optional (default=False)
+        Whether to use out-of-bag samples to estimate
+        the generalization error.
+
+    warm_start : bool, optional (default=False)
+        When set to True, reuse the solution of the previous call to fit
+        and add more estimators to the ensemble; otherwise, just fit
+        a whole new ensemble.
+
+        .. versionadded:: 0.17
+           *warm_start* constructor parameter.
+
+    ratio : str, dict, or callable, optional (default='auto')
+        Ratio to use for resampling the data set.
+
+        - If ``str``, has to be one of: (i) ``'minority'``: resample the
+          minority class; (ii) ``'majority'``: resample the majority class;
+          (iii) ``'not minority'``: resample all classes apart from the
+          minority class; (iv) ``'all'``: resample all classes; and (v)
+          ``'auto'``: corresponds to ``'all'`` for over-sampling methods and
+          ``'not minority'`` for under-sampling methods. The classes
+          targeted will be over-sampled or under-sampled to achieve an equal
+          number of samples with the majority or minority class.
+        - If ``dict``, the keys correspond to the targeted classes. The
+          values correspond to the desired number of samples (see the
+          example below).
+        - If callable, a function taking ``y`` and returning a ``dict``. The
+          keys correspond to the targeted classes. The values correspond to
+          the desired number of samples.
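+
+        For instance, with classes labelled ``0`` and ``1``, passing
+        ``ratio={0: 100, 1: 100}`` would ask the sampler for 100 samples of
+        each class after resampling (the numbers are purely illustrative).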
+
+    replacement : bool, optional (default=False)
+        Whether or not to sample randomly with replacement.
+
+    n_jobs : int, optional (default=1)
+        The number of jobs to run in parallel for both `fit` and `predict`.
+        If -1, then the number of jobs is set to the number of cores.
+
+    random_state : int, RandomState instance or None, optional (default=None)
+        If int, random_state is the seed used by the random number generator;
+        If RandomState instance, random_state is the random number generator;
+        If None, the random number generator is the RandomState instance used
+        by `np.random`.
+
+    verbose : int, optional (default=0)
+        Controls the verbosity of the building process.
+
+    Attributes
+    ----------
+    base_estimator_ : estimator
+        The base estimator from which the ensemble is grown.
+
+    estimators_ : list of estimators
+        The collection of fitted base estimators.
+
+    estimators_samples_ : list of arrays
+        The subset of drawn samples (i.e., the in-bag samples) for each base
+        estimator. Each subset is defined by a boolean mask.
+
+    estimators_features_ : list of arrays
+        The subset of drawn features for each base estimator.
+
+    classes_ : array of shape = [n_classes]
+        The class labels.
+
+    n_classes_ : int or list
+        The number of classes.
+
+    oob_score_ : float
+        Score of the training dataset obtained using an out-of-bag estimate.
+
+    oob_decision_function_ : array of shape = [n_samples, n_classes]
+        Decision function computed with out-of-bag estimate on the training
+        set. If n_estimators is small, it might be possible that a data
+        point was never left out during the bootstrap. In this case,
+        `oob_decision_function_` might contain NaN.
+
+    References
+    ----------
+    .. [1] L. Breiman, "Pasting small votes for classification in large
+       databases and on-line", Machine Learning, 36(1), 85-103, 1999.
+    .. [2] L. Breiman, "Bagging predictors", Machine Learning, 24(2),
+       123-140, 1996.
+    .. [3] T. Ho, "The random subspace method for constructing decision
+       forests", Pattern Analysis and Machine Intelligence, 20(8), 832-844,
+       1998.
+    .. [4] G. Louppe and P. Geurts, "Ensembles on Random Patches", Machine
+       Learning and Knowledge Discovery in Databases, 346-361, 2012.
+
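+    Examples
+    --------
+    A minimal usage sketch on a synthetic imbalanced dataset, assuming the
+    package is importable as ``imblearn`` (all values are illustrative):
+
+    >>> from sklearn.datasets import make_classification
+    >>> from sklearn.model_selection import train_test_split
+    >>> from imblearn.ensemble import BalancedBaggingClassifier
+    >>> X, y = make_classification(n_samples=1000, n_classes=2,
+    ...                            weights=[0.9, 0.1], random_state=0)
+    >>> X_train, X_test, y_train, y_test = train_test_split(X, y,
+    ...                                                     random_state=0)
+    >>> bbc = BalancedBaggingClassifier(n_estimators=10, random_state=0)
+    >>> bbc = bbc.fit(X_train, y_train)
+    >>> y_pred = bbc.predict(X_test)
+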
+    """
+
+    def __init__(self,
+                 base_estimator=None,
+                 n_estimators=10,
+                 max_samples=1.0,
+                 max_features=1.0,
+                 bootstrap=True,
+                 bootstrap_features=False,
+                 oob_score=False,
+                 warm_start=False,
+                 ratio='auto',
+                 replacement=False,
+                 n_jobs=1,
+                 random_state=None,
+                 verbose=0):
+
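+        # ``super(BaggingClassifier, self)`` skips
+        # ``BaggingClassifier.__init__`` and resolves to the next class in
+        # the MRO, which accepts the same constructor arguments; the extra
+        # ``ratio`` and ``replacement`` parameters are stored below.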
+        super(BaggingClassifier, self).__init__(
+            base_estimator,
+            n_estimators=n_estimators,
+            max_samples=max_samples,
+            max_features=max_features,
+            bootstrap=bootstrap,
+            bootstrap_features=bootstrap_features,
+            oob_score=oob_score,
+            warm_start=warm_start,
+            n_jobs=n_jobs,
+            random_state=random_state,
+            verbose=verbose)
+        self.ratio = ratio
+        self.replacement = replacement
+
+    def _validate_estimator(self, default=DecisionTreeClassifier()):
+        """Check the estimator and the ``n_estimators`` attribute, and set
+        the ``base_estimator_`` attribute."""
+        if not isinstance(self.n_estimators, (numbers.Integral, np.integer)):
+            raise ValueError("n_estimators must be an integer, "
+                             "got {0}.".format(type(self.n_estimators)))
+
+        if self.n_estimators <= 0:
+            raise ValueError("n_estimators must be greater than zero, "
+                             "got {0}.".format(self.n_estimators))
+
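+        # Wrap the (possibly default) estimator in a ``Pipeline`` so that
+        # each bootstrap sample drawn by the bagging machinery is first
+        # re-balanced by ``RandomUnderSampler`` before the estimator is
+        # fitted on it.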
+        if self.base_estimator is not None:
+            base_estimator = self.base_estimator
+        else:
+            base_estimator = default
+
+        self.base_estimator_ = Pipeline(
+            [('sampler', RandomUnderSampler(ratio=self.ratio,
+                                            replacement=self.replacement)),
+             ('estimator', base_estimator)])