Commit 3aeddf3
ENH add BalancedBaggingClassifier
1 parent 333d81b commit 3aeddf3

File tree

3 files changed: +644 -5 lines changed

imblearn/ensemble/__init__.py

Lines changed: 2 additions & 2 deletions

@@ -3,7 +3,7 @@
 under-sampled subsets combined inside an ensemble.
 """
 
-from .easy_ensemble import EasyEnsemble
+from .easy_ensemble import EasyEnsemble, BalancedBaggingClassifier
 from .balance_cascade import BalanceCascade
 
-__all__ = ['EasyEnsemble', 'BalanceCascade']
+__all__ = ['EasyEnsemble', 'BalancedBaggingClassifier', 'BalanceCascade']
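With this re-export, the new classifier is importable directly from the subpackage; a one-line sanity check, not part of the commit:

    from imblearn.ensemble import BalancedBaggingClassifier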

imblearn/ensemble/easy_ensemble.py

Lines changed: 202 additions & 1 deletion

@@ -4,16 +4,44 @@
 # Christos Aridas
 # License: MIT
 
+import numbers
+
 import numpy as np
 
-from sklearn.utils import check_random_state
+import sklearn
+from sklearn.base import clone
+from sklearn.ensemble import BaggingClassifier
+from sklearn.ensemble.bagging import _generate_bagging_indices
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.utils import check_random_state, indices_to_mask
 
 from .base import BaseEnsembleSampler
+from ..pipeline import Pipeline
 from ..under_sampling import RandomUnderSampler
 
 MAX_INT = np.iinfo(np.int32).max
 
 
+old_generate = _generate_bagging_indices
+
+
+def _masked_bagging_indices(random_state, bootstrap_features,
+                            bootstrap_samples, n_features, n_samples,
+                            max_features, max_samples):
+    """Monkey-patch to always get a mask instead of indices."""
+    feature_indices, sample_indices = old_generate(random_state,
+                                                   bootstrap_features,
+                                                   bootstrap_samples,
+                                                   n_features, n_samples,
+                                                   max_features, max_samples)
+    sample_indices = indices_to_mask(sample_indices, n_samples)
+
+    return feature_indices, sample_indices
+
+
+sklearn.ensemble.bagging._generate_bagging_indices = _masked_bagging_indices
+
+
 class EasyEnsemble(BaseEnsembleSampler):
     """Create ensemble sets by iteratively applying random under-sampling.
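The monkey-patch above replaces scikit-learn's private `_generate_bagging_indices` so that the sample axis comes back as a boolean mask rather than an index array; `indices_to_mask` (imported above) does the conversion. A minimal standalone illustration, not part of the commit, assuming a scikit-learn version of this era where ``sklearn.utils.indices_to_mask`` is public:

    import numpy as np
    from sklearn.utils import indices_to_mask

    # In-bag indices drawn (with replacement) for one estimator over
    # 5 training samples; note the duplicate index 2.
    sample_indices = np.array([0, 2, 2, 4])

    # Boolean mask over all 5 samples; duplicates collapse to a single
    # True, which is exactly what the masked variant returns.
    mask = indices_to_mask(sample_indices, 5)
    print(mask)  # [ True False  True False  True]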
@@ -147,3 +175,176 @@ def _sample(self, X, y):
                     np.array(idx_under))
         else:
             return np.array(X_resampled), np.array(y_resampled)
+
+
+class BalancedBaggingClassifier(BaggingClassifier):
+    """A Bagging classifier with additional balancing.
+
+    This implementation of Bagging is similar to the scikit-learn
+    implementation. It includes an additional step to balance the training
+    set at fit time using a ``RandomUnderSampler``.
+
+    Read more in the :ref:`User Guide <bagging>`.
+
+    Parameters
+    ----------
+    base_estimator : object or None, optional (default=None)
+        The base estimator to fit on random subsets of the dataset.
+        If None, then the base estimator is a decision tree.
+
+    n_estimators : int, optional (default=10)
+        The number of base estimators in the ensemble.
+
+    max_samples : int or float, optional (default=1.0)
+        The number of samples to draw from X to train each base estimator.
+        - If int, then draw ``max_samples`` samples.
+        - If float, then draw ``max_samples * X.shape[0]`` samples.
+
+    max_features : int or float, optional (default=1.0)
+        The number of features to draw from X to train each base estimator.
+        - If int, then draw ``max_features`` features.
+        - If float, then draw ``max_features * X.shape[1]`` features.
+
+    bootstrap : boolean, optional (default=True)
+        Whether samples are drawn with replacement.
+
+    bootstrap_features : boolean, optional (default=False)
+        Whether features are drawn with replacement.
+
+    oob_score : bool, optional (default=False)
+        Whether to use out-of-bag samples to estimate
+        the generalization error.
+
+    warm_start : bool, optional (default=False)
+        When set to True, reuse the solution of the previous call to fit
+        and add more estimators to the ensemble; otherwise, just fit
+        a whole new ensemble.
+        .. versionadded:: 0.17
+           *warm_start* constructor parameter.
+
+    ratio : str, dict, or callable, optional (default='auto')
+        Ratio to use for resampling the data set.
+
+        - If ``str``, has to be one of: (i) ``'minority'``: resample the
+          minority class; (ii) ``'majority'``: resample the majority class;
+          (iii) ``'not minority'``: resample all classes apart from the
+          minority class; (iv) ``'all'``: resample all classes; and (v)
+          ``'auto'``: corresponds to ``'all'`` for over-sampling methods and
+          ``'not minority'`` for under-sampling methods. The classes targeted
+          will be over-sampled or under-sampled to achieve an equal number
+          of samples with the majority or minority class.
+        - If ``dict``, the keys correspond to the targeted classes. The
+          values correspond to the desired number of samples.
+        - If callable, a function taking ``y`` and returning a ``dict``. The
+          keys correspond to the targeted classes. The values correspond to
+          the desired number of samples.
+
+    replacement : bool, optional (default=False)
+        Whether or not to sample randomly with replacement.
+
+    n_jobs : int, optional (default=1)
+        The number of jobs to run in parallel for both ``fit`` and
+        ``predict``. If -1, then the number of jobs is set to the number
+        of cores.
+
+    random_state : int, RandomState instance or None, optional (default=None)
+        If int, random_state is the seed used by the random number generator;
+        If RandomState instance, random_state is the random number generator;
+        If None, the random number generator is the RandomState instance used
+        by ``np.random``.
+
+    verbose : int, optional (default=0)
+        Controls the verbosity of the building process.
+
+    Attributes
+    ----------
+    base_estimator_ : estimator
+        The base estimator from which the ensemble is grown.
+
+    estimators_ : list of estimators
+        The collection of fitted base estimators.
+
+    estimators_samples_ : list of arrays
+        The subset of drawn samples (i.e., the in-bag samples) for each base
+        estimator. Each subset is defined by a boolean mask.
+
+    estimators_features_ : list of arrays
+        The subset of drawn features for each base estimator.
+
+    classes_ : array of shape = [n_classes]
+        The class labels.
+
+    n_classes_ : int or list
+        The number of classes.
+
+    oob_score_ : float
+        Score of the training dataset obtained using an out-of-bag estimate.
+
+    oob_decision_function_ : array of shape = [n_samples, n_classes]
+        Decision function computed with out-of-bag estimate on the training
+        set. If n_estimators is small it might be possible that a data point
+        was never left out during the bootstrap. In this case,
+        ``oob_decision_function_`` might contain NaN.
+
+    References
+    ----------
+    .. [1] L. Breiman, "Pasting small votes for classification in large
+           databases and on-line", Machine Learning, 36(1), 85-103, 1999.
+    .. [2] L. Breiman, "Bagging predictors", Machine Learning, 24(2),
+           123-140, 1996.
+    .. [3] T. Ho, "The random subspace method for constructing decision
+           forests", Pattern Analysis and Machine Intelligence, 20(8),
+           832-844, 1998.
+    .. [4] G. Louppe and P. Geurts, "Ensembles on Random Patches", Machine
+           Learning and Knowledge Discovery in Databases, 346-361, 2012.
+
+    """
+    def __init__(self,
+                 base_estimator=None,
+                 n_estimators=10,
+                 max_samples=1.0,
+                 max_features=1.0,
+                 bootstrap=True,
+                 bootstrap_features=False,
+                 oob_score=False,
+                 warm_start=False,
+                 ratio='auto',
+                 replacement=False,
+                 n_jobs=1,
+                 random_state=None,
+                 verbose=0):
+
+        super(BaggingClassifier, self).__init__(
+            base_estimator,
+            n_estimators=n_estimators,
+            max_samples=max_samples,
+            max_features=max_features,
+            bootstrap=bootstrap,
+            bootstrap_features=bootstrap_features,
+            oob_score=oob_score,
+            warm_start=warm_start,
+            n_jobs=n_jobs,
+            random_state=random_state,
+            verbose=verbose)
+        self.ratio = ratio
+        self.replacement = replacement
+
+    def _validate_estimator(self, default=DecisionTreeClassifier()):
+        """Check the estimator and the n_estimators attribute, and set the
+        ``base_estimator_`` attribute."""
+        if not isinstance(self.n_estimators, (numbers.Integral, np.integer)):
+            raise ValueError("n_estimators must be an integer, "
+                             "got {0}.".format(type(self.n_estimators)))
+
+        if self.n_estimators <= 0:
+            raise ValueError("n_estimators must be greater than zero, "
+                             "got {0}.".format(self.n_estimators))
+
+        if self.base_estimator is not None:
+            base_estimator = clone(self.base_estimator)
+        else:
+            base_estimator = clone(default)
+
+        self.base_estimator_ = Pipeline(
+            [('sampler', RandomUnderSampler(ratio=self.ratio,
+                                            replacement=self.replacement)),
+             ('classifier', base_estimator)])
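Taken together, ``_validate_estimator`` wraps each base estimator in a ``Pipeline`` whose first step is a ``RandomUnderSampler``, so every bootstrap subset drawn by the bagging machinery is re-balanced before the classifier sees it. A usage sketch, not part of the commit; the dataset and split helpers are ordinary scikit-learn utilities, and the values used here are illustrative:

    from sklearn.datasets import make_classification
    from sklearn.model_selection import train_test_split
    from imblearn.ensemble import BalancedBaggingClassifier

    # A 2-class problem with roughly a 10% minority class.
    X, y = make_classification(n_samples=2000, n_features=20,
                               weights=[0.9, 0.1], random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        random_state=0)

    # Each of the 10 trees is fitted on a bootstrap that the internal
    # RandomUnderSampler first balances (ratio='auto' equalizes classes).
    bbc = BalancedBaggingClassifier(n_estimators=10, ratio='auto',
                                    random_state=0)
    bbc.fit(X_train, y_train)
    print(bbc.score(X_test, y_test))

Passing a ``dict`` for ``ratio`` would instead pin the desired number of samples per targeted class, as described in the docstring above.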
