|
4 | 4 | # Christos Aridas
|
5 | 5 | # License: MIT
|
6 | 6 |
|
| 7 | +import numbers |
| 8 | + |
7 | 9 | import numpy as np
|
8 | 10 |
|
| 11 | +from sklearn.base import clone |
9 | 12 | from sklearn.utils import check_random_state
|
| 13 | +from sklearn.ensemble import AdaBoostClassifier |
| 14 | +from sklearn.ensemble.bagging import BaggingClassifier |
| 15 | +from sklearn.utils.deprecation import deprecated |
10 | 16 |
|
11 | 17 | from .base import BaseEnsembleSampler
|
12 | 18 | from ..under_sampling import RandomUnderSampler
|
13 | 19 | from ..under_sampling.base import BaseUnderSampler
|
14 | 20 | from ..utils import Substitution
|
15 | 21 | from ..utils._docstring import _random_state_docstring
|
| 22 | +from ..pipeline import Pipeline |
16 | 23 |
|
17 | 24 | MAX_INT = np.iinfo(np.int32).max
|
18 | 25 |
|
19 | 26 |
|
20 | 27 | @Substitution(
|
21 | 28 | sampling_strategy=BaseUnderSampler._sampling_strategy_docstring,
|
22 | 29 | random_state=_random_state_docstring)
|
| 30 | +@deprecated('EasyEnsemble is deprecated in 0.4 and will be removed in 0.6. ' |
| 31 | + 'Use EasyEnsembleClassifier instead.') |
23 | 32 | class EasyEnsemble(BaseEnsembleSampler):
|
24 | 33 | """Create an ensemble sets by iteratively applying random under-sampling.
|
25 | 34 |
|
26 | 35 | This method iteratively select a random subset and make an ensemble of the
|
27 | 36 | different sets.
|
28 | 37 |
|
| 38 | + .. deprecated:: 0.4 |
| 39 | + ``EasyEnsemble`` is deprecated in 0.4 and will be removed in 0.6. Use |
| 40 | + ``EasyEnsembleClassifier`` instead. |
| 41 | +
|
29 | 42 | Read more in the :ref:`User Guide <ensemble_samplers>`.
|
30 | 43 |
|
31 | 44 | Parameters
|
@@ -126,3 +139,161 @@ def _sample(self, X, y):
|
126 | 139 | np.array(idx_under))
|
127 | 140 | else:
|
128 | 141 | return np.array(X_resampled), np.array(y_resampled)
|
| 142 | + |
| 143 | + |
@Substitution(
    sampling_strategy=BaseUnderSampler._sampling_strategy_docstring,
    random_state=_random_state_docstring)
class EasyEnsembleClassifier(BaggingClassifier):
    """Bag of balanced boosted learners also known as EasyEnsemble.

    This algorithm is known as EasyEnsemble [1]_. The classifier is an
    ensemble of AdaBoost learners trained on different balanced bootstrap
    samples. The balancing is achieved by random under-sampling.

    Read more in the :ref:`User Guide <ensemble_samplers>`.

    Parameters
    ----------
    n_estimators : int, optional (default=10)
        Number of AdaBoost learners in the ensemble.

    base_estimator : object, optional (default=AdaBoostClassifier())
        The base AdaBoost classifier used in the inner ensemble. Note that you
        can set the number of inner learners by passing your own instance.

    warm_start : bool, optional (default=False)
        When set to True, reuse the solution of the previous call to fit
        and add more estimators to the ensemble, otherwise, just fit
        a whole new ensemble.

    {sampling_strategy}

    replacement : bool, optional (default=False)
        Whether or not to sample randomly with replacement.

    n_jobs : int, optional (default=1)
        The number of jobs to run in parallel for both `fit` and `predict`.
        If -1, then the number of jobs is set to the number of cores.

    {random_state}

    verbose : int, optional (default=0)
        Controls the verbosity of the building process.

    Attributes
    ----------
    base_estimator_ : estimator
        The base estimator from which the ensemble is grown.

    estimators_ : list of estimators
        The collection of fitted base estimators.

    classes_ : array, shape (n_classes,)
        The class labels.

    n_classes_ : int or list
        The number of classes.

    Notes
    -----
    The method is described in [1]_.

    Supports multi-class resampling by sampling each class independently.

    See also
    --------
    BalanceCascade, BalancedBaggingClassifier

    References
    ----------
    .. [1] X. Y. Liu, J. Wu and Z. H. Zhou, "Exploratory Undersampling for
       Class-Imbalance Learning," in IEEE Transactions on Systems, Man, and
       Cybernetics, Part B (Cybernetics), vol. 39, no. 2, pp. 539-550,
       April 2009.

    Examples
    --------

    >>> from collections import Counter
    >>> from sklearn.datasets import make_classification
    >>> from sklearn.model_selection import train_test_split
    >>> from sklearn.metrics import confusion_matrix
    >>> from imblearn.ensemble import \
EasyEnsembleClassifier # doctest: +NORMALIZE_WHITESPACE
    >>> X, y = make_classification(n_classes=2, class_sep=2,
    ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0,
    ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)
    >>> print('Original dataset shape %s' % Counter(y))
    Original dataset shape Counter({{1: 900, 0: 100}})
    >>> X_train, X_test, y_train, y_test = train_test_split(X, y,
    ... random_state=0)
    >>> eec = EasyEnsembleClassifier(random_state=42)
    >>> eec.fit(X_train, y_train) # doctest: +ELLIPSIS
    EasyEnsembleClassifier(...)
    >>> y_pred = eec.predict(X_test)
    >>> print(confusion_matrix(y_test, y_pred))
    [[ 23   0]
     [  2 225]]

    """
    def __init__(self, n_estimators=10, base_estimator=None, warm_start=False,
                 sampling_strategy='auto', replacement=False, n_jobs=1,
                 random_state=None, verbose=0):
        # Bagging over the full sample (max_samples=1.0, bootstrap=False):
        # the balancing/resampling is delegated to the RandomUnderSampler
        # inside the per-estimator pipeline built in _validate_estimator.
        super(EasyEnsembleClassifier, self).__init__(
            base_estimator,
            n_estimators=n_estimators,
            max_samples=1.0,
            max_features=1.0,
            bootstrap=False,
            bootstrap_features=False,
            oob_score=False,
            warm_start=warm_start,
            n_jobs=n_jobs,
            random_state=random_state,
            verbose=verbose)
        self.sampling_strategy = sampling_strategy
        self.replacement = replacement

    def _validate_estimator(self, default=AdaBoostClassifier()):
        """Check the estimator and the n_estimator attribute, set the
        `base_estimator_` attribute."""
        # NOTE: the mutable default instance is safe here because it is
        # always clone()d before use, never fitted in place.
        if not isinstance(self.n_estimators, (numbers.Integral, np.integer)):
            raise ValueError("n_estimators must be an integer, "
                             "got {0}.".format(type(self.n_estimators)))

        if self.n_estimators <= 0:
            raise ValueError("n_estimators must be greater than zero, "
                             "got {0}.".format(self.n_estimators))

        if self.base_estimator is not None:
            base_estimator = clone(self.base_estimator)
        else:
            base_estimator = clone(default)

        # Each bagging member is a pipeline that first balances its sample
        # by random under-sampling, then fits the boosted classifier.
        self.base_estimator_ = Pipeline(
            [('sampler', RandomUnderSampler(
                sampling_strategy=self.sampling_strategy,
                replacement=self.replacement)),
             ('classifier', base_estimator)])

    def fit(self, X, y):
        """Build a Bagging ensemble of AdaBoost classifiers using balanced
        bootstrap with random under-sampling.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The training input samples.

        y : array-like, shape (n_samples,)
            The target values.

        Returns
        -------
        self : object
            Returns self.
        """
        # RandomUnderSampler is not supporting sample_weight. We need to pass
        # None.
        return self._fit(X, y, self.max_samples, sample_weight=None)
0 commit comments