"""
=================================
Bagging classifiers using sampler
=================================

In this example, we show how
:class:`~imblearn.ensemble.BalancedBaggingClassifier` can be used to create a
large variety of classifiers by giving different samplers.

We will give several examples that have been published over the years.
"""

# Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
# License: MIT

# %%
print(__doc__)

# %% [markdown]
# Generate an imbalanced dataset
# ------------------------------
#
# For this example, we will create a synthetic dataset using the function
# :func:`~sklearn.datasets.make_classification`. The problem will be a toy
# classification problem with a ratio of 1:9 between the two classes.

# %%
from sklearn.datasets import make_classification

X, y = make_classification(
    n_samples=10_000,
    n_features=10,
    weights=[0.1, 0.9],
    class_sep=0.5,
    random_state=0,
)

# %%
import pandas as pd

pd.Series(y).value_counts(normalize=True)
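
# %% [markdown]
# Equivalently (a small sketch, not part of the original example), the raw
# class counts can be checked with the standard library to confirm the 1:9
# ratio.

# %%
from collections import Counter

# Raw counts per class; class 0 is the minority with roughly 10% of samples
Counter(y)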

# %% [markdown]
# In the following sections, we will show a couple of algorithms that have
# been proposed over the years. We intend to illustrate how one can reuse the
# :class:`~imblearn.ensemble.BalancedBaggingClassifier` by passing different
# samplers.

# %%
from sklearn.model_selection import cross_validate
from sklearn.ensemble import BaggingClassifier

# Vanilla bagging, without any resampling, serves as a baseline
bagging = BaggingClassifier()
cv_results = cross_validate(bagging, X, y, scoring="balanced_accuracy")

print(f"{cv_results['test_score'].mean():.3f} +/- {cv_results['test_score'].std():.3f}")

# %% [markdown]
# Exactly Balanced Bagging and Over-Bagging
# -----------------------------------------
#
# The :class:`~imblearn.ensemble.BalancedBaggingClassifier` can be used in
# conjunction with a :class:`~imblearn.under_sampling.RandomUnderSampler` or a
# :class:`~imblearn.over_sampling.RandomOverSampler`. These methods are
# referred to as Exactly Balanced Bagging and Over-Bagging, respectively, and
# were first proposed in [1]_.

# %%
from imblearn.ensemble import BalancedBaggingClassifier
from imblearn.under_sampling import RandomUnderSampler

# Exactly Balanced Bagging
ebb = BalancedBaggingClassifier(sampler=RandomUnderSampler())
cv_results = cross_validate(ebb, X, y, scoring="balanced_accuracy")

print(f"{cv_results['test_score'].mean():.3f} +/- {cv_results['test_score'].std():.3f}")

# %%
from imblearn.over_sampling import RandomOverSampler

# Over-bagging
over_bagging = BalancedBaggingClassifier(sampler=RandomOverSampler())
cv_results = cross_validate(over_bagging, X, y, scoring="balanced_accuracy")

print(f"{cv_results['test_score'].mean():.3f} +/- {cv_results['test_score'].std():.3f}")
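
# %% [markdown]
# To see how the sampler is reused (a minimal sketch, not part of the original
# example), we can fit one of the models and inspect its fitted
# ``estimators_``: each member is expected to be a pipeline chaining the
# sampler with the base classifier, so every bootstrap is resampled before
# training.

# %%
over_bagging.fit(X, y)
# First fitted member of the ensemble; it should embed the sampler
print(over_bagging.estimators_[0])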

# %% [markdown]
# SMOTE-Bagging
# -------------
#
# Instead of using a :class:`~imblearn.over_sampling.RandomOverSampler`, which
# makes a bootstrap, an alternative is to use
# :class:`~imblearn.over_sampling.SMOTE` as an over-sampler. This is known as
# SMOTE-Bagging [2]_.

# %%
from imblearn.over_sampling import SMOTE

# SMOTE-Bagging
smote_bagging = BalancedBaggingClassifier(sampler=SMOTE())
cv_results = cross_validate(smote_bagging, X, y, scoring="balanced_accuracy")

print(f"{cv_results['test_score'].mean():.3f} +/- {cv_results['test_score'].std():.3f}")
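
# %% [markdown]
# As a recap (a small sketch, not part of the published methods), the three
# variants above differ only in the sampler passed to
# :class:`~imblearn.ensemble.BalancedBaggingClassifier`, so they can be
# compared within a single loop.

# %%
samplers = {
    "Exactly Balanced Bagging": RandomUnderSampler(),
    "Over-Bagging": RandomOverSampler(),
    "SMOTE-Bagging": SMOTE(),
}
for name, sampler in samplers.items():
    model = BalancedBaggingClassifier(sampler=sampler)
    cv_results = cross_validate(model, X, y, scoring="balanced_accuracy")
    print(
        f"{name}: {cv_results['test_score'].mean():.3f} +/- "
        f"{cv_results['test_score'].std():.3f}"
    )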

# %% [markdown]
# .. topic:: References:
#
#    .. [1] R. Maclin, and D. Opitz. "An empirical evaluation of bagging and
#       boosting." AAAI/IAAI 1997 (1997): 546-551.
#
#    .. [2] S. Wang, and X. Yao. "Diversity analysis on imbalanced data sets
#       by using ensemble models." 2009 IEEE Symposium on Computational
#       Intelligence and Data Mining. IEEE, 2009.