105 | 105 | # %% [markdown]
106 | 106 | # Roughly Balanced Bagging
107 | 107 | # ------------------------
108 | | -# FIXME: narration based on [3]_.
| 108 | +# While a :class:`~imblearn.under_sampling.RandomUnderSampler` or a
| 109 | +# :class:`~imblearn.over_sampling.RandomOverSampler` creates exactly the
| 110 | +# desired number of samples, this does not follow the statistical spirit of
| 111 | +# the bagging framework. The authors of [3]_ propose using a negative
| 112 | +# binomial distribution to compute the number of majority-class samples to
| 113 | +# select, and then performing a random under-sampling accordingly.
| 114 | +#
| 115 | +# Here, we illustrate this method by implementing a function in charge of
| 116 | +# the resampling and by using the :class:`~imblearn.FunctionSampler` to
| 117 | +# integrate it within a :class:`~imblearn.pipeline.Pipeline` and
| 118 | +# :class:`~sklearn.model_selection.cross_validate`.
109 | 119 |
110 | 120 | # %%
111 | 121 | from collections import Counter
112 | 122 | import numpy as np
113 | 123 | from imblearn import FunctionSampler
114 | 124 |
115 | 125 |
116 | | -def binomial_resampling(X, y):
| 126 | +def roughly_balanced_bagging(X, y, replace=False):
| 127 | +    """Implementation of Roughly Balanced Bagging for a binary problem."""
| 128 | +    # find the minority and majority classes
117 | 129 |     class_counts = Counter(y)
118 | 130 |     majority_class = max(class_counts, key=class_counts.get)
119 | 131 |     minority_class = min(class_counts, key=class_counts.get)
120 | 132 |
| 133 | +    # compute the number of samples to draw from the majority class using
| 134 | +    # a negative binomial distribution
121 | 135 |     n_minority_class = class_counts[minority_class]
122 | | -    n_majority_resampled = np.random.negative_binomial(n_minority_class, 0.5)
| 136 | +    n_majority_resampled = np.random.negative_binomial(n=n_minority_class, p=0.5)
123 | 137 |
| 138 | +    # draw randomly with or without replacement
124 | 139 |     majority_indices = np.random.choice(
125 | 140 |         np.flatnonzero(y == majority_class),
126 | 141 |         size=n_majority_resampled,
127 | | -        replace=True,
| 142 | +        replace=replace,
128 | 143 |     )
129 | 144 |     minority_indices = np.random.choice(
130 | 145 |         np.flatnonzero(y == minority_class),
131 | 146 |         size=n_minority_class,
132 | | -        replace=True,
| 147 | +        replace=replace,
133 | 148 |     )
134 | 149 |     indices = np.hstack([majority_indices, minority_indices])
135 | 150 |
136 | | -    X_res, y_res = X[indices], y[indices]
137 | | -    return X_res, y_res
| 151 | +    return X[indices], y[indices]
138 | 152 |
139 | 153 |
140 | 154 | # Roughly Balanced Bagging
141 | | -rbb = BalancedBaggingClassifier(sampler=FunctionSampler(func=binomial_resampling))
| 155 | +rbb = BalancedBaggingClassifier(
| 156 | +    sampler=FunctionSampler(func=roughly_balanced_bagging, kw_args={"replace": True})
| 157 | +)
142 | 158 | cv_results = cross_validate(rbb, X, y, scoring="balanced_accuracy")
143 | 159 |
144 | 160 | print(f"{cv_results['test_score'].mean():.3f} +/- {cv_results['test_score'].std():.3f}")
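
The negative binomial draw in the hunk above is what makes each bag only roughly balanced: with `n=n_minority_class` and `p=0.5`, the expected number of majority samples is `n * (1 - p) / p`, i.e. exactly the minority count, while the actual size varies from bag to bag. A minimal standalone sketch of that behaviour, using only NumPy (the `n_minority` value is illustrative, not taken from the example dataset):

```python
import numpy as np

rng = np.random.default_rng(0)
n_minority = 100  # illustrative minority-class count, not from the example data

# Size of the majority-class draw for many hypothetical bags: a negative
# binomial with n successes and p=0.5 has mean n * (1 - p) / p = n.
draws = rng.negative_binomial(n=n_minority, p=0.5, size=10_000)

print(f"mean majority size: {draws.mean():.1f} (target {n_minority})")
print(f"spread over bags: min={draws.min()}, max={draws.max()}")
```

On average each bag contains about as many majority as minority samples, which is the "roughly balanced" property the sampler relies on.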