"""
========================================================================
Model fitting on imbalanced dataset and comparison of methods to improve
- performance
+ its performance
========================================================================

This example illustrates the problem induced by learning on datasets having

###############################################################################
# This dataset is only slightly imbalanced. To better highlight the effect of
- # learning from imbalanced dataset, we will increase this ratio to 30:1
+ # learning from an imbalanced dataset, we will increase its ratio to 30:1

from imblearn.datasets import make_imbalance

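For context, a minimal sketch of how `make_imbalance` can enforce such a 30:1 ratio is shown below; `X` and `y` are assumed to hold the features and target loaded earlier in the example, and the way the majority/minority classes are identified here is only illustrative, not the exact code of the example.

from collections import Counter

from imblearn.datasets import make_imbalance

# Assumed: X (features) and y (binary target) were loaded earlier in the example.
class_counts = Counter(y)
majority_class, n_majority = class_counts.most_common()[0]
minority_class, _ = class_counts.most_common()[-1]

ratio = 30  # desired majority:minority ratio
X_res, y_res = make_imbalance(
    X, y, sampling_strategy={minority_class: n_majority // ratio}
)
print(Counter(y_res))  # the minority class is now roughly 30 times rarer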
###############################################################################

###############################################################################
- # We will first define an helper function which will train a given model
+ # We will first define a helper function which will train a given model
# and compute both accuracy and balanced accuracy. The results will be stored
# in a dataframe
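The helper itself is not visible in this excerpt. The following is a minimal sketch of what it could look like, reusing the `evaluate_classifier(clf, df_scores, clf_name=None)` signature visible in the hunk headers below; the train/test split variables (`X_train`, `X_test`, `y_train`, `y_test`) and the initially empty `df_scores` dataframe are assumptions.

import pandas as pd
from sklearn.metrics import balanced_accuracy_score

df_scores = pd.DataFrame()  # assumed accumulator for the results


def evaluate_classifier(clf, df_scores, clf_name=None):
    """Fit ``clf`` and append its accuracy and balanced accuracy to ``df_scores``."""
    if clf_name is None:
        clf_name = clf.__class__.__name__
    clf.fit(X_train, y_train)  # assumes a train/test split made earlier
    scores = pd.DataFrame(
        {
            "Accuracy": [clf.score(X_test, y_test)],
            "Balanced accuracy": [
                balanced_accuracy_score(y_test, clf.predict(X_test))
            ],
        },
        index=[clf_name],
    )
    return pd.concat([df_scores, scores])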
@@ -177,7 +177,7 @@ def evaluate_classifier(clf, df_scores, clf_name=None):

###############################################################################
# We can see that our linear model is learning slightly better than our dummy
- # baseline. However, it is impacted by class imbalanced.
+ # baseline. However, it is impacted by the class imbalance.
#
# We can verify that something similar is happening with a tree-based model
# such as `RandomForestClassifier`. With this type of classifier, we will not
@@ -247,7 +247,7 @@ def evaluate_classifier(clf, df_scores, clf_name=None):
#
# Another way is to resample the training set by under-sampling or
# over-sampling some of the samples. `imbalanced-learn` provides some samplers
- # to do such precessing.
+ # to do such processing.

from imblearn.pipeline import make_pipeline as make_pipeline_with_sampler
from imblearn.under_sampling import RandomUnderSampler
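As a rough illustration of what these imports enable (not the exact pipeline built further down in the file), a sampler can be chained in front of a classifier; the `LogisticRegression` model and its settings are assumptions, and the features are assumed to be numeric already.

from sklearn.linear_model import LogisticRegression

from imblearn.pipeline import make_pipeline as make_pipeline_with_sampler
from imblearn.under_sampling import RandomUnderSampler

lr_with_undersampling = make_pipeline_with_sampler(
    RandomUnderSampler(random_state=42),  # rebalances the training data only
    LogisticRegression(max_iter=1000),
)
# Evaluated with the helper sketched earlier (names are assumptions):
# df_scores = evaluate_classifier(
#     lr_with_undersampling, df_scores, "LR with under-sampling"
# )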
@@ -277,7 +277,7 @@ def evaluate_classifier(clf, df_scores, clf_name=None):
df_scores

###############################################################################
- # Applying a random under-sampler before to train the linear model or random
+ # Applying a random under-sampler before the training of the linear model or random
# forest, allows to not focus on the majority class at the cost of making more
# mistake for samples in the majority class (i.e. decreased accuracy).
#
@@ -290,7 +290,7 @@ def evaluate_classifier(clf, df_scores, clf_name=None):
# Use of `BalancedRandomForestClassifier` and `BalancedBaggingClassifier`
# .......................................................................
#
- # We already show that random under-sampling can be effective on decision tree.
+ # We already showed that random under-sampling can be effective on decision tree.
# However, instead of under-sampling once the dataset, one could under-sample
# the original dataset before to take a bootstrap sample. This is the base of
# the `BalancedRandomForestClassifier` and `BalancedBaggingClassifier`.
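A rough sketch of the idea follows: each tree of a `BalancedRandomForestClassifier` is grown on a bootstrap sample that has been re-balanced beforehand. The hyperparameter values are assumptions, not the ones used in the example.

from imblearn.ensemble import BalancedRandomForestClassifier

rf_clf = BalancedRandomForestClassifier(
    n_estimators=100,  # assumed value
    random_state=42,
    n_jobs=-1,
)
# rf_clf.fit(X_train, y_train)  # assumes numeric/encoded features as before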
@@ -306,7 +306,7 @@ def evaluate_classifier(clf, df_scores, clf_name=None):
df_scores

###############################################################################
- # The performance with the `BalancedRandomForestClassifier` are better than
+ # The performance with the `BalancedRandomForestClassifier` is better than
# applying a single random under-sampling. We will use a gradient-boosting
# classifier within a `BalancedBaggingClassifier`.
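A rough sketch of such a combination is shown below; the choice of `HistGradientBoostingClassifier` as the bagged estimator and the parameter values are assumptions, and older imbalanced-learn releases take the inner model through `base_estimator` rather than `estimator`.

from sklearn.ensemble import HistGradientBoostingClassifier

from imblearn.ensemble import BalancedBaggingClassifier

bag_clf = BalancedBaggingClassifier(
    estimator=HistGradientBoostingClassifier(random_state=42),
    n_estimators=10,  # assumed number of balanced bootstrap draws
    random_state=42,
    n_jobs=-1,
)
# bag_clf.fit(X_train, y_train)  # each GBDT sees a different balanced sample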
@@ -332,7 +332,7 @@ def evaluate_classifier(clf, df_scores, clf_name=None):
# to bring some diversity for the different GBDT to learn and not focus on a
# portion of the majority class.
#
- # We will repeat the same experiment but a ratio of 100:1 and make a similar
+ # We will repeat the same experiment but with a ratio of 100:1 and make a similar
# analysis.

###############################################################################