Commit 96ef7d4

DOC effect and comparison to deal with imbalanced classification

1 parent 1b4aa9c

1 file changed: 82 additions & 0 deletions

@@ -0,0 +1,82 @@
"""
========================================================================
Model fitting on an imbalanced dataset and comparison of methods to
improve performance
========================================================================

This example illustrates the problems induced by learning on datasets with
imbalanced classes. Subsequently, we compare different approaches to
alleviate these negative effects.

"""

# Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
# License: MIT

print(__doc__)

###############################################################################
# Problem definition
###############################################################################

from sklearn.datasets import fetch_openml

df, y = fetch_openml('adult', version=2, as_frame=True, return_X_y=True)
# We drop the following features:
# - "fnlwgt": this feature was created while studying the "adult" dataset;
#   since it is not acquired during the survey, we will not use it.
# - "education-num": it encodes the same information as "education", so we
#   keep only one of these two features.
df = df.drop(columns=['fnlwgt', 'education-num'])
31+
32+
###############################################################################
33+
# The "adult" dataset as a class ratio of about 3:1
34+
35+
from collections import Counter
36+
37+
classes_count = y.value_counts()
38+
print(f"Classes information:\n{classes_count}")

###############################################################################
# This dataset is only slightly imbalanced. To better highlight the effect of
# learning from an imbalanced dataset, we will increase its ratio to 30:1.

from imblearn.datasets import make_imbalance

ratio = 30
df_res, y_res = make_imbalance(
    df, y, sampling_strategy={
        classes_count.idxmin(): classes_count.max() // ratio
    }
)
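
###############################################################################
# As a quick check, we can look at the new class distribution; the counts
# should now be close to the requested 30:1 ratio.

# assumes `y_res` is a pandas Series; `make_imbalance` preserves pandas inputs
print(f"Classes information after resampling:\n{y_res.value_counts()}")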

###############################################################################
# For the rest of this example, we will make a single split to get training
# and testing data. Note that, in practice, you should use cross-validation
# to get an estimate of the variation in performance.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df_res, y_res, stratify=y_res, random_state=42
)
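
###############################################################################
# As a quick sanity check, stratification should preserve the class
# distribution in the training split.

# normalized counts give the proportion of each class in the training target
print(f"Training target distribution:\n{y_train.value_counts(normalize=True)}")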

###############################################################################
# As a baseline, we could use a classifier that always predicts the majority
# class independently of the provided features.

from sklearn.dummy import DummyClassifier

dummy_clf = DummyClassifier(strategy="most_frequent")
score = dummy_clf.fit(X_train, y_train).score(X_test, y_test)
print(f"Accuracy score of a dummy classifier: {score:.3f}")

###############################################################################
# Instead of using the accuracy, we can use the balanced accuracy, which
# takes the class imbalance into account.

from sklearn.metrics import balanced_accuracy_score

y_pred = dummy_clf.predict(X_test)
score = balanced_accuracy_score(y_test, y_pred)
print(f"Balanced accuracy score of a dummy classifier: {score:.3f}")
