"""
========================================================================
Model fitting on imbalanced dataset and comparison of methods to improve
its performance
========================================================================

This example illustrates the problem induced by learning on datasets having
imbalanced classes. Subsequently, we compare different approaches alleviating
these negative effects.

"""

# Authors: Guillaume Lemaitre
# License: MIT

print(__doc__)

###############################################################################
# Problem definition
###############################################################################

from sklearn.datasets import fetch_openml

# Two features are discarded before any modelling:
# - "fnlwgt": this feature was created while studying the "adult" dataset.
#   It is not acquired during the survey, so we will not use it.
# - "education-num": it encodes the same information as "education".
#   Thus, we keep only one of these 2 features.
dropped_columns = ['fnlwgt', 'education-num']

df, y = fetch_openml(name='adult', version=2, return_X_y=True, as_frame=True)
df = df.drop(columns=dropped_columns)

###############################################################################
# The "adult" dataset has a class ratio of about 3:1

classes_count = y.value_counts()
classes_count

###############################################################################
# This dataset is only slightly imbalanced.
# To better highlight the effect of
# learning from an imbalanced dataset, we will increase its ratio to 30:1

from imblearn.datasets import make_imbalance

ratio = 30
# The class with the fewest samples (`idxmin`) is requested to keep
# `largest class count // ratio` samples. `random_state` is fixed, like for
# every other randomized step of this example, so that the same subset is
# drawn (and the same scores are reported) on every run.
df_res, y_res = make_imbalance(
    df, y,
    sampling_strategy={
        classes_count.idxmin(): classes_count.max() // ratio
    },
    random_state=42,
)
y_res.value_counts()

###############################################################################
# For the rest of the notebook, we will make a single split to get training
# and testing data. Note that you should use cross-validation to have an
# estimate of the performance variation in practice.

from sklearn.model_selection import train_test_split

# `stratify=y_res` keeps the class proportions identical in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    df_res, y_res, stratify=y_res, random_state=42
)

###############################################################################
# As a baseline, we could use a classifier which will always predict the
# majority class independently of the features provided.

from sklearn.dummy import DummyClassifier

dummy_clf = DummyClassifier(strategy="most_frequent")
score = dummy_clf.fit(X_train, y_train).score(X_test, y_test)
print(f"Accuracy score of a dummy classifier: {score:.3f}")

##############################################################################
# Instead of using the accuracy, we can use the balanced accuracy which will
# take into account the balancing issue.
from sklearn.metrics import balanced_accuracy_score

y_pred = dummy_clf.predict(X_test)
score = balanced_accuracy_score(y_test, y_pred)
print(f"Balanced accuracy score of a dummy classifier: {score:.3f}")

###############################################################################
# Strategies to learn from an imbalanced dataset
###############################################################################

###############################################################################
# We will first define a helper function which will train a given model
# and compute both accuracy and balanced accuracy. The results will be stored
# in a dataframe.

import pandas as pd


def evaluate_classifier(clf, df_scores, clf_name=None):
    """Fit `clf` on the training split and record its test scores.

    The data is not passed as arguments: the function reads the module-level
    `X_train`, `y_train`, `X_test` and `y_test` at call time, so rebinding
    these names changes the data used by subsequent calls.

    Parameters
    ----------
    clf : estimator
        Classifier (or pipeline ending with a classifier) exposing `fit` and
        `predict`. It is fitted in place.
    df_scores : pandas.DataFrame
        Scores collected so far, one column per evaluated model. It is not
        modified.
    clf_name : str, default=None
        Label of the new column. When `None`, the class name of `clf` is
        used, or the class name of its last step when `clf` is a `Pipeline`.

    Returns
    -------
    pandas.DataFrame
        A new dataframe made of `df_scores` plus one column holding the
        accuracy and the balanced accuracy of `clf`, rounded to 3 decimals.
    """
    from sklearn.metrics import accuracy_score
    from sklearn.pipeline import Pipeline

    if clf_name is None:
        final_estimator = clf[-1] if isinstance(clf, Pipeline) else clf
        clf_name = final_estimator.__class__.__name__

    # Predict only once and derive both metrics from the same predictions;
    # calling `clf.score` followed by `clf.predict` would transform and
    # predict the whole test set twice.
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    bal_acc = balanced_accuracy_score(y_test, y_pred)

    clf_score = pd.DataFrame(
        {clf_name: [acc, bal_acc]},
        index=['Accuracy', 'Balanced accuracy']
    )
    df_scores = pd.concat([df_scores, clf_score], axis=1).round(decimals=3)
    return df_scores


# Let's define an empty dataframe to store the results
df_scores = pd.DataFrame()

###############################################################################
# Dummy baseline
# ..............
#
# Before training a real machine learning model, we can store the results
# obtained with our `DummyClassifier`.

df_scores = evaluate_classifier(dummy_clf, df_scores, "Dummy")
df_scores

###############################################################################
# Linear classifier baseline
# ..........................
#
# We will create a machine learning pipeline using a `LogisticRegression`
# classifier.
# In this regard, we will need to one-hot encode the categorical
# columns and standardize the numerical columns before injecting the data into
# the `LogisticRegression` classifier.
#
# First, we define our numerical and categorical pipelines.

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import make_pipeline

# Numerical columns: standardize, then fill missing values with the mean and
# append a missing-value indicator.
num_pipe = make_pipeline(
    StandardScaler(),
    SimpleImputer(strategy="mean", add_indicator=True),
)
# Categorical columns: missing values become their own "missing" category
# before one-hot encoding; categories unseen during `fit` are ignored.
cat_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="missing"),
    OneHotEncoder(handle_unknown="ignore"),
)

###############################################################################
# Then, we can create a preprocessor which will dispatch the categorical
# columns to the categorical pipeline and the numerical columns to the
# numerical pipeline.

import numpy as np
from sklearn.compose import ColumnTransformer, make_column_selector as selector

# Columns are selected by dtype rather than by name.
linear_transformers = [
    ("num-pipe", num_pipe, selector(dtype_include=np.number)),
    ("cat-pipe", cat_pipe, selector(dtype_include=pd.CategoricalDtype)),
]
preprocessor_linear = ColumnTransformer(linear_transformers, n_jobs=2)

###############################################################################
# Finally, we connect our preprocessor with our `LogisticRegression`. We can
# then evaluate our model.

from sklearn.linear_model import LogisticRegression

lr_clf = make_pipeline(preprocessor_linear, LogisticRegression(max_iter=1000))
df_scores = evaluate_classifier(clf=lr_clf, df_scores=df_scores, clf_name="LR")
df_scores

###############################################################################
# We can see that our linear model is learning slightly better than our dummy
# baseline. However, it is impacted by the class imbalance.
#
# We can verify that something similar is happening with a tree-based model
# such as `RandomForestClassifier`.
# With this type of classifier, we will not
# need to scale the numerical data, and we will only need to ordinal encode the
# categorical data.

from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier

# Numerical columns are only imputed here: reusing the linear `num_pipe` would
# still standardize them, which contradicts the statement above.
num_pipe_tree = SimpleImputer(strategy="mean", add_indicator=True)
cat_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="missing"),
    OrdinalEncoder()
)

preprocessor_tree = ColumnTransformer(
    [("num-pipe", num_pipe_tree, selector(dtype_include=np.number)),
     ("cat-pipe", cat_pipe, selector(dtype_include=pd.CategoricalDtype))],
    n_jobs=2
)

rf_clf = make_pipeline(
    preprocessor_tree, RandomForestClassifier(random_state=42, n_jobs=2)
)

df_scores = evaluate_classifier(rf_clf, df_scores, "RF")
df_scores

###############################################################################
# The `RandomForestClassifier` is also affected by the class imbalance,
# slightly less than the linear model. Now, we will present different
# approaches to improve the performance of these 2 models.
#
# Use `class_weight`
# ..................
#
# Most of the models in `scikit-learn` have a parameter `class_weight`. This
# parameter will affect the computation of the loss in linear model or the
# criterion in the tree-based model to penalize differently a false
# classification from the minority and majority class. We can set
# `class_weight="balanced"` such that the weight applied is inversely
# proportional to the class frequency. We test this parametrization in both
# linear model and tree-based model.
# `set_params` updates the existing pipeline in place: the
# `logisticregression__` prefix addresses the step created by `make_pipeline`.
# The pipeline is then refitted by `evaluate_classifier`.
lr_clf.set_params(logisticregression__class_weight="balanced")
df_scores = evaluate_classifier(
    lr_clf, df_scores, "LR with class weight"
)
df_scores

###############################################################################
#

# Same in-place update for the random forest pipeline.
rf_clf.set_params(randomforestclassifier__class_weight="balanced")
df_scores = evaluate_classifier(
    rf_clf, df_scores, "RF with class weight"
)
df_scores

###############################################################################
# We can see that using `class_weight` was really effective for the linear
# model, alleviating the issue of learning from imbalanced classes. However,
# the `RandomForestClassifier` is still biased toward the majority class,
# mainly due to the criterion which is not suited enough to fight the class
# imbalance.
#
# Resample the training set during learning
# .........................................
#
# Another way is to resample the training set by under-sampling or
# over-sampling some of the samples. `imbalanced-learn` provides some samplers
# to do such processing.
from imblearn.pipeline import make_pipeline as make_pipeline_with_sampler
from imblearn.under_sampling import RandomUnderSampler

# The sampler sits between the preprocessing and the classifier.
lr_clf = make_pipeline_with_sampler(
    preprocessor_linear,
    RandomUnderSampler(random_state=42),
    LogisticRegression(max_iter=1000),
)
df_scores = evaluate_classifier(
    clf=lr_clf, df_scores=df_scores, clf_name="LR with under-sampling"
)
df_scores

###############################################################################
#

rf_clf = make_pipeline_with_sampler(
    preprocessor_tree,
    RandomUnderSampler(random_state=42),
    RandomForestClassifier(random_state=42, n_jobs=2),
)
df_scores = evaluate_classifier(
    clf=rf_clf, df_scores=df_scores, clf_name="RF with under-sampling"
)
df_scores

###############################################################################
# Applying a random under-sampler before the training of the linear model or
# random forest avoids focusing on the majority class, at the cost of making
# more mistakes for samples in the majority class (i.e. decreased accuracy).
#
# We could apply any type of sampler and find which sampler is working best
# on the current dataset.
#
# Instead, we will present another way by using classifiers which will apply
# sampling internally.
#
# Use of `BalancedRandomForestClassifier` and `BalancedBaggingClassifier`
# .......................................................................
#
# We already showed that random under-sampling can be effective on decision
# trees. However, instead of under-sampling the dataset once, one could
# under-sample the original dataset before taking a bootstrap sample. This is
# the basis of the `BalancedRandomForestClassifier` and
# `BalancedBaggingClassifier`.
from imblearn.ensemble import BalancedRandomForestClassifier

# A regular (sampler-free) pipeline is enough here: no separate sampler step
# is added in front of the classifier.
rf_clf = make_pipeline(
    preprocessor_tree,
    BalancedRandomForestClassifier(random_state=42, n_jobs=2)
)

df_scores = evaluate_classifier(rf_clf, df_scores, "Balanced RF")
df_scores

###############################################################################
# The performance with the `BalancedRandomForestClassifier` is better than
# applying a single random under-sampling. We will use a gradient-boosting
# classifier within a `BalancedBaggingClassifier`.

# The name imported from `sklearn.experimental` is never used below: it is
# imported only for its side effect and must stay before the import of
# `HistGradientBoostingClassifier`.
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier
from imblearn.ensemble import BalancedBaggingClassifier

# Bagging of 10 gradient-boosting models.
bag_clf = make_pipeline(
    preprocessor_tree,
    BalancedBaggingClassifier(
        base_estimator=HistGradientBoostingClassifier(random_state=42),
        n_estimators=10, random_state=42, n_jobs=2
    )
)

df_scores = evaluate_classifier(
    bag_clf, df_scores, "Balanced bagging"
)
df_scores

###############################################################################
# This last approach is the most effective. The different under-samplings
# bring some diversity for the different GBDTs to learn from, so that they do
# not focus on a single portion of the majority class.
#
# We will repeat the same experiment but with a ratio of 100:1 and make a
# similar analysis.
###############################################################################
# Increase imbalanced ratio
###############################################################################

ratio = 100
# Seeded like every other randomized step so that the reported scores are
# reproducible from one run to the next.
df_res, y_res = make_imbalance(
    df, y,
    sampling_strategy={
        classes_count.idxmin(): classes_count.max() // ratio
    },
    random_state=42,
)
# `evaluate_classifier` reads these module-level names at call time, so
# rebinding them is what switches every evaluation below to the new data.
X_train, X_test, y_train, y_test = train_test_split(
    df_res, y_res, stratify=y_res, random_state=42
)

# Start a fresh table of scores for this second experiment.
df_scores = pd.DataFrame()
df_scores = evaluate_classifier(dummy_clf, df_scores, "Dummy")

# Baselines
lr_clf = make_pipeline(
    preprocessor_linear, LogisticRegression(max_iter=1000)
)
df_scores = evaluate_classifier(lr_clf, df_scores, "LR")
rf_clf = make_pipeline(
    preprocessor_tree, RandomForestClassifier(random_state=42, n_jobs=2)
)
df_scores = evaluate_classifier(rf_clf, df_scores, "RF")

# Class weights (the two baseline pipelines are updated in place)
lr_clf.set_params(logisticregression__class_weight="balanced")
df_scores = evaluate_classifier(
    lr_clf, df_scores, "LR with class weight"
)
rf_clf.set_params(randomforestclassifier__class_weight="balanced")
df_scores = evaluate_classifier(
    rf_clf, df_scores, "RF with class weight"
)

# Random under-sampling before the classifier
lr_clf = make_pipeline_with_sampler(
    preprocessor_linear,
    RandomUnderSampler(random_state=42),
    LogisticRegression(max_iter=1000)
)
df_scores = evaluate_classifier(
    lr_clf, df_scores, "LR with under-sampling"
)
rf_clf = make_pipeline_with_sampler(
    preprocessor_tree,
    RandomUnderSampler(random_state=42),
    RandomForestClassifier(random_state=42, n_jobs=2)
)
df_scores = evaluate_classifier(
    rf_clf, df_scores, "RF with under-sampling"
)

# Classifiers applying the sampling internally
rf_clf = make_pipeline(
    preprocessor_tree,
    BalancedRandomForestClassifier(random_state=42, n_jobs=2)
)
# The label is given explicitly: without it the column would be named after
# the class ("BalancedRandomForestClassifier") instead of "Balanced RF" as in
# the 30:1 experiment.
df_scores = evaluate_classifier(rf_clf, df_scores, "Balanced RF")
# `bag_clf` is the pipeline built for the 30:1 experiment; it is refitted on
# the new training split by `evaluate_classifier`.
df_scores = evaluate_classifier(
    bag_clf, df_scores, "Balanced bagging"
)
df_scores

###############################################################################
# When we analyse the results, we can draw conclusions similar to those of the
# previous discussion. However, we can observe that the strategy
# `class_weight="balanced"` does not improve the performance when using a
# `RandomForestClassifier`. A resampling is indeed required. The most effective
# method remains the `BalancedBaggingClassifier` using a GBDT as a base
# learner.