from collections import Counter

classes_count = y.value_counts()
classes_count

###############################################################################
# This dataset is only slightly imbalanced. To better highlight the effect of

        classes_count.idxmin(): classes_count.max() // ratio
    }
)
y_res.value_counts()

###############################################################################
# For the rest of the notebook, we will make a single split to get training

y_pred = dummy_clf.predict(X_test)
score = balanced_accuracy_score(y_test, y_pred)
print(f"Balanced accuracy score of a dummy classifier: {score:.3f}")

###############################################################################
# Strategies to learn from an imbalanced dataset
###############################################################################

###############################################################################
# We will first define a helper function which trains a given model and
# computes both the accuracy and the balanced accuracy. The results will be
# stored in a dataframe.

import pandas as pd

from sklearn.pipeline import Pipeline


def evaluate_classifier(clf, df_scores, clf_name=None):
    """Fit `clf`, score it on the test set, and append its accuracy and
    balanced accuracy as a new column of `df_scores`."""
    if clf_name is None:
        if isinstance(clf, Pipeline):
            clf_name = clf[-1].__class__.__name__
        else:
            clf_name = clf.__class__.__name__
    acc = clf.fit(X_train, y_train).score(X_test, y_test)
    y_pred = clf.predict(X_test)
    bal_acc = balanced_accuracy_score(y_test, y_pred)
    clf_score = pd.DataFrame(
        {clf_name: [acc, bal_acc]},
        index=["Accuracy", "Balanced accuracy"]
    )
    df_scores = pd.concat([df_scores, clf_score], axis=1).round(decimals=3)
    return df_scores


# Let's define an empty dataframe to store the results.
df_scores = pd.DataFrame()

###############################################################################
# Dummy baseline
# ..............
#
# Before training a real machine learning model, we can store the results
# obtained with our `DummyClassifier`.

df_scores = evaluate_classifier(dummy_clf, df_scores, "Dummy")
df_scores

###############################################################################
# Linear classifier baseline
# ..........................
#
# We will create a machine learning pipeline using a `LogisticRegression`
# classifier. To this end, we need to one-hot encode the categorical columns
# and standardize the numerical columns before feeding the data to the
# `LogisticRegression` classifier.
#
# First, we define our numerical and categorical pipelines.

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline

num_pipe = make_pipeline(
    StandardScaler(), SimpleImputer(strategy="mean", add_indicator=True)
)
cat_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="missing"),
    OneHotEncoder(handle_unknown="ignore")
)

###############################################################################
# Then, we can create a preprocessor which dispatches the categorical columns
# to the categorical pipeline and the numerical columns to the numerical
# pipeline.

import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector

preprocessor_linear = ColumnTransformer(
    [("num-pipe", num_pipe, selector(dtype_include=np.number)),
     ("cat-pipe", cat_pipe, selector(dtype_include=pd.CategoricalDtype))]
)

###############################################################################
# Finally, we connect our preprocessor with our `LogisticRegression`. We can
# then evaluate our model.

from sklearn.linear_model import LogisticRegression

lr_clf = make_pipeline(
    preprocessor_linear, LogisticRegression(max_iter=1000)
)
df_scores = evaluate_classifier(lr_clf, df_scores, "LR")
df_scores

###############################################################################
# We can see that our linear model is learning slightly better than our dummy
# baseline. However, it is impacted by the class imbalance.
#
# We can verify that something similar is happening with a tree-based model
# such as `RandomForestClassifier`. With this type of classifier, we do not
# need to scale the numerical data, and we only need to ordinal encode the
# categorical data.

from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier

cat_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="missing"),
    OrdinalEncoder()
)

preprocessor_tree = ColumnTransformer(
    [("num-pipe", num_pipe, selector(dtype_include=np.number)),
     ("cat-pipe", cat_pipe, selector(dtype_include=pd.CategoricalDtype))]
)

rf_clf = make_pipeline(
    preprocessor_tree, RandomForestClassifier(random_state=42)
)

df_scores = evaluate_classifier(rf_clf, df_scores, "RF")
df_scores

###############################################################################
# The `RandomForestClassifier` is also affected by the class imbalance,
# although slightly less than the linear model. Now, we will present different
# approaches to improve the performance of these two models.
#
# Use `class_weight`
# ..................
#
# Most of the models in `scikit-learn` have a parameter `class_weight`. This
# parameter affects the computation of the loss in linear models or of the
# splitting criterion in tree-based models, penalizing differently a
# misclassification of the minority class and of the majority class. We can
# set `class_weight="balanced"` so that the weight applied to each class is
# inversely proportional to its frequency. We test this parametrization on
# both the linear model and the tree-based model.
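
###############################################################################
# As a quick aside (this sketch is not part of the original analysis), the
# weights assigned by `class_weight="balanced"` can be inspected with
# `compute_class_weight`: they are inversely proportional to the class
# frequencies observed on the training set.

from sklearn.utils.class_weight import compute_class_weight

classes = np.unique(y_train)
balanced_weights = compute_class_weight(
    "balanced", classes=classes, y=y_train
)
print(dict(zip(classes, balanced_weights)))

###############################################################################
# We now activate `class_weight="balanced"` on the linear model and
# re-evaluate it.
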
lr_clf.set_params(logisticregression__class_weight="balanced")
df_scores = evaluate_classifier(
    lr_clf, df_scores, "LR with class weight"
)
df_scores

###############################################################################
# We apply the same parameter to the random forest.

rf_clf.set_params(randomforestclassifier__class_weight="balanced")
df_scores = evaluate_classifier(
    rf_clf, df_scores, "RF with class weight"
)
df_scores

###############################################################################
# We can see that using `class_weight` was really effective for the linear
# model, alleviating the issue of learning from imbalanced classes. However,
# the `RandomForestClassifier` is still biased toward the majority class,
# mainly because its splitting criterion is not well suited to counteracting
# the class imbalance.
#
# Resample the training set during learning
# .........................................
#
# Another way is to resample the training set by under-sampling or
# over-sampling some of the samples. `imbalanced-learn` provides samplers to
# do such processing.
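
###############################################################################
# As a small illustration (this sketch goes beyond the original example),
# samplers expose a `fit_resample` method. Applying a `RandomUnderSampler`
# directly to the training set shows the balanced class counts it produces;
# the `Counter` imported earlier comes in handy here.

from imblearn.under_sampling import RandomUnderSampler

sampler = RandomUnderSampler(random_state=42)
_, y_train_resampled = sampler.fit_resample(X_train, y_train)
print(f"Class counts after under-sampling: {Counter(y_train_resampled)}")

###############################################################################
# To resample during learning, we instead plug the sampler inside a pipeline
# built with the dedicated `make_pipeline` helper of `imbalanced-learn`.
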
from imblearn.pipeline import make_pipeline as make_pipeline_with_sampler
from imblearn.under_sampling import RandomUnderSampler

lr_clf = make_pipeline_with_sampler(
    preprocessor_linear,
    RandomUnderSampler(random_state=42),
    LogisticRegression(max_iter=1000)
)
df_scores = evaluate_classifier(
    lr_clf, df_scores, "LR with under-sampling"
)
df_scores

###############################################################################
# The same strategy applies to the random forest.

rf_clf = make_pipeline_with_sampler(
    preprocessor_tree,
    RandomUnderSampler(random_state=42),
    RandomForestClassifier(random_state=42)
)

df_scores = evaluate_classifier(
    rf_clf, df_scores, "RF with under-sampling"
)
df_scores

###############################################################################
# Applying a random under-sampler before training the linear model or the
# random forest keeps them from focusing on the majority class, at the cost of
# making more mistakes on samples from the majority class (i.e. a decreased
# accuracy).
#
# We could apply any type of sampler and search for the one which works best
# on the current dataset, for instance an over-sampler as sketched below.
#
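# As an illustration (this sketch is not part of the original analysis), here
# is how a `RandomOverSampler` could be swapped into the very same pipeline;
# the result is stored in a throw-away dataframe so that the main results
# table is left untouched.

from imblearn.over_sampling import RandomOverSampler

lr_over_clf = make_pipeline_with_sampler(
    preprocessor_linear,
    RandomOverSampler(random_state=42),
    LogisticRegression(max_iter=1000)
)
evaluate_classifier(lr_over_clf, pd.DataFrame(), "LR with over-sampling")

###############################################################################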
# Instead, we will present another approach: classifiers that apply the
# resampling internally.
#
# Use of `BalancedRandomForestClassifier` and `BalancedBaggingClassifier`
# .......................................................................
#
# We already showed that random under-sampling can be effective on a decision
# tree. However, instead of under-sampling the dataset only once, one can
# under-sample the original dataset before drawing each bootstrap sample. This
# is the basis of the `BalancedRandomForestClassifier` and
# `BalancedBaggingClassifier`.

from imblearn.ensemble import BalancedRandomForestClassifier

rf_clf = make_pipeline(
    preprocessor_tree,
    BalancedRandomForestClassifier(random_state=42)
)

df_scores = evaluate_classifier(rf_clf, df_scores, "Balanced RF")
df_scores

###############################################################################
# The performance of the `BalancedRandomForestClassifier` is better than
# applying a single random under-sampling. We will use a gradient-boosting
# classifier within a `BalancedBaggingClassifier`.

# This import enables the experimental HistGradientBoostingClassifier.
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier
from imblearn.ensemble import BalancedBaggingClassifier

bag_clf = make_pipeline(
    preprocessor_tree,
    BalancedBaggingClassifier(
        base_estimator=HistGradientBoostingClassifier(random_state=42),
        n_estimators=10, random_state=42
    )
)

df_scores = evaluate_classifier(
    bag_clf, df_scores, "Balanced bagging"
)
df_scores

###############################################################################
# This last approach is the most effective. The different under-samplings
# bring some diversity: each GBDT in the ensemble is trained on a different
# balanced subset instead of focusing on a single portion of the majority
# class.
#
# We will repeat the same experiment, but with a ratio of 100:1, and make a
# similar analysis.

###############################################################################
# Increase the imbalance ratio
###############################################################################

ratio = 100
df_res, y_res = make_imbalance(
    df, y, sampling_strategy={
        classes_count.idxmin(): classes_count.max() // ratio
    }
)
X_train, X_test, y_train, y_test = train_test_split(
    df_res, y_res, stratify=y_res, random_state=42
)

# Re-evaluate all the previous models on this more imbalanced version of the
# dataset.
df_scores = pd.DataFrame()
df_scores = evaluate_classifier(dummy_clf, df_scores, "Dummy")
lr_clf = make_pipeline(
    preprocessor_linear, LogisticRegression(max_iter=1000)
)
df_scores = evaluate_classifier(lr_clf, df_scores, "LR")
rf_clf = make_pipeline(
    preprocessor_tree, RandomForestClassifier(random_state=42)
)
df_scores = evaluate_classifier(rf_clf, df_scores, "RF")
lr_clf.set_params(logisticregression__class_weight="balanced")
df_scores = evaluate_classifier(
    lr_clf, df_scores, "LR with class weight"
)
rf_clf.set_params(randomforestclassifier__class_weight="balanced")
df_scores = evaluate_classifier(
    rf_clf, df_scores, "RF with class weight"
)
lr_clf = make_pipeline_with_sampler(
    preprocessor_linear,
    RandomUnderSampler(random_state=42),
    LogisticRegression(max_iter=1000)
)
df_scores = evaluate_classifier(
    lr_clf, df_scores, "LR with under-sampling"
)
rf_clf = make_pipeline_with_sampler(
    preprocessor_tree,
    RandomUnderSampler(random_state=42),
    RandomForestClassifier(random_state=42)
)
df_scores = evaluate_classifier(
    rf_clf, df_scores, "RF with under-sampling"
)
rf_clf = make_pipeline(
    preprocessor_tree,
    BalancedRandomForestClassifier(random_state=42)
)
df_scores = evaluate_classifier(rf_clf, df_scores, "Balanced RF")
df_scores = evaluate_classifier(
    bag_clf, df_scores, "Balanced bagging"
)
df_scores