From 77d3dec65bc08c3e2e95e5b166063882af7f0bcd Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 17 Nov 2019 23:32:53 +0100 Subject: [PATCH 01/11] ENH Allows pandas series in/out --- doc/introduction.rst | 2 +- doc/whats_new/v0.6.rst | 3 +- imblearn/base.py | 71 ++++++++++++++----- .../over_sampling/_random_over_sampler.py | 18 ++++- .../_random_under_sampler.py | 18 ++++- imblearn/utils/estimator_checks.py | 7 +- 6 files changed, 92 insertions(+), 27 deletions(-) diff --git a/doc/introduction.rst b/doc/introduction.rst index 1ea72326f..a4184abb2 100644 --- a/doc/introduction.rst +++ b/doc/introduction.rst @@ -34,7 +34,7 @@ The output will be of the following type: * ``data_resampled``: array-like (2-D list, pandas.Dataframe, numpy.array) or sparse matrices; - * ``targets_resampled``: 1-D numpy.array. + * ``targets_resampled``: 1-D numpy.array or pd.Series. .. topic:: Sparse input diff --git a/doc/whats_new/v0.6.rst b/doc/whats_new/v0.6.rst index a130fb301..223f68551 100644 --- a/doc/whats_new/v0.6.rst +++ b/doc/whats_new/v0.6.rst @@ -57,7 +57,8 @@ Enhancement - :class:`imblearn.under_sampling.RandomUnderSampling`, :class:`imblearn.over_sampling.RandomOverSampling`, :class:`imblearn.datasets.make_imbalance` accepts Pandas DataFrame in and - will output Pandas DataFrame. + will output Pandas DataFrame. Similarly, it will accepts Pandas Series in and + will output Pandas Series. :pr:`636` by :user:`Guillaume Lemaitre `. - :class:`imblearn.FunctionSampler` accepts a parameter ``validate`` allowing diff --git a/imblearn/base.py b/imblearn/base.py index b182f3e72..5ae136c39 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -80,20 +80,28 @@ def fit_resample(self, X, y): output = self._fit_resample(X, y) - if self._columns is not None: + if self._X_columns is not None or self._y_name is not None: import pandas as pd - X_ = pd.DataFrame(output[0], columns=self._columns) + + if self._X_columns is not None: + X_ = pd.DataFrame(output[0], columns=self._X_columns) + X_ = X_.astype(self._X_dtypes) else: X_ = output[0] + y_ = (label_binarize(output[1], np.unique(y)) + if binarize_y else output[1]) + + if self._y_name is not None: + y_ = pd.Series(y_, dtype=self._y_dtype, name=self._y_name) + if binarize_y: - y_sampled = label_binarize(output[1], np.unique(y)) if len(output) == 2: - return X_, y_sampled - return X_, y_sampled, output[2] + return X_, y_ + return X_, y_, output[2] if len(output) == 2: - return X_, output[1] - return X_, output[1], output[2] + return X_, y_ + return X_, y_, output[2] # define an alias for back-compatibility fit_sample = fit_resample @@ -135,8 +143,22 @@ def __init__(self, sampling_strategy="auto"): self.sampling_strategy = sampling_strategy def _check_X_y(self, X, y, accept_sparse=None): - # store the columns name to reconstruct a dataframe - self._columns = X.columns if hasattr(X, "loc") else None + if hasattr(X, "loc"): + # store information to build dataframe + self._X_columns = X.columns + self._X_dtypes = X.dtypes + else: + self._X_columns = None + self._X_dtypes = None + + if hasattr(y, "loc"): + # store information to build a series + self._y_name = y.name + self._y_dtype = y.dtype + else: + self._y_name = None + self._y_dtype = None + if accept_sparse is None: accept_sparse = ["csr", "csc"] y, binarize_y = check_target_type(y, indicate_one_vs_all=True) @@ -263,20 +285,31 @@ def fit_resample(self, X, y): output = self._fit_resample(X, y) - if self._columns is not None: - import pandas as pd - X_ = pd.DataFrame(output[0], columns=self._columns) + if 
self.validate: + if self._X_columns is not None or self._y_name is not None: + import pandas as pd + + if self._X_columns is not None: + X_ = pd.DataFrame(output[0], columns=self._X_columns) + X_ = X_.astype(self._X_dtypes) + else: + X_ = output[0] + + y_ = (label_binarize(output[1], np.unique(y)) + if binarize_y else output[1]) + + if self._y_name is not None: + y_ = pd.Series(y_, dtype=self._y_dtype, name=self._y_name) else: - X_ = output[0] + X_, y_ = output[0], output[1] - if self.validate and binarize_y: - y_sampled = label_binarize(output[1], np.unique(y)) + if binarize_y: if len(output) == 2: - return X_, y_sampled - return X_, y_sampled, output[2] + return X_, y_ + return X_, y_, output[2] if len(output) == 2: - return X_, output[1] - return X_, output[1], output[2] + return X_, y_ + return X_, y_, output[2] def _fit_resample(self, X, y): func = _identity if self.func is None else self.func diff --git a/imblearn/over_sampling/_random_over_sampler.py b/imblearn/over_sampling/_random_over_sampler.py index ea8b4d18b..afcb89da5 100644 --- a/imblearn/over_sampling/_random_over_sampler.py +++ b/imblearn/over_sampling/_random_over_sampler.py @@ -75,8 +75,22 @@ def __init__(self, sampling_strategy="auto", random_state=None): self.random_state = random_state def _check_X_y(self, X, y): - # store the columns name to reconstruct a dataframe - self._columns = X.columns if hasattr(X, "loc") else None + if hasattr(X, "loc"): + # store information to build dataframe + self._X_columns = X.columns + self._X_dtypes = X.dtypes + else: + self._X_columns = None + self._X_dtypes = None + + if hasattr(y, "loc"): + # store information to build a series + self._y_name = y.name + self._y_dtype = y.dtype + else: + self._y_name = None + self._y_dtype = None + y, binarize_y = check_target_type(y, indicate_one_vs_all=True) X = check_array(X, accept_sparse=["csr", "csc"], dtype=None, force_all_finite=False) diff --git a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py index 6301822ea..8d7c08c93 100644 --- a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py +++ b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py @@ -81,8 +81,22 @@ def __init__( self.replacement = replacement def _check_X_y(self, X, y): - # store the columns name to reconstruct a dataframe - self._columns = X.columns if hasattr(X, "loc") else None + if hasattr(X, "loc"): + # store information to build dataframe + self._X_columns = X.columns + self._X_dtypes = X.dtypes + else: + self._X_columns = None + self._X_dtypes = None + + if hasattr(y, "loc"): + # store information to build a series + self._y_name = y.name + self._y_dtype = y.dtype + else: + self._y_name = None + self._y_dtype = None + y, binarize_y = check_target_type(y, indicate_one_vs_all=True) X = check_array(X, accept_sparse=["csr", "csc"], dtype=None, force_all_finite=False) diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index 4fef2a13b..43f117ba3 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -242,6 +242,7 @@ def check_samplers_pandas(name, Sampler): random_state=0, ) X_pd = pd.DataFrame(X, columns=[str(i) for i in range(X.shape[1])]) + y_pd = pd.Series(y, name="class") sampler = Sampler() if isinstance(Sampler(), NearMiss): samplers = [Sampler(version=version) for version in (1, 2, 3)] @@ -251,14 +252,16 @@ def check_samplers_pandas(name, Sampler): for sampler in 
samplers: set_random_state(sampler) - X_res_pd, y_res_pd = sampler.fit_resample(X_pd, y) + X_res_pd, y_res_pd = sampler.fit_resample(X_pd, y_pd) X_res, y_res = sampler.fit_resample(X, y) # check that we return a pandas dataframe if a dataframe was given in assert isinstance(X_res_pd, pd.DataFrame) + assert isinstance(y_res_pd, pd.Series) assert X_pd.columns.to_list() == X_res_pd.columns.to_list() + assert y_pd.name == y_res_pd.name assert_allclose(X_res_pd.to_numpy(), X_res) - assert_allclose(y_res_pd, y_res) + assert_allclose(y_res_pd.to_numpy(), y_res) def check_samplers_multiclass_ova(name, Sampler): From b63e7ce99867c8d3660da4a5f2b1141bdf3a990f Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 17 Nov 2019 23:34:35 +0100 Subject: [PATCH 02/11] PEP8 --- imblearn/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/imblearn/base.py b/imblearn/base.py index 5ae136c39..43934b880 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -296,7 +296,7 @@ def fit_resample(self, X, y): X_ = output[0] y_ = (label_binarize(output[1], np.unique(y)) - if binarize_y else output[1]) + if binarize_y else output[1]) if self._y_name is not None: y_ = pd.Series(y_, dtype=self._y_dtype, name=self._y_name) From c4777f5556a7533dd213c98a84de4c78a114a076 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 17 Nov 2019 23:46:34 +0100 Subject: [PATCH 03/11] fix --- imblearn/base.py | 10 +++++----- imblearn/over_sampling/_smote.py | 18 ++++++++++++++++-- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/imblearn/base.py b/imblearn/base.py index 43934b880..e86052715 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -300,16 +300,16 @@ def fit_resample(self, X, y): if self._y_name is not None: y_ = pd.Series(y_, dtype=self._y_dtype, name=self._y_name) + + if binarize_y: + if len(output) == 2: + return X_, y_ + return X_, y_, output[2] else: X_, y_ = output[0], output[1] - - if binarize_y: if len(output) == 2: return X_, y_ return X_, y_, output[2] - if len(output) == 2: - return X_, y_ - return X_, y_, output[2] def _fit_resample(self, X, y): func = _identity if self.func is None else self.func diff --git a/imblearn/over_sampling/_smote.py b/imblearn/over_sampling/_smote.py index e109ebe4c..b764da6b6 100644 --- a/imblearn/over_sampling/_smote.py +++ b/imblearn/over_sampling/_smote.py @@ -892,8 +892,22 @@ def _check_X_y(self, X, y): """Overwrite the checking to let pass some string for categorical features. 
""" - # store the columns name to reconstruct a dataframe - self._columns = X.columns if hasattr(X, "loc") else None + if hasattr(X, "loc"): + # store information to build dataframe + self._X_columns = X.columns + self._X_dtypes = X.dtypes + else: + self._X_columns = None + self._X_dtypes = None + + if hasattr(y, "loc"): + # store information to build a series + self._y_name = y.name + self._y_dtype = y.dtype + else: + self._y_name = None + self._y_dtype = None + y, binarize_y = check_target_type(y, indicate_one_vs_all=True) X, y = check_X_y(X, y, accept_sparse=["csr", "csc"], dtype=None) return X, y, binarize_y From 1b4aa9c144e7be2c6dc3219f50af19d444be4049 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 18 Nov 2019 00:06:11 +0100 Subject: [PATCH 04/11] iter --- imblearn/base.py | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/imblearn/base.py b/imblearn/base.py index e86052715..c5d6b0185 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -95,13 +95,7 @@ def fit_resample(self, X, y): if self._y_name is not None: y_ = pd.Series(y_, dtype=self._y_dtype, name=self._y_name) - if binarize_y: - if len(output) == 2: - return X_, y_ - return X_, y_, output[2] - if len(output) == 2: - return X_, y_ - return X_, y_, output[2] + return (X_, y_) if len(output) == 2 else (X_, y_, output[2]) # define an alias for back-compatibility fit_sample = fit_resample @@ -301,15 +295,8 @@ def fit_resample(self, X, y): if self._y_name is not None: y_ = pd.Series(y_, dtype=self._y_dtype, name=self._y_name) - if binarize_y: - if len(output) == 2: - return X_, y_ - return X_, y_, output[2] - else: - X_, y_ = output[0], output[1] - if len(output) == 2: - return X_, y_ - return X_, y_, output[2] + return (X_, y_) if len(output) == 2 else (X_, y_, output[2]) + return output def _fit_resample(self, X, y): func = _identity if self.func is None else self.func From 96ef7d45f9c4d7679f10a0da8b8cab6b0200d28b Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 18 Nov 2019 00:10:48 +0100 Subject: [PATCH 05/11] DOC effect and comparison to deal with imbalanced classification --- .../plot_impact_imbalanced_classes.py | 82 +++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 examples/applications/plot_impact_imbalanced_classes.py diff --git a/examples/applications/plot_impact_imbalanced_classes.py b/examples/applications/plot_impact_imbalanced_classes.py new file mode 100644 index 000000000..267c5a09e --- /dev/null +++ b/examples/applications/plot_impact_imbalanced_classes.py @@ -0,0 +1,82 @@ +""" +======================================================================== +Model fitting on imbalanced dataset and comparison of methods to improve +performance +======================================================================== + +This example illustrates the problem induced by learning on datasets having +imbalanced classes. Subsequently, we compare different approaches alleviating +these negative effects. + +""" + +# Authors: Guillaume Lemaitre +# License: MIT + +print(__doc__) + +############################################################################### +# Problem definition +############################################################################### + +from sklearn.datasets import fetch_openml + +df, y = fetch_openml('adult', version=2, as_frame=True, return_X_y=True) +# we are dropping the following features: +# - "fnlwgt": this feature was created while studying the "adult" dataset. 
+# Thus, we will not use this feature which is not acquired during the survey. +# - "education-num": it is encoding the same information than "education". +# Thus, we are removing one of these 2 features. +df = df.drop(columns=['fnlwgt', 'education-num']) + +############################################################################### +# The "adult" dataset as a class ratio of about 3:1 + +from collections import Counter + +classes_count = y.value_counts() +print(f"Classes information:\n{classes_count}") + +############################################################################### +# This dataset is only slightly imbalanced. To better highlight the effect of +# learning from imbalanced dataset, we will increase this ratio to 30:1 + +from imblearn.datasets import make_imbalance + +ratio = 30 +df_res, y_res = make_imbalance( + df, y, sampling_strategy={ + classes_count.idxmin(): classes_count.max() // ratio + } +) + +############################################################################### +# For the rest of the notebook, we will make a single split to get training +# and testing data. Note that you should use cross-validation to have an +# estimate of the performance variation in practice. + +from sklearn.model_selection import train_test_split + +X_train, X_test, y_train, y_test = train_test_split( + df_res, y_res, stratify=y_res, random_state=42 +) + +############################################################################### +# As a baseline, we could use a classifier which will always predict the +# majority class independently of the features provided. + +from sklearn.dummy import DummyClassifier + +dummy_clf = DummyClassifier(strategy="most_frequent") +score = dummy_clf.fit(X_train, y_train).score(X_test, y_test) +print(f"Accuracy score of a dummy classifier: {score:.3f}") + +############################################################################## +# Instead of using the accuracy, we can use the balanced accuracy which will +# take into account the balancing issue. + +from sklearn.metrics import balanced_accuracy_score + +y_pred = dummy_clf.predict(X_test) +score = balanced_accuracy_score(y_test, y_pred) +print(f"Balanced accuracy score of a dummy classifier: {score:.3f}") From 2940dbbfe46a088343b948492d7c99b25dd7ef70 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 19 Nov 2019 01:37:43 +0100 Subject: [PATCH 06/11] iter --- .../plot_impact_imbalanced_classes.py | 314 +++++++++++++++++- 1 file changed, 313 insertions(+), 1 deletion(-) diff --git a/examples/applications/plot_impact_imbalanced_classes.py b/examples/applications/plot_impact_imbalanced_classes.py index 267c5a09e..9b72139ee 100644 --- a/examples/applications/plot_impact_imbalanced_classes.py +++ b/examples/applications/plot_impact_imbalanced_classes.py @@ -35,7 +35,7 @@ from collections import Counter classes_count = y.value_counts() -print(f"Classes information:\n{classes_count}") +classes_count ############################################################################### # This dataset is only slightly imbalanced. 
To better highlight the effect of @@ -49,6 +49,7 @@ classes_count.idxmin(): classes_count.max() // ratio } ) +y_res.value_counts() ############################################################################### # For the rest of the notebook, we will make a single split to get training @@ -80,3 +81,314 @@ y_pred = dummy_clf.predict(X_test) score = balanced_accuracy_score(y_test, y_pred) print(f"Balanced accuracy score of a dummy classifier: {score:.3f}") + +############################################################################### +# Strategies to learn from an imbalanced dataset +############################################################################### + +############################################################################### +# We will first define an helper function which will train a given model +# and compute both accuracy and balanced accuracy. The results will be stored +# in a dataframe + +import pandas as pd + + +def evaluate_classifier(clf, df_scores, clf_name=None): + from sklearn.pipeline import Pipeline + if clf_name is None: + if isinstance(clf, Pipeline): + clf_name = clf[-1].__class__.__name__ + else: + clf_name = clf.__class__.__name__ + acc = clf.fit(X_train, y_train).score(X_test, y_test) + y_pred = clf.predict(X_test) + bal_acc = balanced_accuracy_score(y_test, y_pred) + clf_score = pd.DataFrame( + {clf_name: [acc, bal_acc]}, + index=['Accuracy', 'Balanced accuracy'] + ) + df_scores = pd.concat([df_scores, clf_score], axis=1).round(decimals=3) + return df_scores + + +# Let's define an empty dataframe to store the results +df_scores = pd.DataFrame() + +############################################################################### +# Dummy baseline +# .............. +# +# Before to train a real machine learning model, we can store the results +# obtained with our `DummyClassifier`. + +df_scores = evaluate_classifier(dummy_clf, df_scores, "Dummy") +df_scores + +############################################################################### +# Linear classifier baseline +# .......................... +# +# We will create a machine learning pipeline using a `LogisticRegression` +# classifier. In this regard, we will need to one-hot encode the categorical +# columns and standardized the numerical columns before to inject the data into +# the `LogisticRegression` classifier. +# +# First, we define our numerical and categorical pipelines. + +from sklearn.impute import SimpleImputer +from sklearn.preprocessing import StandardScaler +from sklearn.preprocessing import OneHotEncoder +from sklearn.pipeline import make_pipeline + +num_pipe = make_pipeline( + StandardScaler(), SimpleImputer(strategy="mean", add_indicator=True) +) +cat_pipe = make_pipeline( + SimpleImputer(strategy="constant", fill_value="missing"), + OneHotEncoder(handle_unknown="ignore") +) + +############################################################################### +# Then, we can create a preprocessor which will dispatch the categorical +# columns to the categorical pipeline and the numerical columns to the +# numerical pipeline + +import numpy as np +from sklearn.compose import ColumnTransformer +from sklearn.compose import make_column_selector as selector + +preprocessor_linear = ColumnTransformer( + [("num-pipe", num_pipe, selector(dtype_include=np.number)), + ("cat-pipe", cat_pipe, selector(dtype_include=pd.CategoricalDtype))] +) + +############################################################################### +# Finally, we connect our preprocessor with our `LogisticRegression`. 
We can +# then evaluate our model. + +from sklearn.linear_model import LogisticRegression + +lr_clf = make_pipeline( + preprocessor_linear, LogisticRegression(max_iter=1000) +) +df_scores = evaluate_classifier(lr_clf, df_scores, "LR") +df_scores + +############################################################################### +# We can see that our linear model is learning slightly better than our dummy +# baseline. However, it is impacted by class imbalanced. +# +# We can verify that something similar is happening with a tree-based model +# such as `RandomForestClassifier`. With this type of classifier, we will not +# need to scale the numerical data, and we will only need to ordinal encode the +# categorical data. + +from sklearn.preprocessing import OrdinalEncoder +from sklearn.ensemble import RandomForestClassifier + +cat_pipe = make_pipeline( + SimpleImputer(strategy="constant", fill_value="missing"), + OrdinalEncoder() +) + +preprocessor_tree = ColumnTransformer( + [("num-pipe", num_pipe, selector(dtype_include=np.number)), + ("cat-pipe", cat_pipe, selector(dtype_include=pd.CategoricalDtype))] +) + +rf_clf = make_pipeline( + preprocessor_tree, RandomForestClassifier(random_state=42) +) + +df_scores = evaluate_classifier(rf_clf, df_scores, "RF") +df_scores + +############################################################################### +# The `RandomForestClassifier` is as well affected by the class imbalanced, +# slightly less than the linear model. Now, we will present different approach +# to improve the performance of these 2 models. +# +# Use `class_weight` +# .................. +# +# Most of the models in `scikit-learn` have a parameter `class_weight`. This +# parameter will affect the computation of the loss in linear model or the +# criterion in the tree-based model to penalize differently a false +# classification from the minority and majority class. We can set +# `class_weight="balanced"` such that the weight applied is inversely +# proportional to the class frequency. We test this parametrization in both +# linear model and tree-based model. + +lr_clf.set_params(logisticregression__class_weight="balanced") +df_scores = evaluate_classifier( + lr_clf, df_scores, "LR with class weight" +) +df_scores + +############################################################################### +# + +rf_clf.set_params(randomforestclassifier__class_weight="balanced") +df_scores = evaluate_classifier( + rf_clf, df_scores, "RF with class weight" +) +df_scores + +############################################################################### +# We can see that using `class_weight` was really effective for the linear +# model, alleviating the issue of learning from imbalanced classes. However, +# the `RandomForestClassifier` is still biased toward the majority class, +# mainly due to the criterion which is not suited enough to fight the class +# imbalance. +# +# Resample the training set during learning +# ......................................... +# +# Another way is to resample the training set by under-sampling or +# over-sampling some of the samples. `imbalanced-learn` provides some samplers +# to do such precessing. 
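+#
+# As a standalone aside (an illustrative sketch added here, not used by the
+# pipelines below; the `*_demo` names exist only for this snippet), a sampler
+# exposes `fit_resample` directly. Since pandas objects given in are returned
+# as pandas objects, `value_counts` can be called on the resampled target to
+# check that both classes now have the same number of samples.
+
+from imblearn.under_sampling import RandomUnderSampler
+
+X_demo, y_demo = RandomUnderSampler(random_state=42).fit_resample(
+    X_train, y_train
+)
+# X_demo is a pandas.DataFrame and y_demo a pandas.Series (name preserved)
+y_demo.value_counts()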
+ +from imblearn.pipeline import make_pipeline as make_pipeline_with_sampler +from imblearn.under_sampling import RandomUnderSampler + +lr_clf = make_pipeline_with_sampler( + preprocessor_linear, + RandomUnderSampler(random_state=42), + LogisticRegression(max_iter=1000) +) +df_scores = evaluate_classifier( + lr_clf, df_scores, "LR with under-sampling" +) +df_scores + +############################################################################### +# + +rf_clf = make_pipeline_with_sampler( + preprocessor_tree, + RandomUnderSampler(random_state=42), + RandomForestClassifier(random_state=42) +) + +df_scores = evaluate_classifier( + rf_clf, df_scores, "RF with under-sampling" +) +df_scores + +############################################################################### +# Applying a random under-sampler before to train the linear model or random +# forest, allows to not focus on the majority class at the cost of making more +# mistake for samples in the majority class (i.e. decreased accuracy). +# +# We could apply any type of samplers and find which sampler is working best +# on the current dataset. +# +# Instead, we will present another way by using classifiers which will apply +# sampling internally. +# +# Use of `BalancedRandomForestClassifier` and `BalancedBaggingClassifier` +# ....................................................................... +# +# We already show that random under-sampling can be effective on decision tree. +# However, instead of under-sampling once the dataset, one could under-sample +# the original dataset before to take a bootstrap sample. This is the base of +# the `BalancedRandomForestClassifier` and `BalancedBaggingClassifier`. + +from imblearn.ensemble import BalancedRandomForestClassifier + +rf_clf = make_pipeline( + preprocessor_tree, + BalancedRandomForestClassifier(random_state=42) +) + +df_scores = evaluate_classifier(rf_clf, df_scores, "Balanced RF") +df_scores + +############################################################################### +# The performance with the `BalancedRandomForestClassifier` are better than +# applying a single random under-sampling. We will use a gradient-boosting +# classifier within a `BalancedBaggingClassifier`. + +from sklearn.experimental import enable_hist_gradient_boosting +from sklearn.ensemble import HistGradientBoostingClassifier +from imblearn.ensemble import BalancedBaggingClassifier + +bag_clf = make_pipeline( + preprocessor_tree, + BalancedBaggingClassifier( + base_estimator=HistGradientBoostingClassifier(random_state=42), + n_estimators=10, random_state=42 + ) +) + +df_scores = evaluate_classifier( + bag_clf, df_scores, "Balanced bagging" +) +df_scores + +############################################################################### +# This last approach is the most effective. The different under-sampling allows +# to bring some diversity for the different GBDT to learn and not focus on a +# portion of the majority class. +# +# We will repeat the same experiment but a ratio of 100:1 and make a similar +# analysis. 
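+#
+# Before moving to a more imbalanced setting, a quick aside (an illustrative
+# sketch added here, not part of the comparison above): the per-class
+# behaviour of the last fitted bagging pipeline can be inspected with
+# scikit-learn's `classification_report`.
+
+from sklearn.metrics import classification_report
+
+print(classification_report(y_test, bag_clf.predict(X_test)))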
+ +############################################################################### +# Increase imbalanced ratio +############################################################################### + +ratio = 100 +df_res, y_res = make_imbalance( + df, y, sampling_strategy={ + classes_count.idxmin(): classes_count.max() // ratio + } +) +X_train, X_test, y_train, y_test = train_test_split( + df_res, y_res, stratify=y_res, random_state=42 +) + +df_scores = pd.DataFrame() +df_scores = evaluate_classifier(dummy_clf, df_scores, "Dummy") +lr_clf = make_pipeline( + preprocessor_linear, LogisticRegression(max_iter=1000) +) +df_scores = evaluate_classifier(lr_clf, df_scores, "LR") +rf_clf = make_pipeline( + preprocessor_tree, RandomForestClassifier(random_state=42) +) +df_scores = evaluate_classifier(rf_clf, df_scores, "RF") +lr_clf.set_params(logisticregression__class_weight="balanced") +df_scores = evaluate_classifier( + lr_clf, df_scores, "LR with class weight" +) +rf_clf.set_params(randomforestclassifier__class_weight="balanced") +df_scores = evaluate_classifier( + rf_clf, df_scores, "RF with class weight" +) +lr_clf = make_pipeline_with_sampler( + preprocessor_linear, + RandomUnderSampler(random_state=42), + LogisticRegression(max_iter=1000) +) +df_scores = evaluate_classifier( + lr_clf, df_scores, "LR with under-sampling" +) +rf_clf = make_pipeline_with_sampler( + preprocessor_tree, + RandomUnderSampler(random_state=42), + RandomForestClassifier(random_state=42) +) +df_scores = evaluate_classifier( + rf_clf, df_scores, "RF with under-sampling" +) +rf_clf = make_pipeline( + preprocessor_tree, + BalancedRandomForestClassifier(random_state=42) +) +df_scores = evaluate_classifier(rf_clf, df_scores) +df_scores = evaluate_classifier( + bag_clf, df_scores, "Balanced bagging" +) +df_scores From 4d0a62982d3bb3865c1e6e5c98302858fb3019b3 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 19 Nov 2019 16:22:50 +0100 Subject: [PATCH 07/11] Apply suggestions from code review Co-Authored-By: Christos Aridas --- .../plot_impact_imbalanced_classes.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/examples/applications/plot_impact_imbalanced_classes.py b/examples/applications/plot_impact_imbalanced_classes.py index 9b72139ee..4fa22f5cc 100644 --- a/examples/applications/plot_impact_imbalanced_classes.py +++ b/examples/applications/plot_impact_imbalanced_classes.py @@ -1,7 +1,7 @@ """ ======================================================================== Model fitting on imbalanced dataset and comparison of methods to improve -performance +its performance ======================================================================== This example illustrates the problem induced by learning on datasets having @@ -39,7 +39,7 @@ ############################################################################### # This dataset is only slightly imbalanced. To better highlight the effect of -# learning from imbalanced dataset, we will increase this ratio to 30:1 +# learning from an imbalanced dataset, we will increase its ratio to 30:1 from imblearn.datasets import make_imbalance @@ -87,7 +87,7 @@ ############################################################################### ############################################################################### -# We will first define an helper function which will train a given model +# We will first define a helper function which will train a given model # and compute both accuracy and balanced accuracy. 
The results will be stored # in a dataframe @@ -177,7 +177,7 @@ def evaluate_classifier(clf, df_scores, clf_name=None): ############################################################################### # We can see that our linear model is learning slightly better than our dummy -# baseline. However, it is impacted by class imbalanced. +# baseline. However, it is impacted by the class imbalance. # # We can verify that something similar is happening with a tree-based model # such as `RandomForestClassifier`. With this type of classifier, we will not @@ -247,7 +247,7 @@ def evaluate_classifier(clf, df_scores, clf_name=None): # # Another way is to resample the training set by under-sampling or # over-sampling some of the samples. `imbalanced-learn` provides some samplers -# to do such precessing. +# to do such processing. from imblearn.pipeline import make_pipeline as make_pipeline_with_sampler from imblearn.under_sampling import RandomUnderSampler @@ -277,7 +277,7 @@ def evaluate_classifier(clf, df_scores, clf_name=None): df_scores ############################################################################### -# Applying a random under-sampler before to train the linear model or random +# Applying a random under-sampler before the training of the linear model or random # forest, allows to not focus on the majority class at the cost of making more # mistake for samples in the majority class (i.e. decreased accuracy). # @@ -290,7 +290,7 @@ def evaluate_classifier(clf, df_scores, clf_name=None): # Use of `BalancedRandomForestClassifier` and `BalancedBaggingClassifier` # ....................................................................... # -# We already show that random under-sampling can be effective on decision tree. +# We already showed that random under-sampling can be effective on decision tree. # However, instead of under-sampling once the dataset, one could under-sample # the original dataset before to take a bootstrap sample. This is the base of # the `BalancedRandomForestClassifier` and `BalancedBaggingClassifier`. @@ -306,7 +306,7 @@ def evaluate_classifier(clf, df_scores, clf_name=None): df_scores ############################################################################### -# The performance with the `BalancedRandomForestClassifier` are better than +# The performance with the `BalancedRandomForestClassifier` is better than # applying a single random under-sampling. We will use a gradient-boosting # classifier within a `BalancedBaggingClassifier`. @@ -332,7 +332,7 @@ def evaluate_classifier(clf, df_scores, clf_name=None): # to bring some diversity for the different GBDT to learn and not focus on a # portion of the majority class. # -# We will repeat the same experiment but a ratio of 100:1 and make a similar +# We will repeat the same experiment but with a ratio of 100:1 and make a similar # analysis. 
############################################################################### From db914bf066ed525af7b9a4ebcc66e0325da47f5d Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 19 Nov 2019 16:41:32 +0100 Subject: [PATCH 08/11] Update plot_impact_imbalanced_classes.py --- examples/applications/plot_impact_imbalanced_classes.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/examples/applications/plot_impact_imbalanced_classes.py b/examples/applications/plot_impact_imbalanced_classes.py index 4fa22f5cc..cd4b92e62 100644 --- a/examples/applications/plot_impact_imbalanced_classes.py +++ b/examples/applications/plot_impact_imbalanced_classes.py @@ -32,8 +32,6 @@ ############################################################################### # The "adult" dataset as a class ratio of about 3:1 -from collections import Counter - classes_count = y.value_counts() classes_count From 802cc446f3c71f94a4e7041d33a7ae00b402b08c Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 19 Nov 2019 22:49:00 +0100 Subject: [PATCH 09/11] increase n_jobs --- .../plot_impact_imbalanced_classes.py | 40 ++++++++++--------- 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/examples/applications/plot_impact_imbalanced_classes.py b/examples/applications/plot_impact_imbalanced_classes.py index cd4b92e62..5ce1fb14b 100644 --- a/examples/applications/plot_impact_imbalanced_classes.py +++ b/examples/applications/plot_impact_imbalanced_classes.py @@ -158,7 +158,8 @@ def evaluate_classifier(clf, df_scores, clf_name=None): preprocessor_linear = ColumnTransformer( [("num-pipe", num_pipe, selector(dtype_include=np.number)), - ("cat-pipe", cat_pipe, selector(dtype_include=pd.CategoricalDtype))] + ("cat-pipe", cat_pipe, selector(dtype_include=pd.CategoricalDtype))], + n_jobs=2 ) ############################################################################### @@ -192,11 +193,12 @@ def evaluate_classifier(clf, df_scores, clf_name=None): preprocessor_tree = ColumnTransformer( [("num-pipe", num_pipe, selector(dtype_include=np.number)), - ("cat-pipe", cat_pipe, selector(dtype_include=pd.CategoricalDtype))] + ("cat-pipe", cat_pipe, selector(dtype_include=pd.CategoricalDtype))], + n_jobs=2 ) rf_clf = make_pipeline( - preprocessor_tree, RandomForestClassifier(random_state=42) + preprocessor_tree, RandomForestClassifier(random_state=42, n_jobs=2) ) df_scores = evaluate_classifier(rf_clf, df_scores, "RF") @@ -266,7 +268,7 @@ def evaluate_classifier(clf, df_scores, clf_name=None): rf_clf = make_pipeline_with_sampler( preprocessor_tree, RandomUnderSampler(random_state=42), - RandomForestClassifier(random_state=42) + RandomForestClassifier(random_state=42, n_jobs=2) ) df_scores = evaluate_classifier( @@ -275,9 +277,10 @@ def evaluate_classifier(clf, df_scores, clf_name=None): df_scores ############################################################################### -# Applying a random under-sampler before the training of the linear model or random -# forest, allows to not focus on the majority class at the cost of making more -# mistake for samples in the majority class (i.e. decreased accuracy). +# Applying a random under-sampler before the training of the linear model or +# random forest, allows to not focus on the majority class at the cost of +# making more mistake for samples in the majority class (i.e. decreased +# accuracy). # # We could apply any type of samplers and find which sampler is working best # on the current dataset. 
@@ -288,16 +291,17 @@ def evaluate_classifier(clf, df_scores, clf_name=None): # Use of `BalancedRandomForestClassifier` and `BalancedBaggingClassifier` # ....................................................................... # -# We already showed that random under-sampling can be effective on decision tree. -# However, instead of under-sampling once the dataset, one could under-sample -# the original dataset before to take a bootstrap sample. This is the base of -# the `BalancedRandomForestClassifier` and `BalancedBaggingClassifier`. +# We already showed that random under-sampling can be effective on decision +# tree. However, instead of under-sampling once the dataset, one could +# under-sample the original dataset before to take a bootstrap sample. This is +# the base of the `BalancedRandomForestClassifier` and +# `BalancedBaggingClassifier`. from imblearn.ensemble import BalancedRandomForestClassifier rf_clf = make_pipeline( preprocessor_tree, - BalancedRandomForestClassifier(random_state=42) + BalancedRandomForestClassifier(random_state=42, n_jobs=2) ) df_scores = evaluate_classifier(rf_clf, df_scores, "Balanced RF") @@ -316,7 +320,7 @@ def evaluate_classifier(clf, df_scores, clf_name=None): preprocessor_tree, BalancedBaggingClassifier( base_estimator=HistGradientBoostingClassifier(random_state=42), - n_estimators=10, random_state=42 + n_estimators=10, random_state=42, n_jobs=2 ) ) @@ -330,8 +334,8 @@ def evaluate_classifier(clf, df_scores, clf_name=None): # to bring some diversity for the different GBDT to learn and not focus on a # portion of the majority class. # -# We will repeat the same experiment but with a ratio of 100:1 and make a similar -# analysis. +# We will repeat the same experiment but with a ratio of 100:1 and make a +# similar analysis. 
############################################################################### # Increase imbalanced ratio @@ -354,7 +358,7 @@ def evaluate_classifier(clf, df_scores, clf_name=None): ) df_scores = evaluate_classifier(lr_clf, df_scores, "LR") rf_clf = make_pipeline( - preprocessor_tree, RandomForestClassifier(random_state=42) + preprocessor_tree, RandomForestClassifier(random_state=42, n_jobs=2) ) df_scores = evaluate_classifier(rf_clf, df_scores, "RF") lr_clf.set_params(logisticregression__class_weight="balanced") @@ -376,14 +380,14 @@ def evaluate_classifier(clf, df_scores, clf_name=None): rf_clf = make_pipeline_with_sampler( preprocessor_tree, RandomUnderSampler(random_state=42), - RandomForestClassifier(random_state=42) + RandomForestClassifier(random_state=42, n_jobs=2) ) df_scores = evaluate_classifier( rf_clf, df_scores, "RF with under-sampling" ) rf_clf = make_pipeline( preprocessor_tree, - BalancedRandomForestClassifier(random_state=42) + BalancedRandomForestClassifier(random_state=42, n_jobs=2) ) df_scores = evaluate_classifier(rf_clf, df_scores) df_scores = evaluate_classifier( From 218241745a7cb0a6ceffe7a02dc19ece0d679a4f Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 19 Nov 2019 23:35:03 +0100 Subject: [PATCH 10/11] DOC add conclusion --- examples/applications/plot_impact_imbalanced_classes.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/examples/applications/plot_impact_imbalanced_classes.py b/examples/applications/plot_impact_imbalanced_classes.py index 5ce1fb14b..657fe105d 100644 --- a/examples/applications/plot_impact_imbalanced_classes.py +++ b/examples/applications/plot_impact_imbalanced_classes.py @@ -394,3 +394,10 @@ def evaluate_classifier(clf, df_scores, clf_name=None): bag_clf, df_scores, "Balanced bagging" ) df_scores + +############################################################################### +# When we analyse the results, we can draw a similar conclusion than in the +# previous discussion. However, we can observe that the strategy +# `class_weight="balanced"` does not improve the performance. A resampling is +# indeed required. The most effective method remains the +# `BalancedBaggingClassifier` using a GBDT as a base learner. From 8b11705670d990cb806c17b68fb6fa0897367194 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 19 Nov 2019 23:36:24 +0100 Subject: [PATCH 11/11] typos --- examples/applications/plot_impact_imbalanced_classes.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/examples/applications/plot_impact_imbalanced_classes.py b/examples/applications/plot_impact_imbalanced_classes.py index 657fe105d..97fcb7131 100644 --- a/examples/applications/plot_impact_imbalanced_classes.py +++ b/examples/applications/plot_impact_imbalanced_classes.py @@ -396,8 +396,9 @@ def evaluate_classifier(clf, df_scores, clf_name=None): df_scores ############################################################################### -# When we analyse the results, we can draw a similar conclusion than in the +# When we analyse the results, we can draw similar conclusions than in the # previous discussion. However, we can observe that the strategy -# `class_weight="balanced"` does not improve the performance. A resampling is -# indeed required. The most effective method remains the -# `BalancedBaggingClassifier` using a GBDT as a base learner. +# `class_weight="balanced"` does not improve the performance when using a +# `RandomForestClassifier`. A resampling is indeed required. 
The most effective +# method remains the `BalancedBaggingClassifier` using a GBDT as a base +# learner.
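+
+###############################################################################
+# Appendix: pandas in/out behaviour of the samplers
+###############################################################################
+# A minimal, self-contained sketch (added here for illustration; it mirrors
+# the `check_samplers_pandas` test updated in this series). A sampler given a
+# pandas DataFrame and Series returns pandas objects with the column names,
+# dtypes and target name preserved. The `feat_*` column names and the `class`
+# target name are only used for this snippet.
+
+import pandas as pd
+from sklearn.datasets import make_classification
+from imblearn.under_sampling import RandomUnderSampler
+
+X_arr, y_arr = make_classification(
+    n_samples=100, weights=[0.9, 0.1], random_state=0
+)
+X_df = pd.DataFrame(
+    X_arr, columns=[f"feat_{i}" for i in range(X_arr.shape[1])]
+)
+y_ser = pd.Series(y_arr, name="class")
+
+X_res, y_res = RandomUnderSampler(random_state=0).fit_resample(X_df, y_ser)
+assert isinstance(X_res, pd.DataFrame) and isinstance(y_res, pd.Series)
+assert list(X_res.columns) == list(X_df.columns)
+assert y_res.name == "class"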