From 77d3dec65bc08c3e2e95e5b166063882af7f0bcd Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 17 Nov 2019 23:32:53 +0100 Subject: [PATCH 01/11] ENH Allows pandas series in/out --- doc/introduction.rst | 2 +- doc/whats_new/v0.6.rst | 3 +- imblearn/base.py | 71 ++++++++++++++----- .../over_sampling/_random_over_sampler.py | 18 ++++- .../_random_under_sampler.py | 18 ++++- imblearn/utils/estimator_checks.py | 7 +- 6 files changed, 92 insertions(+), 27 deletions(-) diff --git a/doc/introduction.rst b/doc/introduction.rst index 1ea72326f..a4184abb2 100644 --- a/doc/introduction.rst +++ b/doc/introduction.rst @@ -34,7 +34,7 @@ The output will be of the following type: * ``data_resampled``: array-like (2-D list, pandas.Dataframe, numpy.array) or sparse matrices; - * ``targets_resampled``: 1-D numpy.array. + * ``targets_resampled``: 1-D numpy.array or pd.Series. .. topic:: Sparse input diff --git a/doc/whats_new/v0.6.rst b/doc/whats_new/v0.6.rst index a130fb301..223f68551 100644 --- a/doc/whats_new/v0.6.rst +++ b/doc/whats_new/v0.6.rst @@ -57,7 +57,8 @@ Enhancement - :class:`imblearn.under_sampling.RandomUnderSampling`, :class:`imblearn.over_sampling.RandomOverSampling`, :class:`imblearn.datasets.make_imbalance` accepts Pandas DataFrame in and - will output Pandas DataFrame. + will output Pandas DataFrame. Similarly, it will accepts Pandas Series in and + will output Pandas Series. :pr:`636` by :user:`Guillaume Lemaitre `. - :class:`imblearn.FunctionSampler` accepts a parameter ``validate`` allowing diff --git a/imblearn/base.py b/imblearn/base.py index b182f3e72..5ae136c39 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -80,20 +80,28 @@ def fit_resample(self, X, y): output = self._fit_resample(X, y) - if self._columns is not None: + if self._X_columns is not None or self._y_name is not None: import pandas as pd - X_ = pd.DataFrame(output[0], columns=self._columns) + + if self._X_columns is not None: + X_ = pd.DataFrame(output[0], columns=self._X_columns) + X_ = X_.astype(self._X_dtypes) else: X_ = output[0] + y_ = (label_binarize(output[1], np.unique(y)) + if binarize_y else output[1]) + + if self._y_name is not None: + y_ = pd.Series(y_, dtype=self._y_dtype, name=self._y_name) + if binarize_y: - y_sampled = label_binarize(output[1], np.unique(y)) if len(output) == 2: - return X_, y_sampled - return X_, y_sampled, output[2] + return X_, y_ + return X_, y_, output[2] if len(output) == 2: - return X_, output[1] - return X_, output[1], output[2] + return X_, y_ + return X_, y_, output[2] # define an alias for back-compatibility fit_sample = fit_resample @@ -135,8 +143,22 @@ def __init__(self, sampling_strategy="auto"): self.sampling_strategy = sampling_strategy def _check_X_y(self, X, y, accept_sparse=None): - # store the columns name to reconstruct a dataframe - self._columns = X.columns if hasattr(X, "loc") else None + if hasattr(X, "loc"): + # store information to build dataframe + self._X_columns = X.columns + self._X_dtypes = X.dtypes + else: + self._X_columns = None + self._X_dtypes = None + + if hasattr(y, "loc"): + # store information to build a series + self._y_name = y.name + self._y_dtype = y.dtype + else: + self._y_name = None + self._y_dtype = None + if accept_sparse is None: accept_sparse = ["csr", "csc"] y, binarize_y = check_target_type(y, indicate_one_vs_all=True) @@ -263,20 +285,31 @@ def fit_resample(self, X, y): output = self._fit_resample(X, y) - if self._columns is not None: - import pandas as pd - X_ = pd.DataFrame(output[0], columns=self._columns) + if 
self.validate: + if self._X_columns is not None or self._y_name is not None: + import pandas as pd + + if self._X_columns is not None: + X_ = pd.DataFrame(output[0], columns=self._X_columns) + X_ = X_.astype(self._X_dtypes) + else: + X_ = output[0] + + y_ = (label_binarize(output[1], np.unique(y)) + if binarize_y else output[1]) + + if self._y_name is not None: + y_ = pd.Series(y_, dtype=self._y_dtype, name=self._y_name) else: - X_ = output[0] + X_, y_ = output[0], output[1] - if self.validate and binarize_y: - y_sampled = label_binarize(output[1], np.unique(y)) + if binarize_y: if len(output) == 2: - return X_, y_sampled - return X_, y_sampled, output[2] + return X_, y_ + return X_, y_, output[2] if len(output) == 2: - return X_, output[1] - return X_, output[1], output[2] + return X_, y_ + return X_, y_, output[2] def _fit_resample(self, X, y): func = _identity if self.func is None else self.func diff --git a/imblearn/over_sampling/_random_over_sampler.py b/imblearn/over_sampling/_random_over_sampler.py index ea8b4d18b..afcb89da5 100644 --- a/imblearn/over_sampling/_random_over_sampler.py +++ b/imblearn/over_sampling/_random_over_sampler.py @@ -75,8 +75,22 @@ def __init__(self, sampling_strategy="auto", random_state=None): self.random_state = random_state def _check_X_y(self, X, y): - # store the columns name to reconstruct a dataframe - self._columns = X.columns if hasattr(X, "loc") else None + if hasattr(X, "loc"): + # store information to build dataframe + self._X_columns = X.columns + self._X_dtypes = X.dtypes + else: + self._X_columns = None + self._X_dtypes = None + + if hasattr(y, "loc"): + # store information to build a series + self._y_name = y.name + self._y_dtype = y.dtype + else: + self._y_name = None + self._y_dtype = None + y, binarize_y = check_target_type(y, indicate_one_vs_all=True) X = check_array(X, accept_sparse=["csr", "csc"], dtype=None, force_all_finite=False) diff --git a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py index 6301822ea..8d7c08c93 100644 --- a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py +++ b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py @@ -81,8 +81,22 @@ def __init__( self.replacement = replacement def _check_X_y(self, X, y): - # store the columns name to reconstruct a dataframe - self._columns = X.columns if hasattr(X, "loc") else None + if hasattr(X, "loc"): + # store information to build dataframe + self._X_columns = X.columns + self._X_dtypes = X.dtypes + else: + self._X_columns = None + self._X_dtypes = None + + if hasattr(y, "loc"): + # store information to build a series + self._y_name = y.name + self._y_dtype = y.dtype + else: + self._y_name = None + self._y_dtype = None + y, binarize_y = check_target_type(y, indicate_one_vs_all=True) X = check_array(X, accept_sparse=["csr", "csc"], dtype=None, force_all_finite=False) diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index 4fef2a13b..43f117ba3 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -242,6 +242,7 @@ def check_samplers_pandas(name, Sampler): random_state=0, ) X_pd = pd.DataFrame(X, columns=[str(i) for i in range(X.shape[1])]) + y_pd = pd.Series(y, name="class") sampler = Sampler() if isinstance(Sampler(), NearMiss): samplers = [Sampler(version=version) for version in (1, 2, 3)] @@ -251,14 +252,16 @@ def check_samplers_pandas(name, Sampler): for sampler in 
samplers: set_random_state(sampler) - X_res_pd, y_res_pd = sampler.fit_resample(X_pd, y) + X_res_pd, y_res_pd = sampler.fit_resample(X_pd, y_pd) X_res, y_res = sampler.fit_resample(X, y) # check that we return a pandas dataframe if a dataframe was given in assert isinstance(X_res_pd, pd.DataFrame) + assert isinstance(y_res_pd, pd.Series) assert X_pd.columns.to_list() == X_res_pd.columns.to_list() + assert y_pd.name == y_res_pd.name assert_allclose(X_res_pd.to_numpy(), X_res) - assert_allclose(y_res_pd, y_res) + assert_allclose(y_res_pd.to_numpy(), y_res) def check_samplers_multiclass_ova(name, Sampler): From b63e7ce99867c8d3660da4a5f2b1141bdf3a990f Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 17 Nov 2019 23:34:35 +0100 Subject: [PATCH 02/11] PEP8 --- imblearn/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/imblearn/base.py b/imblearn/base.py index 5ae136c39..43934b880 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -296,7 +296,7 @@ def fit_resample(self, X, y): X_ = output[0] y_ = (label_binarize(output[1], np.unique(y)) - if binarize_y else output[1]) + if binarize_y else output[1]) if self._y_name is not None: y_ = pd.Series(y_, dtype=self._y_dtype, name=self._y_name) From c4777f5556a7533dd213c98a84de4c78a114a076 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 17 Nov 2019 23:46:34 +0100 Subject: [PATCH 03/11] fix --- imblearn/base.py | 10 +++++----- imblearn/over_sampling/_smote.py | 18 ++++++++++++++++-- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/imblearn/base.py b/imblearn/base.py index 43934b880..e86052715 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -300,16 +300,16 @@ def fit_resample(self, X, y): if self._y_name is not None: y_ = pd.Series(y_, dtype=self._y_dtype, name=self._y_name) + + if binarize_y: + if len(output) == 2: + return X_, y_ + return X_, y_, output[2] else: X_, y_ = output[0], output[1] - - if binarize_y: if len(output) == 2: return X_, y_ return X_, y_, output[2] - if len(output) == 2: - return X_, y_ - return X_, y_, output[2] def _fit_resample(self, X, y): func = _identity if self.func is None else self.func diff --git a/imblearn/over_sampling/_smote.py b/imblearn/over_sampling/_smote.py index e109ebe4c..b764da6b6 100644 --- a/imblearn/over_sampling/_smote.py +++ b/imblearn/over_sampling/_smote.py @@ -892,8 +892,22 @@ def _check_X_y(self, X, y): """Overwrite the checking to let pass some string for categorical features. 
""" - # store the columns name to reconstruct a dataframe - self._columns = X.columns if hasattr(X, "loc") else None + if hasattr(X, "loc"): + # store information to build dataframe + self._X_columns = X.columns + self._X_dtypes = X.dtypes + else: + self._X_columns = None + self._X_dtypes = None + + if hasattr(y, "loc"): + # store information to build a series + self._y_name = y.name + self._y_dtype = y.dtype + else: + self._y_name = None + self._y_dtype = None + y, binarize_y = check_target_type(y, indicate_one_vs_all=True) X, y = check_X_y(X, y, accept_sparse=["csr", "csc"], dtype=None) return X, y, binarize_y From 1b4aa9c144e7be2c6dc3219f50af19d444be4049 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 18 Nov 2019 00:06:11 +0100 Subject: [PATCH 04/11] iter --- imblearn/base.py | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/imblearn/base.py b/imblearn/base.py index e86052715..c5d6b0185 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -95,13 +95,7 @@ def fit_resample(self, X, y): if self._y_name is not None: y_ = pd.Series(y_, dtype=self._y_dtype, name=self._y_name) - if binarize_y: - if len(output) == 2: - return X_, y_ - return X_, y_, output[2] - if len(output) == 2: - return X_, y_ - return X_, y_, output[2] + return (X_, y_) if len(output) == 2 else (X_, y_, output[2]) # define an alias for back-compatibility fit_sample = fit_resample @@ -301,15 +295,8 @@ def fit_resample(self, X, y): if self._y_name is not None: y_ = pd.Series(y_, dtype=self._y_dtype, name=self._y_name) - if binarize_y: - if len(output) == 2: - return X_, y_ - return X_, y_, output[2] - else: - X_, y_ = output[0], output[1] - if len(output) == 2: - return X_, y_ - return X_, y_, output[2] + return (X_, y_) if len(output) == 2 else (X_, y_, output[2]) + return output def _fit_resample(self, X, y): func = _identity if self.func is None else self.func From 96ef7d45f9c4d7679f10a0da8b8cab6b0200d28b Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 18 Nov 2019 00:10:48 +0100 Subject: [PATCH 05/11] DOC effect and comparison to deal with imbalanced classification --- .../plot_impact_imbalanced_classes.py | 82 +++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 examples/applications/plot_impact_imbalanced_classes.py diff --git a/examples/applications/plot_impact_imbalanced_classes.py b/examples/applications/plot_impact_imbalanced_classes.py new file mode 100644 index 000000000..267c5a09e --- /dev/null +++ b/examples/applications/plot_impact_imbalanced_classes.py @@ -0,0 +1,82 @@ +""" +======================================================================== +Model fitting on imbalanced dataset and comparison of methods to improve +performance +======================================================================== + +This example illustrates the problem induced by learning on datasets having +imbalanced classes. Subsequently, we compare different approaches alleviating +these negative effects. + +""" + +# Authors: Guillaume Lemaitre +# License: MIT + +print(__doc__) + +############################################################################### +# Problem definition +############################################################################### + +from sklearn.datasets import fetch_openml + +df, y = fetch_openml('adult', version=2, as_frame=True, return_X_y=True) +# we are dropping the following features: +# - "fnlwgt": this feature was created while studying the "adult" dataset. 
+# Thus, we will not use this feature which is not acquired during the survey. +# - "education-num": it is encoding the same information than "education". +# Thus, we are removing one of these 2 features. +df = df.drop(columns=['fnlwgt', 'education-num']) + +############################################################################### +# The "adult" dataset as a class ratio of about 3:1 + +from collections import Counter + +classes_count = y.value_counts() +print(f"Classes information:\n{classes_count}") + +############################################################################### +# This dataset is only slightly imbalanced. To better highlight the effect of +# learning from imbalanced dataset, we will increase this ratio to 30:1 + +from imblearn.datasets import make_imbalance + +ratio = 30 +df_res, y_res = make_imbalance( + df, y, sampling_strategy={ + classes_count.idxmin(): classes_count.max() // ratio + } +) + +############################################################################### +# For the rest of the notebook, we will make a single split to get training +# and testing data. Note that you should use cross-validation to have an +# estimate of the performance variation in practice. + +from sklearn.model_selection import train_test_split + +X_train, X_test, y_train, y_test = train_test_split( + df_res, y_res, stratify=y_res, random_state=42 +) + +############################################################################### +# As a baseline, we could use a classifier which will always predict the +# majority class independently of the features provided. + +from sklearn.dummy import DummyClassifier + +dummy_clf = DummyClassifier(strategy="most_frequent") +score = dummy_clf.fit(X_train, y_train).score(X_test, y_test) +print(f"Accuracy score of a dummy classifier: {score:.3f}") + +############################################################################## +# Instead of using the accuracy, we can use the balanced accuracy which will +# take into account the balancing issue. + +from sklearn.metrics import balanced_accuracy_score + +y_pred = dummy_clf.predict(X_test) +score = balanced_accuracy_score(y_test, y_pred) +print(f"Balanced accuracy score of a dummy classifier: {score:.3f}") From 2940dbbfe46a088343b948492d7c99b25dd7ef70 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 19 Nov 2019 01:37:43 +0100 Subject: [PATCH 06/11] iter --- .../plot_impact_imbalanced_classes.py | 314 +++++++++++++++++- 1 file changed, 313 insertions(+), 1 deletion(-) diff --git a/examples/applications/plot_impact_imbalanced_classes.py b/examples/applications/plot_impact_imbalanced_classes.py index 267c5a09e..9b72139ee 100644 --- a/examples/applications/plot_impact_imbalanced_classes.py +++ b/examples/applications/plot_impact_imbalanced_classes.py @@ -35,7 +35,7 @@ from collections import Counter classes_count = y.value_counts() -print(f"Classes information:\n{classes_count}") +classes_count ############################################################################### # This dataset is only slightly imbalanced. 
To better highlight the effect of @@ -49,6 +49,7 @@ classes_count.idxmin(): classes_count.max() // ratio } ) +y_res.value_counts() ############################################################################### # For the rest of the notebook, we will make a single split to get training @@ -80,3 +81,314 @@ y_pred = dummy_clf.predict(X_test) score = balanced_accuracy_score(y_test, y_pred) print(f"Balanced accuracy score of a dummy classifier: {score:.3f}") + +############################################################################### +# Strategies to learn from an imbalanced dataset +############################################################################### + +############################################################################### +# We will first define an helper function which will train a given model +# and compute both accuracy and balanced accuracy. The results will be stored +# in a dataframe + +import pandas as pd + + +def evaluate_classifier(clf, df_scores, clf_name=None): + from sklearn.pipeline import Pipeline + if clf_name is None: + if isinstance(clf, Pipeline): + clf_name = clf[-1].__class__.__name__ + else: + clf_name = clf.__class__.__name__ + acc = clf.fit(X_train, y_train).score(X_test, y_test) + y_pred = clf.predict(X_test) + bal_acc = balanced_accuracy_score(y_test, y_pred) + clf_score = pd.DataFrame( + {clf_name: [acc, bal_acc]}, + index=['Accuracy', 'Balanced accuracy'] + ) + df_scores = pd.concat([df_scores, clf_score], axis=1).round(decimals=3) + return df_scores + + +# Let's define an empty dataframe to store the results +df_scores = pd.DataFrame() + +############################################################################### +# Dummy baseline +# .............. +# +# Before to train a real machine learning model, we can store the results +# obtained with our `DummyClassifier`. + +df_scores = evaluate_classifier(dummy_clf, df_scores, "Dummy") +df_scores + +############################################################################### +# Linear classifier baseline +# .......................... +# +# We will create a machine learning pipeline using a `LogisticRegression` +# classifier. In this regard, we will need to one-hot encode the categorical +# columns and standardized the numerical columns before to inject the data into +# the `LogisticRegression` classifier. +# +# First, we define our numerical and categorical pipelines. + +from sklearn.impute import SimpleImputer +from sklearn.preprocessing import StandardScaler +from sklearn.preprocessing import OneHotEncoder +from sklearn.pipeline import make_pipeline + +num_pipe = make_pipeline( + StandardScaler(), SimpleImputer(strategy="mean", add_indicator=True) +) +cat_pipe = make_pipeline( + SimpleImputer(strategy="constant", fill_value="missing"), + OneHotEncoder(handle_unknown="ignore") +) + +############################################################################### +# Then, we can create a preprocessor which will dispatch the categorical +# columns to the categorical pipeline and the numerical columns to the +# numerical pipeline + +import numpy as np +from sklearn.compose import ColumnTransformer +from sklearn.compose import make_column_selector as selector + +preprocessor_linear = ColumnTransformer( + [("num-pipe", num_pipe, selector(dtype_include=np.number)), + ("cat-pipe", cat_pipe, selector(dtype_include=pd.CategoricalDtype))] +) + +############################################################################### +# Finally, we connect our preprocessor with our `LogisticRegression`. 
We can +# then evaluate our model. + +from sklearn.linear_model import LogisticRegression + +lr_clf = make_pipeline( + preprocessor_linear, LogisticRegression(max_iter=1000) +) +df_scores = evaluate_classifier(lr_clf, df_scores, "LR") +df_scores + +############################################################################### +# We can see that our linear model is learning slightly better than our dummy +# baseline. However, it is impacted by class imbalanced. +# +# We can verify that something similar is happening with a tree-based model +# such as `RandomForestClassifier`. With this type of classifier, we will not +# need to scale the numerical data, and we will only need to ordinal encode the +# categorical data. + +from sklearn.preprocessing import OrdinalEncoder +from sklearn.ensemble import RandomForestClassifier + +cat_pipe = make_pipeline( + SimpleImputer(strategy="constant", fill_value="missing"), + OrdinalEncoder() +) + +preprocessor_tree = ColumnTransformer( + [("num-pipe", num_pipe, selector(dtype_include=np.number)), + ("cat-pipe", cat_pipe, selector(dtype_include=pd.CategoricalDtype))] +) + +rf_clf = make_pipeline( + preprocessor_tree, RandomForestClassifier(random_state=42) +) + +df_scores = evaluate_classifier(rf_clf, df_scores, "RF") +df_scores + +############################################################################### +# The `RandomForestClassifier` is as well affected by the class imbalanced, +# slightly less than the linear model. Now, we will present different approach +# to improve the performance of these 2 models. +# +# Use `class_weight` +# .................. +# +# Most of the models in `scikit-learn` have a parameter `class_weight`. This +# parameter will affect the computation of the loss in linear model or the +# criterion in the tree-based model to penalize differently a false +# classification from the minority and majority class. We can set +# `class_weight="balanced"` such that the weight applied is inversely +# proportional to the class frequency. We test this parametrization in both +# linear model and tree-based model. + +lr_clf.set_params(logisticregression__class_weight="balanced") +df_scores = evaluate_classifier( + lr_clf, df_scores, "LR with class weight" +) +df_scores + +############################################################################### +# + +rf_clf.set_params(randomforestclassifier__class_weight="balanced") +df_scores = evaluate_classifier( + rf_clf, df_scores, "RF with class weight" +) +df_scores + +############################################################################### +# We can see that using `class_weight` was really effective for the linear +# model, alleviating the issue of learning from imbalanced classes. However, +# the `RandomForestClassifier` is still biased toward the majority class, +# mainly due to the criterion which is not suited enough to fight the class +# imbalance. +# +# Resample the training set during learning +# ......................................... +# +# Another way is to resample the training set by under-sampling or +# over-sampling some of the samples. `imbalanced-learn` provides some samplers +# to do such precessing. 
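+#
+# As a standalone aside (an illustrative sketch added here, not used by the
+# pipelines below; the `*_demo` names exist only for this snippet), a sampler
+# exposes `fit_resample` directly. Since pandas objects given in are returned
+# as pandas objects, `value_counts` can be called on the resampled target to
+# check that both classes now have the same number of samples.
+
+from imblearn.under_sampling import RandomUnderSampler
+
+X_demo, y_demo = RandomUnderSampler(random_state=42).fit_resample(
+    X_train, y_train
+)
+# X_demo is a pandas.DataFrame and y_demo a pandas.Series (name preserved)
+y_demo.value_counts()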
+ +from imblearn.pipeline import make_pipeline as make_pipeline_with_sampler +from imblearn.under_sampling import RandomUnderSampler + +lr_clf = make_pipeline_with_sampler( + preprocessor_linear, + RandomUnderSampler(random_state=42), + LogisticRegression(max_iter=1000) +) +df_scores = evaluate_classifier( + lr_clf, df_scores, "LR with under-sampling" +) +df_scores + +############################################################################### +# + +rf_clf = make_pipeline_with_sampler( + preprocessor_tree, + RandomUnderSampler(random_state=42), + RandomForestClassifier(random_state=42) +) + +df_scores = evaluate_classifier( + rf_clf, df_scores, "RF with under-sampling" +) +df_scores + +############################################################################### +# Applying a random under-sampler before to train the linear model or random +# forest, allows to not focus on the majority class at the cost of making more +# mistake for samples in the majority class (i.e. decreased accuracy). +# +# We could apply any type of samplers and find which sampler is working best +# on the current dataset. +# +# Instead, we will present another way by using classifiers which will apply +# sampling internally. +# +# Use of `BalancedRandomForestClassifier` and `BalancedBaggingClassifier` +# ....................................................................... +# +# We already show that random under-sampling can be effective on decision tree. +# However, instead of under-sampling once the dataset, one could under-sample +# the original dataset before to take a bootstrap sample. This is the base of +# the `BalancedRandomForestClassifier` and `BalancedBaggingClassifier`. + +from imblearn.ensemble import BalancedRandomForestClassifier + +rf_clf = make_pipeline( + preprocessor_tree, + BalancedRandomForestClassifier(random_state=42) +) + +df_scores = evaluate_classifier(rf_clf, df_scores, "Balanced RF") +df_scores + +############################################################################### +# The performance with the `BalancedRandomForestClassifier` are better than +# applying a single random under-sampling. We will use a gradient-boosting +# classifier within a `BalancedBaggingClassifier`. + +from sklearn.experimental import enable_hist_gradient_boosting +from sklearn.ensemble import HistGradientBoostingClassifier +from imblearn.ensemble import BalancedBaggingClassifier + +bag_clf = make_pipeline( + preprocessor_tree, + BalancedBaggingClassifier( + base_estimator=HistGradientBoostingClassifier(random_state=42), + n_estimators=10, random_state=42 + ) +) + +df_scores = evaluate_classifier( + bag_clf, df_scores, "Balanced bagging" +) +df_scores + +############################################################################### +# This last approach is the most effective. The different under-sampling allows +# to bring some diversity for the different GBDT to learn and not focus on a +# portion of the majority class. +# +# We will repeat the same experiment but a ratio of 100:1 and make a similar +# analysis. 
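+#
+# Before moving to a more imbalanced setting, a quick aside (an illustrative
+# sketch added here, not part of the comparison above): the per-class
+# behaviour of the last fitted bagging pipeline can be inspected with
+# scikit-learn's `classification_report`.
+
+from sklearn.metrics import classification_report
+
+print(classification_report(y_test, bag_clf.predict(X_test)))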
+ +############################################################################### +# Increase imbalanced ratio +############################################################################### + +ratio = 100 +df_res, y_res = make_imbalance( + df, y, sampling_strategy={ + classes_count.idxmin(): classes_count.max() // ratio + } +) +X_train, X_test, y_train, y_test = train_test_split( + df_res, y_res, stratify=y_res, random_state=42 +) + +df_scores = pd.DataFrame() +df_scores = evaluate_classifier(dummy_clf, df_scores, "Dummy") +lr_clf = make_pipeline( + preprocessor_linear, LogisticRegression(max_iter=1000) +) +df_scores = evaluate_classifier(lr_clf, df_scores, "LR") +rf_clf = make_pipeline( + preprocessor_tree, RandomForestClassifier(random_state=42) +) +df_scores = evaluate_classifier(rf_clf, df_scores, "RF") +lr_clf.set_params(logisticregression__class_weight="balanced") +df_scores = evaluate_classifier( + lr_clf, df_scores, "LR with class weight" +) +rf_clf.set_params(randomforestclassifier__class_weight="balanced") +df_scores = evaluate_classifier( + rf_clf, df_scores, "RF with class weight" +) +lr_clf = make_pipeline_with_sampler( + preprocessor_linear, + RandomUnderSampler(random_state=42), + LogisticRegression(max_iter=1000) +) +df_scores = evaluate_classifier( + lr_clf, df_scores, "LR with under-sampling" +) +rf_clf = make_pipeline_with_sampler( + preprocessor_tree, + RandomUnderSampler(random_state=42), + RandomForestClassifier(random_state=42) +) +df_scores = evaluate_classifier( + rf_clf, df_scores, "RF with under-sampling" +) +rf_clf = make_pipeline( + preprocessor_tree, + BalancedRandomForestClassifier(random_state=42) +) +df_scores = evaluate_classifier(rf_clf, df_scores) +df_scores = evaluate_classifier( + bag_clf, df_scores, "Balanced bagging" +) +df_scores From 4d0a62982d3bb3865c1e6e5c98302858fb3019b3 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 19 Nov 2019 16:22:50 +0100 Subject: [PATCH 07/11] Apply suggestions from code review Co-Authored-By: Christos Aridas --- .../plot_impact_imbalanced_classes.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/examples/applications/plot_impact_imbalanced_classes.py b/examples/applications/plot_impact_imbalanced_classes.py index 9b72139ee..4fa22f5cc 100644 --- a/examples/applications/plot_impact_imbalanced_classes.py +++ b/examples/applications/plot_impact_imbalanced_classes.py @@ -1,7 +1,7 @@ """ ======================================================================== Model fitting on imbalanced dataset and comparison of methods to improve -performance +its performance ======================================================================== This example illustrates the problem induced by learning on datasets having @@ -39,7 +39,7 @@ ############################################################################### # This dataset is only slightly imbalanced. To better highlight the effect of -# learning from imbalanced dataset, we will increase this ratio to 30:1 +# learning from an imbalanced dataset, we will increase its ratio to 30:1 from imblearn.datasets import make_imbalance @@ -87,7 +87,7 @@ ############################################################################### ############################################################################### -# We will first define an helper function which will train a given model +# We will first define a helper function which will train a given model # and compute both accuracy and balanced accuracy. 
The results will be stored # in a dataframe @@ -177,7 +177,7 @@ def evaluate_classifier(clf, df_scores, clf_name=None): ############################################################################### # We can see that our linear model is learning slightly better than our dummy -# baseline. However, it is impacted by class imbalanced. +# baseline. However, it is impacted by the class imbalance. # # We can verify that something similar is happening with a tree-based model # such as `RandomForestClassifier`. With this type of classifier, we will not @@ -247,7 +247,7 @@ def evaluate_classifier(clf, df_scores, clf_name=None): # # Another way is to resample the training set by under-sampling or # over-sampling some of the samples. `imbalanced-learn` provides some samplers -# to do such precessing. +# to do such processing. from imblearn.pipeline import make_pipeline as make_pipeline_with_sampler from imblearn.under_sampling import RandomUnderSampler @@ -277,7 +277,7 @@ def evaluate_classifier(clf, df_scores, clf_name=None): df_scores ############################################################################### -# Applying a random under-sampler before to train the linear model or random +# Applying a random under-sampler before the training of the linear model or random # forest, allows to not focus on the majority class at the cost of making more # mistake for samples in the majority class (i.e. decreased accuracy). # @@ -290,7 +290,7 @@ def evaluate_classifier(clf, df_scores, clf_name=None): # Use of `BalancedRandomForestClassifier` and `BalancedBaggingClassifier` # ....................................................................... # -# We already show that random under-sampling can be effective on decision tree. +# We already showed that random under-sampling can be effective on decision tree. # However, instead of under-sampling once the dataset, one could under-sample # the original dataset before to take a bootstrap sample. This is the base of # the `BalancedRandomForestClassifier` and `BalancedBaggingClassifier`. @@ -306,7 +306,7 @@ def evaluate_classifier(clf, df_scores, clf_name=None): df_scores ############################################################################### -# The performance with the `BalancedRandomForestClassifier` are better than +# The performance with the `BalancedRandomForestClassifier` is better than # applying a single random under-sampling. We will use a gradient-boosting # classifier within a `BalancedBaggingClassifier`. @@ -332,7 +332,7 @@ def evaluate_classifier(clf, df_scores, clf_name=None): # to bring some diversity for the different GBDT to learn and not focus on a # portion of the majority class. # -# We will repeat the same experiment but a ratio of 100:1 and make a similar +# We will repeat the same experiment but with a ratio of 100:1 and make a similar # analysis. 
############################################################################### From db914bf066ed525af7b9a4ebcc66e0325da47f5d Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 19 Nov 2019 16:41:32 +0100 Subject: [PATCH 08/11] Update plot_impact_imbalanced_classes.py --- examples/applications/plot_impact_imbalanced_classes.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/examples/applications/plot_impact_imbalanced_classes.py b/examples/applications/plot_impact_imbalanced_classes.py index 4fa22f5cc..cd4b92e62 100644 --- a/examples/applications/plot_impact_imbalanced_classes.py +++ b/examples/applications/plot_impact_imbalanced_classes.py @@ -32,8 +32,6 @@ ############################################################################### # The "adult" dataset as a class ratio of about 3:1 -from collections import Counter - classes_count = y.value_counts() classes_count From 802cc446f3c71f94a4e7041d33a7ae00b402b08c Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 19 Nov 2019 22:49:00 +0100 Subject: [PATCH 09/11] increase n_jobs --- .../plot_impact_imbalanced_classes.py | 40 ++++++++++--------- 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/examples/applications/plot_impact_imbalanced_classes.py b/examples/applications/plot_impact_imbalanced_classes.py index cd4b92e62..5ce1fb14b 100644 --- a/examples/applications/plot_impact_imbalanced_classes.py +++ b/examples/applications/plot_impact_imbalanced_classes.py @@ -158,7 +158,8 @@ def evaluate_classifier(clf, df_scores, clf_name=None): preprocessor_linear = ColumnTransformer( [("num-pipe", num_pipe, selector(dtype_include=np.number)), - ("cat-pipe", cat_pipe, selector(dtype_include=pd.CategoricalDtype))] + ("cat-pipe", cat_pipe, selector(dtype_include=pd.CategoricalDtype))], + n_jobs=2 ) ############################################################################### @@ -192,11 +193,12 @@ def evaluate_classifier(clf, df_scores, clf_name=None): preprocessor_tree = ColumnTransformer( [("num-pipe", num_pipe, selector(dtype_include=np.number)), - ("cat-pipe", cat_pipe, selector(dtype_include=pd.CategoricalDtype))] + ("cat-pipe", cat_pipe, selector(dtype_include=pd.CategoricalDtype))], + n_jobs=2 ) rf_clf = make_pipeline( - preprocessor_tree, RandomForestClassifier(random_state=42) + preprocessor_tree, RandomForestClassifier(random_state=42, n_jobs=2) ) df_scores = evaluate_classifier(rf_clf, df_scores, "RF") @@ -266,7 +268,7 @@ def evaluate_classifier(clf, df_scores, clf_name=None): rf_clf = make_pipeline_with_sampler( preprocessor_tree, RandomUnderSampler(random_state=42), - RandomForestClassifier(random_state=42) + RandomForestClassifier(random_state=42, n_jobs=2) ) df_scores = evaluate_classifier( @@ -275,9 +277,10 @@ def evaluate_classifier(clf, df_scores, clf_name=None): df_scores ############################################################################### -# Applying a random under-sampler before the training of the linear model or random -# forest, allows to not focus on the majority class at the cost of making more -# mistake for samples in the majority class (i.e. decreased accuracy). +# Applying a random under-sampler before the training of the linear model or +# random forest, allows to not focus on the majority class at the cost of +# making more mistake for samples in the majority class (i.e. decreased +# accuracy). # # We could apply any type of samplers and find which sampler is working best # on the current dataset. 
@@ -288,16 +291,17 @@ def evaluate_classifier(clf, df_scores, clf_name=None): # Use of `BalancedRandomForestClassifier` and `BalancedBaggingClassifier` # ....................................................................... # -# We already showed that random under-sampling can be effective on decision tree. -# However, instead of under-sampling once the dataset, one could under-sample -# the original dataset before to take a bootstrap sample. This is the base of -# the `BalancedRandomForestClassifier` and `BalancedBaggingClassifier`. +# We already showed that random under-sampling can be effective on decision +# tree. However, instead of under-sampling once the dataset, one could +# under-sample the original dataset before to take a bootstrap sample. This is +# the base of the `BalancedRandomForestClassifier` and +# `BalancedBaggingClassifier`. from imblearn.ensemble import BalancedRandomForestClassifier rf_clf = make_pipeline( preprocessor_tree, - BalancedRandomForestClassifier(random_state=42) + BalancedRandomForestClassifier(random_state=42, n_jobs=2) ) df_scores = evaluate_classifier(rf_clf, df_scores, "Balanced RF") @@ -316,7 +320,7 @@ def evaluate_classifier(clf, df_scores, clf_name=None): preprocessor_tree, BalancedBaggingClassifier( base_estimator=HistGradientBoostingClassifier(random_state=42), - n_estimators=10, random_state=42 + n_estimators=10, random_state=42, n_jobs=2 ) ) @@ -330,8 +334,8 @@ def evaluate_classifier(clf, df_scores, clf_name=None): # to bring some diversity for the different GBDT to learn and not focus on a # portion of the majority class. # -# We will repeat the same experiment but with a ratio of 100:1 and make a similar -# analysis. +# We will repeat the same experiment but with a ratio of 100:1 and make a +# similar analysis. 
############################################################################### # Increase imbalanced ratio @@ -354,7 +358,7 @@ def evaluate_classifier(clf, df_scores, clf_name=None): ) df_scores = evaluate_classifier(lr_clf, df_scores, "LR") rf_clf = make_pipeline( - preprocessor_tree, RandomForestClassifier(random_state=42) + preprocessor_tree, RandomForestClassifier(random_state=42, n_jobs=2) ) df_scores = evaluate_classifier(rf_clf, df_scores, "RF") lr_clf.set_params(logisticregression__class_weight="balanced") @@ -376,14 +380,14 @@ def evaluate_classifier(clf, df_scores, clf_name=None): rf_clf = make_pipeline_with_sampler( preprocessor_tree, RandomUnderSampler(random_state=42), - RandomForestClassifier(random_state=42) + RandomForestClassifier(random_state=42, n_jobs=2) ) df_scores = evaluate_classifier( rf_clf, df_scores, "RF with under-sampling" ) rf_clf = make_pipeline( preprocessor_tree, - BalancedRandomForestClassifier(random_state=42) + BalancedRandomForestClassifier(random_state=42, n_jobs=2) ) df_scores = evaluate_classifier(rf_clf, df_scores) df_scores = evaluate_classifier( From 218241745a7cb0a6ceffe7a02dc19ece0d679a4f Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 19 Nov 2019 23:35:03 +0100 Subject: [PATCH 10/11] DOC add conclusion --- examples/applications/plot_impact_imbalanced_classes.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/examples/applications/plot_impact_imbalanced_classes.py b/examples/applications/plot_impact_imbalanced_classes.py index 5ce1fb14b..657fe105d 100644 --- a/examples/applications/plot_impact_imbalanced_classes.py +++ b/examples/applications/plot_impact_imbalanced_classes.py @@ -394,3 +394,10 @@ def evaluate_classifier(clf, df_scores, clf_name=None): bag_clf, df_scores, "Balanced bagging" ) df_scores + +############################################################################### +# When we analyse the results, we can draw a similar conclusion than in the +# previous discussion. However, we can observe that the strategy +# `class_weight="balanced"` does not improve the performance. A resampling is +# indeed required. The most effective method remains the +# `BalancedBaggingClassifier` using a GBDT as a base learner. From 8b11705670d990cb806c17b68fb6fa0897367194 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 19 Nov 2019 23:36:24 +0100 Subject: [PATCH 11/11] typos --- examples/applications/plot_impact_imbalanced_classes.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/examples/applications/plot_impact_imbalanced_classes.py b/examples/applications/plot_impact_imbalanced_classes.py index 657fe105d..97fcb7131 100644 --- a/examples/applications/plot_impact_imbalanced_classes.py +++ b/examples/applications/plot_impact_imbalanced_classes.py @@ -396,8 +396,9 @@ def evaluate_classifier(clf, df_scores, clf_name=None): df_scores ############################################################################### -# When we analyse the results, we can draw a similar conclusion than in the +# When we analyse the results, we can draw similar conclusions than in the # previous discussion. However, we can observe that the strategy -# `class_weight="balanced"` does not improve the performance. A resampling is -# indeed required. The most effective method remains the -# `BalancedBaggingClassifier` using a GBDT as a base learner. +# `class_weight="balanced"` does not improve the performance when using a +# `RandomForestClassifier`. A resampling is indeed required. 
The most effective +# method remains the `BalancedBaggingClassifier` using a GBDT as a base +# learner.
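+
+###############################################################################
+# Appendix: pandas in/out behaviour of the samplers
+###############################################################################
+# A minimal, self-contained sketch (added here for illustration; it mirrors
+# the `check_samplers_pandas` test updated in this series). A sampler given a
+# pandas DataFrame and Series returns pandas objects with the column names,
+# dtypes and target name preserved. The `feat_*` column names and the `class`
+# target name are only used for this snippet.
+
+import pandas as pd
+from sklearn.datasets import make_classification
+from imblearn.under_sampling import RandomUnderSampler
+
+X_arr, y_arr = make_classification(
+    n_samples=100, weights=[0.9, 0.1], random_state=0
+)
+X_df = pd.DataFrame(
+    X_arr, columns=[f"feat_{i}" for i in range(X_arr.shape[1])]
+)
+y_ser = pd.Series(y_arr, name="class")
+
+X_res, y_res = RandomUnderSampler(random_state=0).fit_resample(X_df, y_ser)
+assert isinstance(X_res, pd.DataFrame) and isinstance(y_res, pd.Series)
+assert list(X_res.columns) == list(X_df.columns)
+assert y_res.name == "class"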