
Commit fe51ca3

EXA: improve FunctionTransformer example
1 parent e49c30a commit fe51ca3

1 file changed: +30 -2 lines changed

examples/plot_outlier_rejections.py

Lines changed: 30 additions & 2 deletions
@@ -30,14 +30,21 @@
 
 
 def plot_scatter(X, y, title):
+    """Function to plot some data as a scatter plot."""
     plt.figure()
     plt.scatter(X[y == 1, 0], X[y == 1, 1], label='Class #1')
     plt.scatter(X[y == 0, 0], X[y == 0, 1], label='Class #0')
     plt.legend()
     plt.title(title)
 
+##############################################################################
+# Toy data generation
+##############################################################################
+
+##############################################################################
+# We are generating a non-Gaussian dataset contaminated with some uniform
+# noise.
 
-# Generate contaminated training data
 moons, _ = make_moons(n_samples=500, noise=0.05)
 blobs, _ = make_blobs(n_samples=500, centers=[(-0.75, 2.25),
                                               (1.0, 2.0)],
@@ -51,7 +58,9 @@ def plot_scatter(X, y, title):
 
 plot_scatter(X_train, y_train, 'Training dataset')
 
-# Generate non-contaminated testing data
+##############################################################################
+# We will generate some clean test data without outliers.
+
 moons, _ = make_moons(n_samples=50, noise=0.05)
 blobs, _ = make_blobs(n_samples=50, centers=[(-0.75, 2.25),
                                              (1.0, 2.0)],
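As a reading aid, here is a minimal sketch of how a contaminated training set of this kind can be assembled. The stacking of the moons, the blobs and the uniform noise happens outside the hunks shown above, so the noise bounds, cluster_std and the labels given to the outliers below are illustrative assumptions rather than values taken from this commit.

import numpy as np
from sklearn.datasets import make_blobs, make_moons

rng = np.random.RandomState(42)

# Two half-moons and two Gaussian blobs form the two "clean" classes.
moons, _ = make_moons(n_samples=500, noise=0.05)
blobs, _ = make_blobs(n_samples=500, centers=[(-0.75, 2.25), (1.0, 2.0)],
                      cluster_std=0.25)  # cluster_std is an assumption

# Uniform contamination; bounds and outlier labels are assumptions.
outliers = rng.uniform(low=-3, high=3, size=(500, 2))

X_train = np.vstack([moons, blobs, outliers])
y_train = np.hstack([np.ones(moons.shape[0], dtype=int),
                     np.zeros(blobs.shape[0], dtype=int),
                     rng.randint(0, 2, size=outliers.shape[0])])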
@@ -62,8 +71,19 @@ def plot_scatter(X, y, title):
 
 plot_scatter(X_test, y_test, 'Testing dataset')
 
+##############################################################################
+# How to use the :class:`imblearn.FunctionSampler`
+##############################################################################
+
+##############################################################################
+# We first define a function which will use
+# :class:`sklearn.ensemble.IsolationForest` to eliminate some outliers from
+# our dataset during training. The function passed to the
+# :class:`imblearn.FunctionSampler` will be called when using the method
+# ``fit_resample``.
 
 def outlier_rejection(X, y):
+    """This will be our function used to resample our dataset."""
     model = IsolationForest(max_samples=100,
                             contamination=0.4,
                             random_state=rng)
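The hunk above stops at the IsolationForest constructor. For context, a self-contained sketch of the resampling step being documented could look like the following; the body of outlier_rejection after the constructor is an assumption consistent with the comments (IsolationForest.predict returns +1 for inliers and -1 for outliers), while FunctionSampler and fit_resample are the imblearn API named in the diff.

import numpy as np
from sklearn.datasets import make_moons
from sklearn.ensemble import IsolationForest

from imblearn import FunctionSampler

rng = np.random.RandomState(42)
X, y = make_moons(n_samples=500, noise=0.05)


def outlier_rejection(X, y):
    """Keep only the samples that IsolationForest flags as inliers."""
    model = IsolationForest(max_samples=100,
                            contamination=0.4,
                            random_state=rng)
    model.fit(X)
    y_pred = model.predict(X)  # +1 for inliers, -1 for outliers
    return X[y_pred == 1], y[y_pred == 1]


# The wrapped function is only called through fit_resample, i.e. at fit time,
# so the rejection only ever touches training data.
reject_sampler = FunctionSampler(func=outlier_rejection)
X_inliers, y_inliers = reject_sampler.fit_resample(X, y)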
@@ -76,6 +96,14 @@ def outlier_rejection(X, y):
 X_inliers, y_inliers = reject_sampler.fit_resample(X_train, y_train)
 plot_scatter(X_inliers, y_inliers, 'Training data without outliers')
 
+##############################################################################
+# Integrate it within a pipeline
+##############################################################################
+
+##############################################################################
+# By eliminating outliers before training, the classifier will be less
+# affected during prediction.
+
 pipe = make_pipeline(FunctionSampler(func=outlier_rejection),
                      LogisticRegression(random_state=rng))
 y_pred = pipe.fit(X_train, y_train).predict(X_test)
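The new comment claims that rejecting outliers before training leaves the classifier less affected at prediction time. A hedged sketch of how one could check that against a plain classifier is given below; it reuses X_train, y_train, X_test, y_test, rng and outlier_rejection from the example, and the classification_report comparison is an illustration rather than part of this commit.

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

from imblearn import FunctionSampler
from imblearn.pipeline import make_pipeline

# Pipeline version: outliers are rejected during fit, prediction is untouched.
pipe = make_pipeline(FunctionSampler(func=outlier_rejection),
                     LogisticRegression(random_state=rng))
y_pred = pipe.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, y_pred))

# Baseline trained on the contaminated data, for comparison.
clf = LogisticRegression(random_state=rng)
y_pred = clf.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, y_pred))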
