ENH allows pandas series in/out for the target (#647)

glemaitre · web-flow · commit f356284023d2 · 2019-11-18T00:13:57.000+01:00
diff --git a/doc/introduction.rst b/doc/introduction.rst
@@ -34,7 +34,7 @@ The output will be of the following type:
 
 * ``data_resampled``: array-like (2-D list, pandas.Dataframe, numpy.array) or
    sparse matrices;
-   * ``targets_resampled``: 1-D numpy.array.
+   * ``targets_resampled``: 1-D numpy.array or pandas.Series.
 
 .. topic:: Sparse input
 
diff --git a/doc/whats_new/v0.6.rst b/doc/whats_new/v0.6.rst
@@ -57,7 +57,8 @@ Enhancement
 - :class:`imblearn.under_sampling.RandomUnderSampling`,
   :class:`imblearn.over_sampling.RandomOverSampling`,
   :class:`imblearn.datasets.make_imbalance` accepts Pandas DataFrame in and
-  will output Pandas DataFrame.
+  will output Pandas DataFrame. Similarly, it will accept Pandas Series in and
+  will output Pandas Series.
   :pr:`636` by :user:`Guillaume Lemaitre <glemaitre>`.
 
 - :class:`imblearn.FunctionSampler` accepts a parameter ``validate`` allowing
diff --git a/imblearn/base.py b/imblearn/base.py
@@ -80,20 +80,22 @@ def fit_resample(self, X, y):
 
         output = self._fit_resample(X, y)
 
-        if self._columns is not None:
+        if self._X_columns is not None or self._y_name is not None:
             import pandas as pd
-            X_ = pd.DataFrame(output[0], columns=self._columns)
+
+        if self._X_columns is not None:
+            X_ = pd.DataFrame(output[0], columns=self._X_columns)
+            X_ = X_.astype(self._X_dtypes)
         else:
             X_ = output[0]
 
-        if binarize_y:
-            y_sampled = label_binarize(output[1], np.unique(y))
-            if len(output) == 2:
-                return X_, y_sampled
-            return X_, y_sampled, output[2]
-        if len(output) == 2:
-            return X_, output[1]
-        return X_, output[1], output[2]
+        y_ = (label_binarize(output[1], np.unique(y))
+              if binarize_y else output[1])
+
+        if self._y_name is not None:
+            y_ = pd.Series(y_, dtype=self._y_dtype, name=self._y_name)
+
+        return (X_, y_) if len(output) == 2 else (X_, y_, output[2])
 
     #  define an alias for back-compatibility
     fit_sample = fit_resample
@@ -135,8 +137,22 @@ def __init__(self, sampling_strategy="auto"):
         self.sampling_strategy = sampling_strategy
 
     def _check_X_y(self, X, y, accept_sparse=None):
-        # store the columns name to reconstruct a dataframe
-        self._columns = X.columns if hasattr(X, "loc") else None
+        if hasattr(X, "loc"):
+            # store information to build dataframe
+            self._X_columns = X.columns
+            self._X_dtypes = X.dtypes
+        else:
+            self._X_columns = None
+            self._X_dtypes = None
+
+        if hasattr(y, "loc"):
+            # store information to build a series
+            self._y_name = y.name
+            self._y_dtype = y.dtype
+        else:
+            self._y_name = None
+            self._y_dtype = None
+
         if accept_sparse is None:
             accept_sparse = ["csr", "csc"]
         y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
@@ -263,20 +279,24 @@ def fit_resample(self, X, y):
 
         output = self._fit_resample(X, y)
 
-        if self._columns is not None:
-            import pandas as pd
-            X_ = pd.DataFrame(output[0], columns=self._columns)
-        else:
-            X_ = output[0]
+        if self.validate:
+            if self._X_columns is not None or self._y_name is not None:
+                import pandas as pd
 
-        if self.validate and binarize_y:
-            y_sampled = label_binarize(output[1], np.unique(y))
-            if len(output) == 2:
-                return X_, y_sampled
-            return X_, y_sampled, output[2]
-        if len(output) == 2:
-            return X_, output[1]
-        return X_, output[1], output[2]
+            if self._X_columns is not None:
+                X_ = pd.DataFrame(output[0], columns=self._X_columns)
+                X_ = X_.astype(self._X_dtypes)
+            else:
+                X_ = output[0]
+
+            y_ = (label_binarize(output[1], np.unique(y))
+                  if binarize_y else output[1])
+
+            if self._y_name is not None:
+                y_ = pd.Series(y_, dtype=self._y_dtype, name=self._y_name)
+
+            return (X_, y_) if len(output) == 2 else (X_, y_, output[2])
+        return output
 
     def _fit_resample(self, X, y):
         func = _identity if self.func is None else self.func
diff --git a/imblearn/over_sampling/_random_over_sampler.py b/imblearn/over_sampling/_random_over_sampler.py
@@ -75,8 +75,22 @@ def __init__(self, sampling_strategy="auto", random_state=None):
         self.random_state = random_state
 
     def _check_X_y(self, X, y):
-        # store the columns name to reconstruct a dataframe
-        self._columns = X.columns if hasattr(X, "loc") else None
+        if hasattr(X, "loc"):
+            # store information to build dataframe
+            self._X_columns = X.columns
+            self._X_dtypes = X.dtypes
+        else:
+            self._X_columns = None
+            self._X_dtypes = None
+
+        if hasattr(y, "loc"):
+            # store information to build a series
+            self._y_name = y.name
+            self._y_dtype = y.dtype
+        else:
+            self._y_name = None
+            self._y_dtype = None
+
         y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
         X = check_array(X, accept_sparse=["csr", "csc"], dtype=None,
                         force_all_finite=False)
diff --git a/imblearn/over_sampling/_smote.py b/imblearn/over_sampling/_smote.py
@@ -892,8 +892,22 @@ def _check_X_y(self, X, y):
         """Overwrite the checking to let pass some string for categorical
         features.
         """
-        # store the columns name to reconstruct a dataframe
-        self._columns = X.columns if hasattr(X, "loc") else None
+        if hasattr(X, "loc"):
+            # store information to build dataframe
+            self._X_columns = X.columns
+            self._X_dtypes = X.dtypes
+        else:
+            self._X_columns = None
+            self._X_dtypes = None
+
+        if hasattr(y, "loc"):
+            # store information to build a series
+            self._y_name = y.name
+            self._y_dtype = y.dtype
+        else:
+            self._y_name = None
+            self._y_dtype = None
+
         y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
         X, y = check_X_y(X, y, accept_sparse=["csr", "csc"], dtype=None)
         return X, y, binarize_y
diff --git a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py
@@ -81,8 +81,22 @@ def __init__(
         self.replacement = replacement
 
     def _check_X_y(self, X, y):
-        # store the columns name to reconstruct a dataframe
-        self._columns = X.columns if hasattr(X, "loc") else None
+        if hasattr(X, "loc"):
+            # store information to build dataframe
+            self._X_columns = X.columns
+            self._X_dtypes = X.dtypes
+        else:
+            self._X_columns = None
+            self._X_dtypes = None
+
+        if hasattr(y, "loc"):
+            # store information to build a series
+            self._y_name = y.name
+            self._y_dtype = y.dtype
+        else:
+            self._y_name = None
+            self._y_dtype = None
+
         y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
         X = check_array(X, accept_sparse=["csr", "csc"], dtype=None,
                         force_all_finite=False)
diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py
@@ -242,6 +242,7 @@ def check_samplers_pandas(name, Sampler):
         random_state=0,
     )
     X_pd = pd.DataFrame(X, columns=[str(i) for i in range(X.shape[1])])
+    y_pd = pd.Series(y, name="class")
     sampler = Sampler()
     if isinstance(Sampler(), NearMiss):
         samplers = [Sampler(version=version) for version in (1, 2, 3)]
@@ -251,14 +252,16 @@ def check_samplers_pandas(name, Sampler):
 
     for sampler in samplers:
         set_random_state(sampler)
-        X_res_pd, y_res_pd = sampler.fit_resample(X_pd, y)
+        X_res_pd, y_res_pd = sampler.fit_resample(X_pd, y_pd)
         X_res, y_res = sampler.fit_resample(X, y)
 
         # check that we return a pandas dataframe if a dataframe was given in
         assert isinstance(X_res_pd, pd.DataFrame)
+        assert isinstance(y_res_pd, pd.Series)
         assert X_pd.columns.to_list() == X_res_pd.columns.to_list()
+        assert y_pd.name == y_res_pd.name
         assert_allclose(X_res_pd.to_numpy(), X_res)
-        assert_allclose(y_res_pd, y_res)
+        assert_allclose(y_res_pd.to_numpy(), y_res)
 
 
 def check_samplers_multiclass_ova(name, Sampler):