ENH Allows pandas series in/out

glemaitre · glemaitre · commit 77d3dec65bc0 · 2019-11-17T23:32:53.000+01:00
diff --git a/doc/introduction.rst b/doc/introduction.rst
@@ -34,7 +34,7 @@ The output will be of the following type:
 
 * ``data_resampled``: array-like (2-D list, pandas.Dataframe, numpy.array) or
    sparse matrices;
-   * ``targets_resampled``: 1-D numpy.array.
+   * ``targets_resampled``: 1-D numpy.array or pandas.Series.
 
 .. topic:: Sparse input
 
diff --git a/doc/whats_new/v0.6.rst b/doc/whats_new/v0.6.rst
@@ -57,7 +57,8 @@ Enhancement
 - :class:`imblearn.under_sampling.RandomUnderSampling`,
   :class:`imblearn.over_sampling.RandomOverSampling`,
   :class:`imblearn.datasets.make_imbalance` accepts Pandas DataFrame in and
-  will output Pandas DataFrame.
+  will output Pandas DataFrame. Similarly, it will accept Pandas Series in and
+  will output Pandas Series.
   :pr:`636` by :user:`Guillaume Lemaitre <glemaitre>`.
 
 - :class:`imblearn.FunctionSampler` accepts a parameter ``validate`` allowing
diff --git a/imblearn/base.py b/imblearn/base.py
@@ -80,20 +80,28 @@ def fit_resample(self, X, y):
 
         output = self._fit_resample(X, y)
 
-        if self._columns is not None:
+        if self._X_columns is not None or self._y_name is not None:
             import pandas as pd
-            X_ = pd.DataFrame(output[0], columns=self._columns)
+
+        if self._X_columns is not None:
+            X_ = pd.DataFrame(output[0], columns=self._X_columns)
+            X_ = X_.astype(self._X_dtypes)
         else:
             X_ = output[0]
 
+        y_ = (label_binarize(output[1], np.unique(y))
+              if binarize_y else output[1])
+
+        if self._y_name is not None:
+            y_ = pd.Series(y_, dtype=self._y_dtype, name=self._y_name)
+
         if binarize_y:
-            y_sampled = label_binarize(output[1], np.unique(y))
             if len(output) == 2:
-                return X_, y_sampled
-            return X_, y_sampled, output[2]
+                return X_, y_
+            return X_, y_, output[2]
         if len(output) == 2:
-            return X_, output[1]
-        return X_, output[1], output[2]
+            return X_, y_
+        return X_, y_, output[2]
 
     #  define an alias for back-compatibility
     fit_sample = fit_resample
@@ -135,8 +143,22 @@ def __init__(self, sampling_strategy="auto"):
         self.sampling_strategy = sampling_strategy
 
     def _check_X_y(self, X, y, accept_sparse=None):
-        # store the columns name to reconstruct a dataframe
-        self._columns = X.columns if hasattr(X, "loc") else None
+        if hasattr(X, "loc"):
+            # store information to build dataframe
+            self._X_columns = X.columns
+            self._X_dtypes = X.dtypes
+        else:
+            self._X_columns = None
+            self._X_dtypes = None
+
+        if hasattr(y, "loc"):
+            # store information to build a series
+            self._y_name = y.name
+            self._y_dtype = y.dtype
+        else:
+            self._y_name = None
+            self._y_dtype = None
+
         if accept_sparse is None:
             accept_sparse = ["csr", "csc"]
         y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
@@ -263,20 +285,31 @@ def fit_resample(self, X, y):
 
         output = self._fit_resample(X, y)
 
-        if self._columns is not None:
-            import pandas as pd
-            X_ = pd.DataFrame(output[0], columns=self._columns)
+        if self.validate:
+            if self._X_columns is not None or self._y_name is not None:
+                import pandas as pd
+
+            if self._X_columns is not None:
+                X_ = pd.DataFrame(output[0], columns=self._X_columns)
+                X_ = X_.astype(self._X_dtypes)
+            else:
+                X_ = output[0]
+
+            y_ = (label_binarize(output[1], np.unique(y))
+                if binarize_y else output[1])
+
+            if self._y_name is not None:
+                y_ = pd.Series(y_, dtype=self._y_dtype, name=self._y_name)
         else:
-            X_ = output[0]
+            X_, y_ = output[0], output[1]
 
-        if self.validate and binarize_y:
-            y_sampled = label_binarize(output[1], np.unique(y))
+        if binarize_y:
             if len(output) == 2:
-                return X_, y_sampled
-            return X_, y_sampled, output[2]
+                return X_, y_
+            return X_, y_, output[2]
         if len(output) == 2:
-            return X_, output[1]
-        return X_, output[1], output[2]
+            return X_, y_
+        return X_, y_, output[2]
 
     def _fit_resample(self, X, y):
         func = _identity if self.func is None else self.func
diff --git a/imblearn/over_sampling/_random_over_sampler.py b/imblearn/over_sampling/_random_over_sampler.py
@@ -75,8 +75,22 @@ def __init__(self, sampling_strategy="auto", random_state=None):
         self.random_state = random_state
 
     def _check_X_y(self, X, y):
-        # store the columns name to reconstruct a dataframe
-        self._columns = X.columns if hasattr(X, "loc") else None
+        if hasattr(X, "loc"):
+            # store information to build dataframe
+            self._X_columns = X.columns
+            self._X_dtypes = X.dtypes
+        else:
+            self._X_columns = None
+            self._X_dtypes = None
+
+        if hasattr(y, "loc"):
+            # store information to build a series
+            self._y_name = y.name
+            self._y_dtype = y.dtype
+        else:
+            self._y_name = None
+            self._y_dtype = None
+
         y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
         X = check_array(X, accept_sparse=["csr", "csc"], dtype=None,
                         force_all_finite=False)
diff --git a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py
@@ -81,8 +81,22 @@ def __init__(
         self.replacement = replacement
 
     def _check_X_y(self, X, y):
-        # store the columns name to reconstruct a dataframe
-        self._columns = X.columns if hasattr(X, "loc") else None
+        if hasattr(X, "loc"):
+            # store information to build dataframe
+            self._X_columns = X.columns
+            self._X_dtypes = X.dtypes
+        else:
+            self._X_columns = None
+            self._X_dtypes = None
+
+        if hasattr(y, "loc"):
+            # store information to build a series
+            self._y_name = y.name
+            self._y_dtype = y.dtype
+        else:
+            self._y_name = None
+            self._y_dtype = None
+
         y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
         X = check_array(X, accept_sparse=["csr", "csc"], dtype=None,
                         force_all_finite=False)
diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py
@@ -242,6 +242,7 @@ def check_samplers_pandas(name, Sampler):
         random_state=0,
     )
     X_pd = pd.DataFrame(X, columns=[str(i) for i in range(X.shape[1])])
+    y_pd = pd.Series(y, name="class")
     sampler = Sampler()
     if isinstance(Sampler(), NearMiss):
         samplers = [Sampler(version=version) for version in (1, 2, 3)]
@@ -251,14 +252,16 @@ def check_samplers_pandas(name, Sampler):
 
     for sampler in samplers:
         set_random_state(sampler)
-        X_res_pd, y_res_pd = sampler.fit_resample(X_pd, y)
+        X_res_pd, y_res_pd = sampler.fit_resample(X_pd, y_pd)
         X_res, y_res = sampler.fit_resample(X, y)
 
         # check that we return a pandas dataframe if a dataframe was given in
         assert isinstance(X_res_pd, pd.DataFrame)
+        assert isinstance(y_res_pd, pd.Series)
         assert X_pd.columns.to_list() == X_res_pd.columns.to_list()
+        assert y_pd.name == y_res_pd.name
         assert_allclose(X_res_pd.to_numpy(), X_res)
-        assert_allclose(y_res_pd, y_res)
+        assert_allclose(y_res_pd.to_numpy(), y_res)
 
 
 def check_samplers_multiclass_ova(name, Sampler):