ENH: random samplers can sample from heterogeneous data (#451)

glemaitre · web-flow · commit 6916fe960005 · 2018-08-23T15:08:40.000+02:00
diff --git a/doc/over_sampling.rst b/doc/over_sampling.rst
@@ -52,6 +52,22 @@ As a result, the majority class does not take over the other classes during the
 training process. Consequently, all classes are represented by the decision
 function.
 
+In addition, :class:`RandomOverSampler` allows sampling heterogeneous data
+(e.g. data containing some strings)::
+
+  >>> import numpy as np
+  >>> X_hetero = np.array([['xxx', 1, 1.0], ['yyy', 2, 2.0], ['zzz', 3, 3.0]],
+  ...                     dtype=np.object)
+  >>> y_hetero = np.array([0, 0, 1])
+  >>> X_resampled, y_resampled = ros.fit_sample(X_hetero, y_hetero)
+  >>> print(X_resampled)
+  [['xxx' 1 1.0]
+   ['yyy' 2 2.0]
+   ['zzz' 3 3.0]
+   ['zzz' 3 3.0]]
+  >>> print(y_resampled)
+  [0 0 1 1]
+
 See :ref:`sphx_glr_auto_examples_over-sampling_plot_random_over_sampling.py`
 for usage example.
 
diff --git a/doc/under_sampling.rst b/doc/under_sampling.rst
@@ -103,6 +103,19 @@ by considering independently each targeted class::
   >>> print(np.vstack({tuple(row) for row in X_resampled}).shape)
   (181, 2)
 
+In addition, :class:`RandomUnderSampler` allows sampling heterogeneous data
+(e.g. data containing some strings)::
+
+  >>> X_hetero = np.array([['xxx', 1, 1.0], ['yyy', 2, 2.0], ['zzz', 3, 3.0]],
+  ...                     dtype=np.object)
+  >>> y_hetero = np.array([0, 0, 1])
+  >>> X_resampled, y_resampled = rus.fit_sample(X_hetero, y_hetero)
+  >>> print(X_resampled)
+  [['xxx' 1 1.0]
+   ['zzz' 3 3.0]]
+  >>> print(y_resampled)
+  [0 1]
+
 See :ref:`sphx_glr_auto_examples_plot_sampling_strategy_usage.py`.,
 :ref:`sphx_glr_auto_examples_under-sampling_plot_comparison_under_sampling.py`,
 and :ref:`sphx_glr_auto_examples_under-sampling_plot_random_under_sampler.py`.
diff --git a/doc/whats_new/v0.0.4.rst b/doc/whats_new/v0.0.4.rst
@@ -45,6 +45,11 @@ Enhancement
   :issue:`439` by :user:`Hugo Gascon<hgascon>` and
   :user:`Guillaume Lemaitre <glemaitre>`.
 
+- Allow :class:`imblearn.under_sampling.RandomUnderSampler` and
+  :class:`imblearn.over_sampling.RandomOverSampler` to sample object arrays
+  containing strings.
+  :issue:`448` by :user:`Guillaume Lemaitre <glemaitre>`.
+
 Bug fixes
 .........
 
diff --git a/imblearn/base.py b/imblearn/base.py
@@ -31,13 +31,6 @@ class SamplerMixin(six.with_metaclass(ABCMeta, BaseEstimator)):
 
     _estimator_type = 'sampler'
 
-    def _check_X_y(self, X, y):
-        """Private function to check that the X and y in fitting are the same
-        than in sampling."""
-        X_hash, y_hash = hash_X_y(X, y)
-        if self.X_hash_ != X_hash or self.y_hash_ != y_hash:
-            raise RuntimeError("X and y need to be same array earlier fitted.")
-
     def sample(self, X, y):
         """Resample the dataset.
 
@@ -60,11 +53,10 @@ def sample(self, X, y):
 
         """
         # Check the consistency of X and y
-        y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
-        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
+        X, y, binarize_y = self._check_X_y(X, y)
 
         check_is_fitted(self, 'sampling_strategy_')
-        self._check_X_y(X, y)
+        self._check_X_y_hash(X, y)
 
         output = self._sample(X, y)
 
@@ -151,6 +143,19 @@ def __init__(self, sampling_strategy='auto', ratio=None):
         self.ratio = ratio
         self.logger = logging.getLogger(self.__module__)
 
+    @staticmethod
+    def _check_X_y(X, y):
+        y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
+        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
+        return X, y, binarize_y
+
+    def _check_X_y_hash(self, X, y):
+        """Private function to check that the X and y used in fitting are the
+        same as those passed in sampling."""
+        X_hash, y_hash = hash_X_y(X, y)
+        if self.X_hash_ != X_hash or self.y_hash_ != y_hash:
+            raise RuntimeError("X and y need to be same array earlier fitted.")
+
     @property
     def ratio_(self):
         # FIXME: remove in 0.6
@@ -183,9 +188,9 @@ def fit(self, X, y):
 
         """
         self._deprecate_ratio()
-        y = check_target_type(y)
-        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
+        X, y, _ = self._check_X_y(X, y)
         self.X_hash_, self.y_hash_ = hash_X_y(X, y)
+        # _sampling_type is defined in the children base class
         self.sampling_strategy_ = check_sampling_strategy(
             self.sampling_strategy, y, self._sampling_type)
 
diff --git a/imblearn/combine/smote_enn.py b/imblearn/combine/smote_enn.py
@@ -12,7 +12,7 @@
 from sklearn.base import clone
 from sklearn.utils import check_X_y
 
-from ..base import SamplerMixin
+from ..base import BaseSampler
 from ..over_sampling import SMOTE
 from ..over_sampling.base import BaseOverSampler
 from ..under_sampling import EditedNearestNeighbours
@@ -24,7 +24,7 @@
 @Substitution(
     sampling_strategy=BaseOverSampler._sampling_strategy_docstring,
     random_state=_random_state_docstring)
-class SMOTEENN(SamplerMixin):
+class SMOTEENN(BaseSampler):
     """Class to perform over-sampling using SMOTE and cleaning using ENN.
 
     Combine over- and under-sampling using SMOTE and Edited Nearest Neighbours.
@@ -125,14 +125,6 @@ def _validate_estimator(self):
         else:
             self.enn_ = EditedNearestNeighbours(sampling_strategy='all')
 
-    @property
-    def ratio_(self):
-        # FIXME: remove in 0.6
-        warnings.warn("'ratio' and 'ratio_' are deprecated. Use "
-                      "'sampling_strategy' and 'sampling_strategy_' instead.",
-                      DeprecationWarning)
-        return self.sampling_strategy_
-
     def fit(self, X, y):
         """Find the classes statistics before to perform sampling.
 
diff --git a/imblearn/combine/smote_tomek.py b/imblearn/combine/smote_tomek.py
@@ -13,7 +13,7 @@
 from sklearn.base import clone
 from sklearn.utils import check_X_y
 
-from ..base import SamplerMixin
+from ..base import BaseSampler
 from ..over_sampling import SMOTE
 from ..over_sampling.base import BaseOverSampler
 from ..under_sampling import TomekLinks
@@ -25,7 +25,7 @@
 @Substitution(
     sampling_strategy=BaseOverSampler._sampling_strategy_docstring,
     random_state=_random_state_docstring)
-class SMOTETomek(SamplerMixin):
+class SMOTETomek(BaseSampler):
     """Class to perform over-sampling using SMOTE and cleaning using
     Tomek links.
 
@@ -133,14 +133,6 @@ def _validate_estimator(self):
         else:
             self.tomek_ = TomekLinks(sampling_strategy='all')
 
-    @property
-    def ratio_(self):
-        # FIXME: remove in 0.6
-        warnings.warn("'ratio' and 'ratio_' are deprecated. Use "
-                      "'sampling_strategy' and 'sampling_strategy_' instead.",
-                      DeprecationWarning)
-        return self.sampling_strategy_
-
     def fit(self, X, y):
         """Find the classes statistics before to perform sampling.
 
diff --git a/imblearn/ensemble/base.py b/imblearn/ensemble/base.py
@@ -60,7 +60,7 @@ def sample(self, X, y):
         X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
 
         check_is_fitted(self, 'sampling_strategy_')
-        self._check_X_y(X, y)
+        self._check_X_y_hash(X, y)
 
         output = self._sample(X, y)
 
diff --git a/imblearn/over_sampling/random_over_sampler.py b/imblearn/over_sampling/random_over_sampler.py
@@ -8,9 +8,10 @@
 from collections import Counter
 
 import numpy as np
-from sklearn.utils import check_random_state, safe_indexing
+from sklearn.utils import check_X_y, check_random_state, safe_indexing
 
 from .base import BaseOverSampler
+from ..utils import check_target_type
 from ..utils import Substitution
 from ..utils._docstring import _random_state_docstring
 
@@ -44,6 +45,8 @@ class RandomOverSampler(BaseOverSampler):
     Notes
     -----
     Supports multi-class resampling by sampling each class independently.
+    Supports heterogeneous data as an object array containing string and
+    numeric data.
 
     See
     :ref:`sphx_glr_auto_examples_over-sampling_plot_comparison_over_sampling.py`,
@@ -79,6 +82,12 @@ def __init__(self, sampling_strategy='auto',
         self.return_indices = return_indices
         self.random_state = random_state
 
+    @staticmethod
+    def _check_X_y(X, y):
+        y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
+        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'], dtype=None)
+        return X, y, binarize_y
+
     def _sample(self, X, y):
         """Resample the dataset.
 
diff --git a/imblearn/over_sampling/tests/test_random_over_sampler.py b/imblearn/over_sampling/tests/test_random_over_sampler.py
@@ -88,3 +88,16 @@ def test_multiclass_fit_sample():
     assert count_y_res[0] == 5
     assert count_y_res[1] == 5
     assert count_y_res[2] == 5
+
+
+def test_random_over_sampling_heterogeneous_data():
+    X_hetero = np.array([['xxx', 1, 1.0], ['yyy', 2, 2.0], ['zzz', 3, 3.0]],
+                        dtype=np.object)
+    y = np.array([0, 0, 1])
+    ros = RandomOverSampler(random_state=RND_SEED)
+    X_res, y_res = ros.fit_sample(X_hetero, y)
+
+    assert X_res.shape[0] == 4
+    assert y_res.shape[0] == 4
+    assert X_res.dtype == object
+    assert X_res[-1, 0] in X_hetero[:, 0]
diff --git a/imblearn/under_sampling/prototype_selection/random_under_sampler.py b/imblearn/under_sampling/prototype_selection/random_under_sampler.py
@@ -7,9 +7,11 @@
 from __future__ import division
 
 import numpy as np
-from sklearn.utils import check_random_state, safe_indexing
+
+from sklearn.utils import check_X_y, check_random_state, safe_indexing
 
 from ..base import BaseUnderSampler
+from ...utils import check_target_type
 from ...utils import Substitution
 from ...utils._docstring import _random_state_docstring
 
@@ -46,6 +48,8 @@ class RandomUnderSampler(BaseUnderSampler):
     Notes
     -----
     Supports multi-class resampling by sampling each class independently.
+    Supports heterogeneous data as an object array containing string and
+    numeric data.
 
     See
     :ref:`sphx_glr_auto_examples_plot_sampling_strategy_usage.py` and
@@ -82,6 +86,12 @@ def __init__(self,
         self.return_indices = return_indices
         self.replacement = replacement
 
+    @staticmethod
+    def _check_X_y(X, y):
+        y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
+        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'], dtype=None)
+        return X, y, binarize_y
+
     def _sample(self, X, y):
         """Resample the dataset.
 
diff --git a/imblearn/under_sampling/prototype_selection/tests/test_random_under_sampler.py b/imblearn/under_sampling/prototype_selection/tests/test_random_under_sampler.py
@@ -63,7 +63,6 @@ def test_rus_fit_sample_half():
                      [0.15490546, 0.3130677], [0.20792588, 1.49407907],
                      [0.15490546, 0.3130677], [0.12372842, 0.6536186]])
     y_gt = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1])
-    print(X_resampled)
     assert_array_equal(X_resampled, X_gt)
     assert_array_equal(y_resampled, y_gt)
 
@@ -78,3 +77,15 @@ def test_multiclass_fit_sample():
     assert count_y_res[0] == 2
     assert count_y_res[1] == 2
     assert count_y_res[2] == 2
+
+
+def test_random_under_sampling_heterogeneous_data():
+    X_hetero = np.array([['xxx', 1, 1.0], ['yyy', 2, 2.0], ['zzz', 3, 3.0]],
+                        dtype=np.object)
+    y = np.array([0, 0, 1])
+    rus = RandomUnderSampler(random_state=RND_SEED)
+    X_res, y_res = rus.fit_sample(X_hetero, y)
+
+    assert X_res.shape[0] == 2
+    assert y_res.shape[0] == 2
+    assert X_res.dtype == object
diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py
@@ -16,13 +16,15 @@
 import numpy as np
 from scipy import sparse
 
+from sklearn.base import clone
 from sklearn.datasets import make_classification
 from sklearn.cluster import KMeans
 from sklearn.preprocessing import label_binarize
 from sklearn.utils.estimator_checks import check_estimator \
     as sklearn_check_estimator, check_parameters_default_constructible
 from sklearn.exceptions import NotFittedError
 from sklearn.utils.testing import assert_allclose
+from sklearn.utils.testing import assert_raises_regex
 from sklearn.utils.testing import set_random_state
 from sklearn.utils.multiclass import type_of_target
 
@@ -35,6 +37,32 @@
 from imblearn.utils.testing import warns
 
 DONT_SUPPORT_RATIO = ['SVMSMOTE', 'BorderlineSMOTE']
+SUPPORT_STRING = ['RandomUnderSampler', 'RandomOverSampler']
+
+
+def monkey_patch_check_dtype_object(name, estimator_orig):
+    # check that estimators treat dtype object as numeric if possible
+    rng = np.random.RandomState(0)
+    X = rng.rand(40, 10).astype(object)
+    y = np.array([0] * 10 + [1] * 30, dtype=np.int)
+    estimator = clone(estimator_orig)
+
+    estimator.fit(X, y)
+    if hasattr(estimator, "sample"):
+        estimator.sample(X, y)
+
+    try:
+        estimator.fit(X, y.astype(object))
+    except Exception as e:
+        if "Unknown label type" not in str(e):
+            raise
+
+    if name not in SUPPORT_STRING:
+        X[0, 0] = {'foo': 'bar'}
+        msg = "argument must be a string or a number"
+        assert_raises_regex(TypeError, msg, estimator.fit, X, y)
+    else:
+        estimator.fit(X, y)
 
 
 def _yield_sampler_checks(name, Estimator):
@@ -74,7 +102,11 @@ def check_estimator(Estimator):
         Class to check. Estimator is a class object (not an instance).
     """
     name = Estimator.__name__
-    # test scikit-learn compatibility
+    # monkey patch check_dtype_object for the sampler allowing strings
+    import sklearn.utils.estimator_checks
+    sklearn.utils.estimator_checks.check_dtype_object = \
+        monkey_patch_check_dtype_object
+    # scikit-learn common tests
     sklearn_check_estimator(Estimator)
     check_parameters_default_constructible(name, Estimator)
     for check in _yield_all_checks(name, Estimator):
diff --git a/imblearn/utils/tests/test_validation.py b/imblearn/utils/tests/test_validation.py
@@ -375,6 +375,20 @@ def test_hash_X_y():
     assert hash_X_y(X, y) == (joblib.hash(X), joblib.hash(y))
 
 
+def test_hash_X_y_pandas():
+    pd = pytest.importorskip("pandas")
+    rng = check_random_state(0)
+    X = pd.DataFrame(rng.randn(2000, 20))
+    y = pd.Series([0] * 500 + [1] * 1500)
+    assert hash_X_y(X, y, 10, 10) == (joblib.hash(X.iloc[::200, ::2]),
+                                      joblib.hash(y.iloc[::200]))
+
+    X = pd.DataFrame(rng.randn(5, 2))
+    y = pd.Series([0] * 2 + [1] * 3)
+    # all data will be used in this case
+    assert hash_X_y(X, y) == (joblib.hash(X), joblib.hash(y))
+
+
 @pytest.mark.parametrize(
     "sampling_strategy, sampling_type, expected_result",
     [({3: 25, 1: 25, 2: 25}, 'under-sampling',
diff --git a/imblearn/utils/validation.py b/imblearn/utils/validation.py