From ee7a42e3b3ffe1eddb78744a841931051ced4cfa Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Thu, 23 Aug 2018 12:22:37 +0200
Subject: [PATCH 1/7] ENH: random sampler can sample from heterogeneous data

---
 doc/over_sampling.rst                         | 16 ++++++++++
 doc/under_sampling.rst                        | 13 +++++++++
 doc/whats_new/v0.0.4.rst                      |  5 ++++
 imblearn/base.py                              | 29 +++++++++++--------
 imblearn/combine/smote_enn.py                 | 12 ++------
 imblearn/combine/smote_tomek.py               | 12 ++------
 imblearn/ensemble/base.py                     |  2 +-
 imblearn/over_sampling/random_over_sampler.py |  9 +++++-
 .../tests/test_random_over_sampler.py         | 14 +++++++++
 .../random_under_sampler.py                   | 10 ++++++-
 .../tests/test_random_under_sampler.py        | 13 ++++++++-
 imblearn/utils/validation.py                  |  9 ++++--
 12 files changed, 106 insertions(+), 38 deletions(-)
diff --git a/doc/over_sampling.rst b/doc/over_sampling.rst
index a100750ee..296e44e4b 100644
--- a/doc/over_sampling.rst
+++ b/doc/over_sampling.rst
@@ -52,6 +52,22 @@ As a result, the majority class does not take over the other classes during the
 training process. Consequently, all classes are represented by the decision
 function.
 
+In addition, :class:`RandomOverSampler` allows sampling heterogeneous data
+(e.g. containing some strings)::
+
+  >>> import numpy as np
+  >>> X_hetero = np.array([['xxx', 1, 1.0], ['yyy', 2, 2.0], ['zzz', 3, 3.0]],
+  ...                     dtype=np.object)
+  >>> y_hetero = np.array([0, 0, 1])
+  >>> X_resampled, y_resampled = ros.fit_sample(X_hetero, y_hetero)
+  >>> print(X_resampled)
+  [['xxx' 1 1.0]
+   ['yyy' 2 2.0]
+   ['zzz' 3 3.0]
+   ['zzz' 3 3.0]]
+  >>> print(y_resampled)
+  [0 0 1 1]
+
 See :ref:`sphx_glr_auto_examples_over-sampling_plot_random_over_sampling.py`
 for usage example.
 
diff --git a/doc/under_sampling.rst b/doc/under_sampling.rst
index f2412528e..2582b7e6d 100644
--- a/doc/under_sampling.rst
+++ b/doc/under_sampling.rst
@@ -103,6 +103,19 @@ by considering independently each targeted class::
   >>> print(np.vstack({tuple(row) for row in X_resampled}).shape)
   (181, 2)
 
+In addition, :class:`RandomUnderSampler` allows sampling heterogeneous data
+(e.g. containing some strings)::
+
+  >>> X_hetero = np.array([['xxx', 1, 1.0], ['yyy', 2, 2.0], ['zzz', 3, 3.0]],
+  ...                     dtype=np.object)
+  >>> y = np.array([0, 0, 1])
+  >>> X_resampled, y_resampled = rus.fit_sample(X_hetero, y)
+  >>> print(X_resampled)
+  [['xxx' 1 1.0]
+   ['zzz' 3 3.0]]
+  >>> print(y_resampled)
+  [0 1]
+
 See :ref:`sphx_glr_auto_examples_plot_sampling_strategy_usage.py`.,
 :ref:`sphx_glr_auto_examples_under-sampling_plot_comparison_under_sampling.py`,
 and :ref:`sphx_glr_auto_examples_under-sampling_plot_random_under_sampler.py`.
diff --git a/doc/whats_new/v0.0.4.rst b/doc/whats_new/v0.0.4.rst
index 4e9858f48..90bbe629c 100644
--- a/doc/whats_new/v0.0.4.rst
+++ b/doc/whats_new/v0.0.4.rst
@@ -45,6 +45,11 @@ Enhancement
   :issue:`439` by :user:`Hugo Gascon<hgascon>` and
   :user:`Guillaume Lemaitre <glemaitre>`.
 
+- Allow :class:`imblearn.under_sampling.RandomUnderSampler` and
+  :class:`imblearn.over_sampling.RandomOverSampler` to sample object arrays
+  containing strings.
+  :issue:`448` by :user:`Guillaume Lemaitre <glemaitre>`.
+
 Bug fixes
 .........
 
diff --git a/imblearn/base.py b/imblearn/base.py
index dbfe08070..eb2800b01 100644
--- a/imblearn/base.py
+++ b/imblearn/base.py
@@ -31,13 +31,6 @@ class SamplerMixin(six.with_metaclass(ABCMeta, BaseEstimator)):
 
     _estimator_type = 'sampler'
 
-    def _check_X_y(self, X, y):
-        """Private function to check that the X and y in fitting are the same
-        than in sampling."""
-        X_hash, y_hash = hash_X_y(X, y)
-        if self.X_hash_ != X_hash or self.y_hash_ != y_hash:
-            raise RuntimeError("X and y need to be same array earlier fitted.")
-
     def sample(self, X, y):
         """Resample the dataset.
 
@@ -60,11 +53,10 @@ def sample(self, X, y):
 
         """
         # Check the consistency of X and y
-        y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
-        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
+        X, y, binarize_y = self._check_X_y(X, y)
 
         check_is_fitted(self, 'sampling_strategy_')
-        self._check_X_y(X, y)
+        self._check_X_y_hash(X, y)
 
         output = self._sample(X, y)
 
@@ -151,6 +143,19 @@ def __init__(self, sampling_strategy='auto', ratio=None):
         self.ratio = ratio
         self.logger = logging.getLogger(self.__module__)
 
+    @staticmethod
+    def _check_X_y(X, y):
+        y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
+        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
+        return X, y, binarize_y
+
+    def _check_X_y_hash(self, X, y):
+        """Private function to check that the X and y in fitting are the same
+        than in sampling."""
+        X_hash, y_hash = hash_X_y(X, y)
+        if self.X_hash_ != X_hash or self.y_hash_ != y_hash:
+            raise RuntimeError("X and y need to be same array earlier fitted.")
+
     @property
     def ratio_(self):
         # FIXME: remove in 0.6
@@ -183,9 +188,9 @@ def fit(self, X, y):
 
         """
         self._deprecate_ratio()
-        y = check_target_type(y)
-        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
+        X, y, _ = self._check_X_y(X, y)
         self.X_hash_, self.y_hash_ = hash_X_y(X, y)
+        # _sampling_type is defined in the children base class
         self.sampling_strategy_ = check_sampling_strategy(
             self.sampling_strategy, y, self._sampling_type)
 
diff --git a/imblearn/combine/smote_enn.py b/imblearn/combine/smote_enn.py
index 21d8a0a57..942acec54 100644
--- a/imblearn/combine/smote_enn.py
+++ b/imblearn/combine/smote_enn.py
@@ -12,7 +12,7 @@
 from sklearn.base import clone
 from sklearn.utils import check_X_y
 
-from ..base import SamplerMixin
+from ..base import BaseSampler
 from ..over_sampling import SMOTE
 from ..over_sampling.base import BaseOverSampler
 from ..under_sampling import EditedNearestNeighbours
@@ -24,7 +24,7 @@
 @Substitution(
     sampling_strategy=BaseOverSampler._sampling_strategy_docstring,
     random_state=_random_state_docstring)
-class SMOTEENN(SamplerMixin):
+class SMOTEENN(BaseSampler):
     """Class to perform over-sampling using SMOTE and cleaning using ENN.
 
     Combine over- and under-sampling using SMOTE and Edited Nearest Neighbours.
@@ -125,14 +125,6 @@ def _validate_estimator(self):
         else:
             self.enn_ = EditedNearestNeighbours(sampling_strategy='all')
 
-    @property
-    def ratio_(self):
-        # FIXME: remove in 0.6
-        warnings.warn("'ratio' and 'ratio_' are deprecated. Use "
-                      "'sampling_strategy' and 'sampling_strategy_' instead.",
-                      DeprecationWarning)
-        return self.sampling_strategy_
-
     def fit(self, X, y):
         """Find the classes statistics before to perform sampling.
 
diff --git a/imblearn/combine/smote_tomek.py b/imblearn/combine/smote_tomek.py
index 99001e814..0a53cdf2a 100644
--- a/imblearn/combine/smote_tomek.py
+++ b/imblearn/combine/smote_tomek.py
@@ -13,7 +13,7 @@
 from sklearn.base import clone
 from sklearn.utils import check_X_y
 
-from ..base import SamplerMixin
+from ..base import BaseSampler
 from ..over_sampling import SMOTE
 from ..over_sampling.base import BaseOverSampler
 from ..under_sampling import TomekLinks
@@ -25,7 +25,7 @@
 @Substitution(
     sampling_strategy=BaseOverSampler._sampling_strategy_docstring,
     random_state=_random_state_docstring)
-class SMOTETomek(SamplerMixin):
+class SMOTETomek(BaseSampler):
     """Class to perform over-sampling using SMOTE and cleaning using
     Tomek links.
 
@@ -133,14 +133,6 @@ def _validate_estimator(self):
         else:
             self.tomek_ = TomekLinks(sampling_strategy='all')
 
-    @property
-    def ratio_(self):
-        # FIXME: remove in 0.6
-        warnings.warn("'ratio' and 'ratio_' are deprecated. Use "
-                      "'sampling_strategy' and 'sampling_strategy_' instead.",
-                      DeprecationWarning)
-        return self.sampling_strategy_
-
     def fit(self, X, y):
         """Find the classes statistics before to perform sampling.
 
diff --git a/imblearn/ensemble/base.py b/imblearn/ensemble/base.py
index 5e24c5d56..ed012d9db 100644
--- a/imblearn/ensemble/base.py
+++ b/imblearn/ensemble/base.py
@@ -60,7 +60,7 @@ def sample(self, X, y):
         X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
 
         check_is_fitted(self, 'sampling_strategy_')
-        self._check_X_y(X, y)
+        self._check_X_y_hash(X, y)
 
         output = self._sample(X, y)
 
diff --git a/imblearn/over_sampling/random_over_sampler.py b/imblearn/over_sampling/random_over_sampler.py
index 35181e387..d0ead9479 100644
--- a/imblearn/over_sampling/random_over_sampler.py
+++ b/imblearn/over_sampling/random_over_sampler.py
@@ -8,9 +8,10 @@
 from collections import Counter
 
 import numpy as np
-from sklearn.utils import check_random_state, safe_indexing
+from sklearn.utils import check_X_y, check_random_state, safe_indexing
 
 from .base import BaseOverSampler
+from ..utils import check_target_type
 from ..utils import Substitution
 from ..utils._docstring import _random_state_docstring
 
@@ -79,6 +80,12 @@ def __init__(self, sampling_strategy='auto',
         self.return_indices = return_indices
         self.random_state = random_state
 
+    @staticmethod
+    def _check_X_y(X, y):
+        y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
+        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'], dtype=None)
+        return X, y, binarize_y
+
     def _sample(self, X, y):
         """Resample the dataset.
 
diff --git a/imblearn/over_sampling/tests/test_random_over_sampler.py b/imblearn/over_sampling/tests/test_random_over_sampler.py
index 6b7ed686c..494719601 100644
--- a/imblearn/over_sampling/tests/test_random_over_sampler.py
+++ b/imblearn/over_sampling/tests/test_random_over_sampler.py
@@ -7,6 +7,7 @@
 
 from collections import Counter
 
+import pytest
 import numpy as np
 from sklearn.utils.testing import assert_allclose
 from sklearn.utils.testing import assert_array_equal
@@ -88,3 +89,16 @@ def test_multiclass_fit_sample():
     assert count_y_res[0] == 5
     assert count_y_res[1] == 5
     assert count_y_res[2] == 5
+
+
+def test_random_over_sampling_heterogeneous_data():
+    X_hetero = np.array([['xxx', 1, 1.0], ['yyy', 2, 2.0], ['zzz', 3, 3.0]],
+                        dtype=np.object)
+    y = np.array([0, 0, 1])
+    ros = RandomOverSampler(random_state=RND_SEED)
+    X_res, y_res = ros.fit_sample(X_hetero, y)
+
+    assert X_res.shape[0] == 4
+    assert y_res.shape[0] == 4
+    assert X_res.dtype == object
+    assert X_res[-1, 0] in X_hetero[:, 0]
\ No newline at end of file
diff --git a/imblearn/under_sampling/prototype_selection/random_under_sampler.py b/imblearn/under_sampling/prototype_selection/random_under_sampler.py
index ee7c303e0..24f7b84c1 100644
--- a/imblearn/under_sampling/prototype_selection/random_under_sampler.py
+++ b/imblearn/under_sampling/prototype_selection/random_under_sampler.py
@@ -7,9 +7,11 @@
 from __future__ import division
 
 import numpy as np
-from sklearn.utils import check_random_state, safe_indexing
+
+from sklearn.utils import check_X_y, check_random_state, safe_indexing
 
 from ..base import BaseUnderSampler
+from ...utils import check_target_type
 from ...utils import Substitution
 from ...utils._docstring import _random_state_docstring
 
@@ -82,6 +84,12 @@ def __init__(self,
         self.return_indices = return_indices
         self.replacement = replacement
 
+    @staticmethod
+    def _check_X_y(X, y):
+        y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
+        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'], dtype=None)
+        return X, y, binarize_y
+
     def _sample(self, X, y):
         """Resample the dataset.
 
diff --git a/imblearn/under_sampling/prototype_selection/tests/test_random_under_sampler.py b/imblearn/under_sampling/prototype_selection/tests/test_random_under_sampler.py
index 962cd12fb..e36dce79e 100644
--- a/imblearn/under_sampling/prototype_selection/tests/test_random_under_sampler.py
+++ b/imblearn/under_sampling/prototype_selection/tests/test_random_under_sampler.py
@@ -63,7 +63,6 @@ def test_rus_fit_sample_half():
                      [0.15490546, 0.3130677], [0.20792588, 1.49407907],
                      [0.15490546, 0.3130677], [0.12372842, 0.6536186]])
     y_gt = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1])
-    print(X_resampled)
     assert_array_equal(X_resampled, X_gt)
     assert_array_equal(y_resampled, y_gt)
 
@@ -78,3 +77,15 @@ def test_multiclass_fit_sample():
     assert count_y_res[0] == 2
     assert count_y_res[1] == 2
     assert count_y_res[2] == 2
+
+
+def test_random_under_sampling_heterogeneous_data():
+    X_hetero = np.array([['xxx', 1, 1.0], ['yyy', 2, 2.0], ['zzz', 3, 3.0]],
+                        dtype=np.object)
+    y = np.array([0, 0, 1])
+    rus = RandomUnderSampler(random_state=RND_SEED)
+    X_res, y_res = rus.fit_sample(X_hetero, y)
+
+    assert X_res.shape[0] == 2
+    assert y_res.shape[0] == 2
+    assert X_res.dtype == object
\ No newline at end of file
diff --git a/imblearn/utils/validation.py b/imblearn/utils/validation.py
index 1c5dc1a08..89749a6f5 100644
--- a/imblearn/utils/validation.py
+++ b/imblearn/utils/validation.py
@@ -100,7 +100,7 @@ def hash_X_y(X, y, n_samples=10, n_features=5):
 
     Parameters
     ----------
-    X : ndarray, shape (n_samples, n_features)
+    X : array_like, shape (n_samples, n_features)
         The ``X`` array.
 
     y : ndarray, shape (n_samples)
@@ -122,7 +122,12 @@ def hash_X_y(X, y, n_samples=10, n_features=5):
     row_idx = slice(None, None, max(1, X.shape[0] // n_samples))
     col_idx = slice(None, None, max(1, X.shape[1] // n_features))
 
-    return joblib.hash(X[row_idx, col_idx]), joblib.hash(y[row_idx])
+    if hasattr(X, 'iloc'):
+        X_hash = joblib.hash(X.iloc[row_idx, col_idx])
+    else:
+        X_hash = joblib.hash(X[row_idx, col_idx])
+
+    return X_hash, joblib.hash(y[row_idx])
 
 
 def _sampling_strategy_all(y, sampling_type):

From f5ea4ae9f9fe8dc62bac95d9bea1de3f7690fbd1 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Thu, 23 Aug 2018 12:24:08 +0200
Subject: [PATCH 2/7] PEP8

---
 imblearn/over_sampling/tests/test_random_over_sampler.py        | 2 +-
 .../prototype_selection/tests/test_random_under_sampler.py      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/imblearn/over_sampling/tests/test_random_over_sampler.py b/imblearn/over_sampling/tests/test_random_over_sampler.py
index 494719601..687941097 100644
--- a/imblearn/over_sampling/tests/test_random_over_sampler.py
+++ b/imblearn/over_sampling/tests/test_random_over_sampler.py
@@ -101,4 +101,4 @@ def test_random_over_sampling_heterogeneous_data():
     assert X_res.shape[0] == 4
     assert y_res.shape[0] == 4
     assert X_res.dtype == object
-    assert X_res[-1, 0] in X_hetero[:, 0]
\ No newline at end of file
+    assert X_res[-1, 0] in X_hetero[:, 0]
diff --git a/imblearn/under_sampling/prototype_selection/tests/test_random_under_sampler.py b/imblearn/under_sampling/prototype_selection/tests/test_random_under_sampler.py
index e36dce79e..109bf0235 100644
--- a/imblearn/under_sampling/prototype_selection/tests/test_random_under_sampler.py
+++ b/imblearn/under_sampling/prototype_selection/tests/test_random_under_sampler.py
@@ -88,4 +88,4 @@ def test_random_under_sampling_heterogeneous_data():
 
     assert X_res.shape[0] == 2
     assert y_res.shape[0] == 2
-    assert X_res.dtype == object
\ No newline at end of file
+    assert X_res.dtype == object

From 8bbf0a7648ac18b9f85973ed0e2deefb1e3c2aca Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Thu, 23 Aug 2018 14:01:10 +0200
Subject: [PATCH 3/7] monkey patch the check_dtype_object from sklearn

---
 imblearn/utils/estimator_checks.py | 34 +++++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py
index 3bb52d46d..1c87d2182 100644
--- a/imblearn/utils/estimator_checks.py
+++ b/imblearn/utils/estimator_checks.py
@@ -16,6 +16,7 @@
 import numpy as np
 from scipy import sparse
 
+from sklearn.base import clone
 from sklearn.datasets import make_classification
 from sklearn.cluster import KMeans
 from sklearn.preprocessing import label_binarize
@@ -35,6 +36,33 @@
 from imblearn.utils.testing import warns
 
 DONT_SUPPORT_RATIO = ['SVMSMOTE', 'BorderlineSMOTE']
+SUPPORT_STRING = ['RandomUnderSampler', 'RandomOverSampler']
+
+
+def monkey_patch_check_dtype_object(name, estimator_orig):
+    # check that estimators treat dtype object as numeric if possible
+    rng = np.random.RandomState(0)
+    X = rng.rand(40, 10).astype(object)
+    y = np.array([0] * 10 + [1] * 30, dtype=np.int)
+    estimator = clone(estimator_orig)
+
+    estimator.fit(X, y)
+    if hasattr(estimator, "sample"):
+        estimator.sample(X, y)
+
+    try:
+        estimator.fit(X, y.astype(object))
+    except Exception as e:
+        if "Unknown label type" not in str(e):
+            raise
+
+    if name not in SUPPORT_STRING:
+        X[0, 0] = {'foo': 'bar'}
+        msg = "argument must be a string or a number"
+        with pytest.raises(TypeError, match=msg):
+            estimator.fit(X, y)
+    else:
+        estimator.fit(X, y)
 
 
 def _yield_sampler_checks(name, Estimator):
@@ -73,7 +101,11 @@ def check_estimator(Estimator):
         Class to check. Estimator is a class object (not an instance).
     """
     name = Estimator.__name__
-    # test scikit-learn compatibility
+    # monkey patch check_dtype_object for the sampler allowing strings
+    import sklearn.utils.estimator_checks
+    sklearn.utils.estimator_checks.check_dtype_object = \
+        monkey_patch_check_dtype_object
+    # scikit-learn common tests
     sklearn_check_estimator(Estimator)
     check_parameters_default_constructible(name, Estimator)
     for check in _yield_all_checks(name, Estimator):

From 5886b7ed8ea7d30105c887a02d5f819f06b2f663 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Thu, 23 Aug 2018 14:25:06 +0200
Subject: [PATCH 4/7] iter

---
 imblearn/over_sampling/tests/test_random_over_sampler.py | 1 -
 imblearn/utils/estimator_checks.py                       | 4 ++--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/imblearn/over_sampling/tests/test_random_over_sampler.py b/imblearn/over_sampling/tests/test_random_over_sampler.py
index 687941097..c9bd37a42 100644
--- a/imblearn/over_sampling/tests/test_random_over_sampler.py
+++ b/imblearn/over_sampling/tests/test_random_over_sampler.py
@@ -7,7 +7,6 @@
 
 from collections import Counter
 
-import pytest
 import numpy as np
 from sklearn.utils.testing import assert_allclose
 from sklearn.utils.testing import assert_array_equal
diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py
index 1c87d2182..5504ccfe3 100644
--- a/imblearn/utils/estimator_checks.py
+++ b/imblearn/utils/estimator_checks.py
@@ -24,6 +24,7 @@
     as sklearn_check_estimator, check_parameters_default_constructible
 from sklearn.exceptions import NotFittedError
 from sklearn.utils.testing import assert_allclose
+from sklearn.utils.testing import assert_raises_regex
 from sklearn.utils.testing import set_random_state
 from sklearn.utils.multiclass import type_of_target
 
@@ -59,8 +60,7 @@ def monkey_patch_check_dtype_object(name, estimator_orig):
     if name not in SUPPORT_STRING:
         X[0, 0] = {'foo': 'bar'}
         msg = "argument must be a string or a number"
-        with pytest.raises(TypeError, match=msg):
-            estimator.fit(X, y)
+        assert_raises_regex(TypeError, msg, estimator.fit, X, y)
     else:
         estimator.fit(X, y)
 

From 3c6bf7a87ae3bdbe91e1e06adc7d683930c552c9 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Thu, 23 Aug 2018 14:42:13 +0200
Subject: [PATCH 5/7] fix doc

---
 doc/under_sampling.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/under_sampling.rst b/doc/under_sampling.rst
index 2582b7e6d..a45375c4b 100644
--- a/doc/under_sampling.rst
+++ b/doc/under_sampling.rst
@@ -108,8 +108,8 @@ In addition, :class:`RandomUnderSampler` allows to sample heterogeneous data
 
   >>> X_hetero = np.array([['xxx', 1, 1.0], ['yyy', 2, 2.0], ['zzz', 3, 3.0]],
   ...                     dtype=np.object)
-  >>> y = np.array([0, 0, 1])
-  >>> X_resampled, y_resampled = rus.fit_sample(X_hetero, y)
+  >>> y_hetero = np.array([0, 0, 1])
+  >>> X_resampled, y_resampled = rus.fit_sample(X_hetero, y_hetero)
   >>> print(X_resampled)
   [['xxx' 1 1.0]
    ['zzz' 3 3.0]]

From 8d3fd3ccaf12f90e83640f6cc9cc479416a51a61 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Thu, 23 Aug 2018 14:44:10 +0200
Subject: [PATCH 6/7] improve documentation

---
 imblearn/over_sampling/random_over_sampler.py                   | 2 ++
 .../under_sampling/prototype_selection/random_under_sampler.py  | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/imblearn/over_sampling/random_over_sampler.py b/imblearn/over_sampling/random_over_sampler.py
index d0ead9479..73cca1c66 100644
--- a/imblearn/over_sampling/random_over_sampler.py
+++ b/imblearn/over_sampling/random_over_sampler.py
@@ -45,6 +45,8 @@ class RandomOverSampler(BaseOverSampler):
     Notes
     -----
     Supports multi-class resampling by sampling each class independently.
+    Supports heterogeneous data as object array containing string and numeric
+    data.
 
     See
     :ref:`sphx_glr_auto_examples_over-sampling_plot_comparison_over_sampling.py`,
diff --git a/imblearn/under_sampling/prototype_selection/random_under_sampler.py b/imblearn/under_sampling/prototype_selection/random_under_sampler.py
index 24f7b84c1..3b3c7691d 100644
--- a/imblearn/under_sampling/prototype_selection/random_under_sampler.py
+++ b/imblearn/under_sampling/prototype_selection/random_under_sampler.py
@@ -48,6 +48,8 @@ class RandomUnderSampler(BaseUnderSampler):
     Notes
     -----
     Supports multi-class resampling by sampling each class independently.
+    Supports heterogeneous data as object array containing string and numeric
+    data.
 
     See
     :ref:`sphx_glr_auto_examples_plot_sampling_strategy_usage.py` and

From 5539a72b7810292faa4187fe5cc6e5487a71d810 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Thu, 23 Aug 2018 14:57:47 +0200
Subject: [PATCH 7/7] additional tests

---
 imblearn/utils/tests/test_validation.py | 14 ++++++++++++++
 imblearn/utils/validation.py            | 10 +++++-----
 2 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/imblearn/utils/tests/test_validation.py b/imblearn/utils/tests/test_validation.py
index 99424d12b..b09b3b03c 100644
--- a/imblearn/utils/tests/test_validation.py
+++ b/imblearn/utils/tests/test_validation.py
@@ -375,6 +375,20 @@ def test_hash_X_y():
     assert hash_X_y(X, y) == (joblib.hash(X), joblib.hash(y))
 
 
+def test_hash_X_y_pandas():
+    pd = pytest.importorskip("pandas")
+    rng = check_random_state(0)
+    X = pd.DataFrame(rng.randn(2000, 20))
+    y = pd.Series([0] * 500 + [1] * 1500)
+    assert hash_X_y(X, y, 10, 10) == (joblib.hash(X.iloc[::200, ::2]),
+                                      joblib.hash(y.iloc[::200]))
+
+    X = pd.DataFrame(rng.randn(5, 2))
+    y = pd.Series([0] * 2 + [1] * 3)
+    # all data will be used in this case
+    assert hash_X_y(X, y) == (joblib.hash(X), joblib.hash(y))
+
+
 @pytest.mark.parametrize(
     "sampling_strategy, sampling_type, expected_result",
     [({3: 25, 1: 25, 2: 25}, 'under-sampling',
diff --git a/imblearn/utils/validation.py b/imblearn/utils/validation.py
index 89749a6f5..bc6d9ecc9 100644
--- a/imblearn/utils/validation.py
+++ b/imblearn/utils/validation.py
@@ -122,12 +122,12 @@ def hash_X_y(X, y, n_samples=10, n_features=5):
     row_idx = slice(None, None, max(1, X.shape[0] // n_samples))
     col_idx = slice(None, None, max(1, X.shape[1] // n_features))
 
-    if hasattr(X, 'iloc'):
-        X_hash = joblib.hash(X.iloc[row_idx, col_idx])
-    else:
-        X_hash = joblib.hash(X[row_idx, col_idx])
+    X_subset = (X.iloc[row_idx, col_idx]
+                if hasattr(X, 'iloc') else X[row_idx, col_idx])
+    y_subset = (y.iloc[row_idx]
+                if hasattr(y, 'iloc') else y[row_idx])
 
-    return X_hash, joblib.hash(y[row_idx])
+    return joblib.hash(X_subset), joblib.hash(y_subset)
 
 
 def _sampling_strategy_all(y, sampling_type):