From ee7a42e3b3ffe1eddb78744a841931051ced4cfa Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 23 Aug 2018 12:22:37 +0200 Subject: [PATCH 1/7] EHN: random sampler can sample from heterogeneous data --- doc/over_sampling.rst | 16 ++++++++++ doc/under_sampling.rst | 13 +++++++++ doc/whats_new/v0.0.4.rst | 5 ++++ imblearn/base.py | 29 +++++++++++-------- imblearn/combine/smote_enn.py | 12 ++------ imblearn/combine/smote_tomek.py | 12 ++------ imblearn/ensemble/base.py | 2 +- imblearn/over_sampling/random_over_sampler.py | 9 +++++- .../tests/test_random_over_sampler.py | 14 +++++++++ .../random_under_sampler.py | 10 ++++++- .../tests/test_random_under_sampler.py | 13 ++++++++- imblearn/utils/validation.py | 9 ++++-- 12 files changed, 106 insertions(+), 38 deletions(-) diff --git a/doc/over_sampling.rst b/doc/over_sampling.rst index a100750ee..296e44e4b 100644 --- a/doc/over_sampling.rst +++ b/doc/over_sampling.rst @@ -52,6 +52,22 @@ As a result, the majority class does not take over the other classes during the training process. Consequently, all classes are represented by the decision function. +In addition, :class:`RandomOverSampler` allows to sample heterogeneous data +(e.g. containing some strings):: + + >>> import numpy as np + >>> X_hetero = np.array([['xxx', 1, 1.0], ['yyy', 2, 2.0], ['zzz', 3, 3.0]], + ... dtype=np.object) + >>> y_hetero = np.array([0, 0, 1]) + >>> X_resampled, y_resampled = ros.fit_sample(X_hetero, y_hetero) + >>> print(X_resampled) + [['xxx' 1 1.0] + ['yyy' 2 2.0] + ['zzz' 3 3.0] + ['zzz' 3 3.0]] + >>> print(y_resampled) + [0 0 1 1] + See :ref:`sphx_glr_auto_examples_over-sampling_plot_random_over_sampling.py` for usage example. diff --git a/doc/under_sampling.rst b/doc/under_sampling.rst index f2412528e..2582b7e6d 100644 --- a/doc/under_sampling.rst +++ b/doc/under_sampling.rst @@ -103,6 +103,19 @@ by considering independently each targeted class:: >>> print(np.vstack({tuple(row) for row in X_resampled}).shape) (181, 2) +In addition, :class:`RandomUnderSampler` allows to sample heterogeneous data +(e.g. containing some strings):: + + >>> X_hetero = np.array([['xxx', 1, 1.0], ['yyy', 2, 2.0], ['zzz', 3, 3.0]], + ... dtype=np.object) + >>> y = np.array([0, 0, 1]) + >>> X_resampled, y_resampled = rus.fit_sample(X_hetero, y) + >>> print(X_resampled) + [['xxx' 1 1.0] + ['zzz' 3 3.0]] + >>> print(y_resampled) + [0 1] + See :ref:`sphx_glr_auto_examples_plot_sampling_strategy_usage.py`., :ref:`sphx_glr_auto_examples_under-sampling_plot_comparison_under_sampling.py`, and :ref:`sphx_glr_auto_examples_under-sampling_plot_random_under_sampler.py`. diff --git a/doc/whats_new/v0.0.4.rst b/doc/whats_new/v0.0.4.rst index 4e9858f48..90bbe629c 100644 --- a/doc/whats_new/v0.0.4.rst +++ b/doc/whats_new/v0.0.4.rst @@ -45,6 +45,11 @@ Enhancement :issue:`439` by :user:`Hugo Gascon` and :user:`Guillaume Lemaitre `. +- Allow :class:`imblearn.under_sampling.RandomUnderSampler` and + :class:`imblearn.over_sampling.RandomOverSampler` to sample object array + containing strings. + :issue:`448` by :user:`Guillaume Lemaitre `. + Bug fixes ......... diff --git a/imblearn/base.py b/imblearn/base.py index dbfe08070..eb2800b01 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -31,13 +31,6 @@ class SamplerMixin(six.with_metaclass(ABCMeta, BaseEstimator)): _estimator_type = 'sampler' - def _check_X_y(self, X, y): - """Private function to check that the X and y in fitting are the same - than in sampling.""" - X_hash, y_hash = hash_X_y(X, y) - if self.X_hash_ != X_hash or self.y_hash_ != y_hash: - raise RuntimeError("X and y need to be same array earlier fitted.") - def sample(self, X, y): """Resample the dataset. @@ -60,11 +53,10 @@ def sample(self, X, y): """ # Check the consistency of X and y - y, binarize_y = check_target_type(y, indicate_one_vs_all=True) - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) + X, y, binarize_y = self._check_X_y(X, y) check_is_fitted(self, 'sampling_strategy_') - self._check_X_y(X, y) + self._check_X_y_hash(X, y) output = self._sample(X, y) @@ -151,6 +143,19 @@ def __init__(self, sampling_strategy='auto', ratio=None): self.ratio = ratio self.logger = logging.getLogger(self.__module__) + @staticmethod + def _check_X_y(X, y): + y, binarize_y = check_target_type(y, indicate_one_vs_all=True) + X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) + return X, y, binarize_y + + def _check_X_y_hash(self, X, y): + """Private function to check that the X and y in fitting are the same + than in sampling.""" + X_hash, y_hash = hash_X_y(X, y) + if self.X_hash_ != X_hash or self.y_hash_ != y_hash: + raise RuntimeError("X and y need to be same array earlier fitted.") + @property def ratio_(self): # FIXME: remove in 0.6 @@ -183,9 +188,9 @@ def fit(self, X, y): """ self._deprecate_ratio() - y = check_target_type(y) - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) + X, y, _ = self._check_X_y(X, y) self.X_hash_, self.y_hash_ = hash_X_y(X, y) + # _sampling_type is defined in the children base class self.sampling_strategy_ = check_sampling_strategy( self.sampling_strategy, y, self._sampling_type) diff --git a/imblearn/combine/smote_enn.py b/imblearn/combine/smote_enn.py index 21d8a0a57..942acec54 100644 --- a/imblearn/combine/smote_enn.py +++ b/imblearn/combine/smote_enn.py @@ -12,7 +12,7 @@ from sklearn.base import clone from sklearn.utils import check_X_y -from ..base import SamplerMixin +from ..base import BaseSampler from ..over_sampling import SMOTE from ..over_sampling.base import BaseOverSampler from ..under_sampling import EditedNearestNeighbours @@ -24,7 +24,7 @@ @Substitution( sampling_strategy=BaseOverSampler._sampling_strategy_docstring, random_state=_random_state_docstring) -class SMOTEENN(SamplerMixin): +class SMOTEENN(BaseSampler): """Class to perform over-sampling using SMOTE and cleaning using ENN. Combine over- and under-sampling using SMOTE and Edited Nearest Neighbours. @@ -125,14 +125,6 @@ def _validate_estimator(self): else: self.enn_ = EditedNearestNeighbours(sampling_strategy='all') - @property - def ratio_(self): - # FIXME: remove in 0.6 - warnings.warn("'ratio' and 'ratio_' are deprecated. Use " - "'sampling_strategy' and 'sampling_strategy_' instead.", - DeprecationWarning) - return self.sampling_strategy_ - def fit(self, X, y): """Find the classes statistics before to perform sampling. diff --git a/imblearn/combine/smote_tomek.py b/imblearn/combine/smote_tomek.py index 99001e814..0a53cdf2a 100644 --- a/imblearn/combine/smote_tomek.py +++ b/imblearn/combine/smote_tomek.py @@ -13,7 +13,7 @@ from sklearn.base import clone from sklearn.utils import check_X_y -from ..base import SamplerMixin +from ..base import BaseSampler from ..over_sampling import SMOTE from ..over_sampling.base import BaseOverSampler from ..under_sampling import TomekLinks @@ -25,7 +25,7 @@ @Substitution( sampling_strategy=BaseOverSampler._sampling_strategy_docstring, random_state=_random_state_docstring) -class SMOTETomek(SamplerMixin): +class SMOTETomek(BaseSampler): """Class to perform over-sampling using SMOTE and cleaning using Tomek links. @@ -133,14 +133,6 @@ def _validate_estimator(self): else: self.tomek_ = TomekLinks(sampling_strategy='all') - @property - def ratio_(self): - # FIXME: remove in 0.6 - warnings.warn("'ratio' and 'ratio_' are deprecated. Use " - "'sampling_strategy' and 'sampling_strategy_' instead.", - DeprecationWarning) - return self.sampling_strategy_ - def fit(self, X, y): """Find the classes statistics before to perform sampling. diff --git a/imblearn/ensemble/base.py b/imblearn/ensemble/base.py index 5e24c5d56..ed012d9db 100644 --- a/imblearn/ensemble/base.py +++ b/imblearn/ensemble/base.py @@ -60,7 +60,7 @@ def sample(self, X, y): X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) check_is_fitted(self, 'sampling_strategy_') - self._check_X_y(X, y) + self._check_X_y_hash(X, y) output = self._sample(X, y) diff --git a/imblearn/over_sampling/random_over_sampler.py b/imblearn/over_sampling/random_over_sampler.py index 35181e387..d0ead9479 100644 --- a/imblearn/over_sampling/random_over_sampler.py +++ b/imblearn/over_sampling/random_over_sampler.py @@ -8,9 +8,10 @@ from collections import Counter import numpy as np -from sklearn.utils import check_random_state, safe_indexing +from sklearn.utils import check_X_y, check_random_state, safe_indexing from .base import BaseOverSampler +from ..utils import check_target_type from ..utils import Substitution from ..utils._docstring import _random_state_docstring @@ -79,6 +80,12 @@ def __init__(self, sampling_strategy='auto', self.return_indices = return_indices self.random_state = random_state + @staticmethod + def _check_X_y(X, y): + y, binarize_y = check_target_type(y, indicate_one_vs_all=True) + X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'], dtype=None) + return X, y, binarize_y + def _sample(self, X, y): """Resample the dataset. diff --git a/imblearn/over_sampling/tests/test_random_over_sampler.py b/imblearn/over_sampling/tests/test_random_over_sampler.py index 6b7ed686c..494719601 100644 --- a/imblearn/over_sampling/tests/test_random_over_sampler.py +++ b/imblearn/over_sampling/tests/test_random_over_sampler.py @@ -7,6 +7,7 @@ from collections import Counter +import pytest import numpy as np from sklearn.utils.testing import assert_allclose from sklearn.utils.testing import assert_array_equal @@ -88,3 +89,16 @@ def test_multiclass_fit_sample(): assert count_y_res[0] == 5 assert count_y_res[1] == 5 assert count_y_res[2] == 5 + + +def test_random_over_sampling_heterogeneous_data(): + X_hetero = np.array([['xxx', 1, 1.0], ['yyy', 2, 2.0], ['zzz', 3, 3.0]], + dtype=np.object) + y = np.array([0, 0, 1]) + ros = RandomOverSampler(random_state=RND_SEED) + X_res, y_res = ros.fit_sample(X_hetero, y) + + assert X_res.shape[0] == 4 + assert y_res.shape[0] == 4 + assert X_res.dtype == object + assert X_res[-1, 0] in X_hetero[:, 0] \ No newline at end of file diff --git a/imblearn/under_sampling/prototype_selection/random_under_sampler.py b/imblearn/under_sampling/prototype_selection/random_under_sampler.py index ee7c303e0..24f7b84c1 100644 --- a/imblearn/under_sampling/prototype_selection/random_under_sampler.py +++ b/imblearn/under_sampling/prototype_selection/random_under_sampler.py @@ -7,9 +7,11 @@ from __future__ import division import numpy as np -from sklearn.utils import check_random_state, safe_indexing + +from sklearn.utils import check_X_y, check_random_state, safe_indexing from ..base import BaseUnderSampler +from ...utils import check_target_type from ...utils import Substitution from ...utils._docstring import _random_state_docstring @@ -82,6 +84,12 @@ def __init__(self, self.return_indices = return_indices self.replacement = replacement + @staticmethod + def _check_X_y(X, y): + y, binarize_y = check_target_type(y, indicate_one_vs_all=True) + X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'], dtype=None) + return X, y, binarize_y + def _sample(self, X, y): """Resample the dataset. diff --git a/imblearn/under_sampling/prototype_selection/tests/test_random_under_sampler.py b/imblearn/under_sampling/prototype_selection/tests/test_random_under_sampler.py index 962cd12fb..e36dce79e 100644 --- a/imblearn/under_sampling/prototype_selection/tests/test_random_under_sampler.py +++ b/imblearn/under_sampling/prototype_selection/tests/test_random_under_sampler.py @@ -63,7 +63,6 @@ def test_rus_fit_sample_half(): [0.15490546, 0.3130677], [0.20792588, 1.49407907], [0.15490546, 0.3130677], [0.12372842, 0.6536186]]) y_gt = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1]) - print(X_resampled) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) @@ -78,3 +77,15 @@ def test_multiclass_fit_sample(): assert count_y_res[0] == 2 assert count_y_res[1] == 2 assert count_y_res[2] == 2 + + +def test_random_under_sampling_heterogeneous_data(): + X_hetero = np.array([['xxx', 1, 1.0], ['yyy', 2, 2.0], ['zzz', 3, 3.0]], + dtype=np.object) + y = np.array([0, 0, 1]) + rus = RandomUnderSampler(random_state=RND_SEED) + X_res, y_res = rus.fit_sample(X_hetero, y) + + assert X_res.shape[0] == 2 + assert y_res.shape[0] == 2 + assert X_res.dtype == object \ No newline at end of file diff --git a/imblearn/utils/validation.py b/imblearn/utils/validation.py index 1c5dc1a08..89749a6f5 100644 --- a/imblearn/utils/validation.py +++ b/imblearn/utils/validation.py @@ -100,7 +100,7 @@ def hash_X_y(X, y, n_samples=10, n_features=5): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : array_like, shape (n_samples, n_features) The ``X`` array. y : ndarray, shape (n_samples) @@ -122,7 +122,12 @@ def hash_X_y(X, y, n_samples=10, n_features=5): row_idx = slice(None, None, max(1, X.shape[0] // n_samples)) col_idx = slice(None, None, max(1, X.shape[1] // n_features)) - return joblib.hash(X[row_idx, col_idx]), joblib.hash(y[row_idx]) + if hasattr(X, 'iloc'): + X_hash = joblib.hash(X.iloc[row_idx, col_idx]) + else: + X_hash = joblib.hash(X[row_idx, col_idx]) + + return X_hash, joblib.hash(y[row_idx]) def _sampling_strategy_all(y, sampling_type): From f5ea4ae9f9fe8dc62bac95d9bea1de3f7690fbd1 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 23 Aug 2018 12:24:08 +0200 Subject: [PATCH 2/7] PEP8 --- imblearn/over_sampling/tests/test_random_over_sampler.py | 2 +- .../prototype_selection/tests/test_random_under_sampler.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/imblearn/over_sampling/tests/test_random_over_sampler.py b/imblearn/over_sampling/tests/test_random_over_sampler.py index 494719601..687941097 100644 --- a/imblearn/over_sampling/tests/test_random_over_sampler.py +++ b/imblearn/over_sampling/tests/test_random_over_sampler.py @@ -101,4 +101,4 @@ def test_random_over_sampling_heterogeneous_data(): assert X_res.shape[0] == 4 assert y_res.shape[0] == 4 assert X_res.dtype == object - assert X_res[-1, 0] in X_hetero[:, 0] \ No newline at end of file + assert X_res[-1, 0] in X_hetero[:, 0] diff --git a/imblearn/under_sampling/prototype_selection/tests/test_random_under_sampler.py b/imblearn/under_sampling/prototype_selection/tests/test_random_under_sampler.py index e36dce79e..109bf0235 100644 --- a/imblearn/under_sampling/prototype_selection/tests/test_random_under_sampler.py +++ b/imblearn/under_sampling/prototype_selection/tests/test_random_under_sampler.py @@ -88,4 +88,4 @@ def test_random_under_sampling_heterogeneous_data(): assert X_res.shape[0] == 2 assert y_res.shape[0] == 2 - assert X_res.dtype == object \ No newline at end of file + assert X_res.dtype == object From 8bbf0a7648ac18b9f85973ed0e2deefb1e3c2aca Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 23 Aug 2018 14:01:10 +0200 Subject: [PATCH 3/7] monkey patch the check_dtype_object from sklearn --- imblearn/utils/estimator_checks.py | 34 +++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index 3bb52d46d..1c87d2182 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -16,6 +16,7 @@ import numpy as np from scipy import sparse +from sklearn.base import clone from sklearn.datasets import make_classification from sklearn.cluster import KMeans from sklearn.preprocessing import label_binarize @@ -35,6 +36,33 @@ from imblearn.utils.testing import warns DONT_SUPPORT_RATIO = ['SVMSMOTE', 'BorderlineSMOTE'] +SUPPORT_STRING = ['RandomUnderSampler', 'RandomOverSampler'] + + +def monkey_patch_check_dtype_object(name, estimator_orig): + # check that estimators treat dtype object as numeric if possible + rng = np.random.RandomState(0) + X = rng.rand(40, 10).astype(object) + y = np.array([0] * 10 + [1] * 30, dtype=np.int) + estimator = clone(estimator_orig) + + estimator.fit(X, y) + if hasattr(estimator, "sample"): + estimator.sample(X, y) + + try: + estimator.fit(X, y.astype(object)) + except Exception as e: + if "Unknown label type" not in str(e): + raise + + if name not in SUPPORT_STRING: + X[0, 0] = {'foo': 'bar'} + msg = "argument must be a string or a number" + with pytest.raises(TypeError, match=msg): + estimator.fit(X, y) + else: + estimator.fit(X, y) def _yield_sampler_checks(name, Estimator): @@ -73,7 +101,11 @@ def check_estimator(Estimator): Class to check. Estimator is a class object (not an instance). """ name = Estimator.__name__ - # test scikit-learn compatibility + # monkey patch check_dtype_object for the sampler allowing strings + import sklearn.utils.estimator_checks + sklearn.utils.estimator_checks.check_dtype_object = \ + monkey_patch_check_dtype_object + # scikit-learn common tests sklearn_check_estimator(Estimator) check_parameters_default_constructible(name, Estimator) for check in _yield_all_checks(name, Estimator): From 5886b7ed8ea7d30105c887a02d5f819f06b2f663 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 23 Aug 2018 14:25:06 +0200 Subject: [PATCH 4/7] iter --- imblearn/over_sampling/tests/test_random_over_sampler.py | 1 - imblearn/utils/estimator_checks.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/imblearn/over_sampling/tests/test_random_over_sampler.py b/imblearn/over_sampling/tests/test_random_over_sampler.py index 687941097..c9bd37a42 100644 --- a/imblearn/over_sampling/tests/test_random_over_sampler.py +++ b/imblearn/over_sampling/tests/test_random_over_sampler.py @@ -7,7 +7,6 @@ from collections import Counter -import pytest import numpy as np from sklearn.utils.testing import assert_allclose from sklearn.utils.testing import assert_array_equal diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index 1c87d2182..5504ccfe3 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -24,6 +24,7 @@ as sklearn_check_estimator, check_parameters_default_constructible from sklearn.exceptions import NotFittedError from sklearn.utils.testing import assert_allclose +from sklearn.utils.testing import assert_raises_regex from sklearn.utils.testing import set_random_state from sklearn.utils.multiclass import type_of_target @@ -59,8 +60,7 @@ def monkey_patch_check_dtype_object(name, estimator_orig): if name not in SUPPORT_STRING: X[0, 0] = {'foo': 'bar'} msg = "argument must be a string or a number" - with pytest.raises(TypeError, match=msg): - estimator.fit(X, y) + assert_raises_regex(TypeError, msg, estimator.fit, X, y) else: estimator.fit(X, y) From 3c6bf7a87ae3bdbe91e1e06adc7d683930c552c9 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 23 Aug 2018 14:42:13 +0200 Subject: [PATCH 5/7] fix doc --- doc/under_sampling.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/under_sampling.rst b/doc/under_sampling.rst index 2582b7e6d..a45375c4b 100644 --- a/doc/under_sampling.rst +++ b/doc/under_sampling.rst @@ -108,8 +108,8 @@ In addition, :class:`RandomUnderSampler` allows to sample heterogeneous data >>> X_hetero = np.array([['xxx', 1, 1.0], ['yyy', 2, 2.0], ['zzz', 3, 3.0]], ... dtype=np.object) - >>> y = np.array([0, 0, 1]) - >>> X_resampled, y_resampled = rus.fit_sample(X_hetero, y) + >>> y_hetero = np.array([0, 0, 1]) + >>> X_resampled, y_resampled = rus.fit_sample(X_hetero, y_hetero) >>> print(X_resampled) [['xxx' 1 1.0] ['zzz' 3 3.0]] From 8d3fd3ccaf12f90e83640f6cc9cc479416a51a61 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 23 Aug 2018 14:44:10 +0200 Subject: [PATCH 6/7] improve documentation --- imblearn/over_sampling/random_over_sampler.py | 2 ++ .../under_sampling/prototype_selection/random_under_sampler.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/imblearn/over_sampling/random_over_sampler.py b/imblearn/over_sampling/random_over_sampler.py index d0ead9479..73cca1c66 100644 --- a/imblearn/over_sampling/random_over_sampler.py +++ b/imblearn/over_sampling/random_over_sampler.py @@ -45,6 +45,8 @@ class RandomOverSampler(BaseOverSampler): Notes ----- Supports multi-class resampling by sampling each class independently. + Supports heterogeneous data as object array containing string and numeric + data. See :ref:`sphx_glr_auto_examples_over-sampling_plot_comparison_over_sampling.py`, diff --git a/imblearn/under_sampling/prototype_selection/random_under_sampler.py b/imblearn/under_sampling/prototype_selection/random_under_sampler.py index 24f7b84c1..3b3c7691d 100644 --- a/imblearn/under_sampling/prototype_selection/random_under_sampler.py +++ b/imblearn/under_sampling/prototype_selection/random_under_sampler.py @@ -48,6 +48,8 @@ class RandomUnderSampler(BaseUnderSampler): Notes ----- Supports multi-class resampling by sampling each class independently. + Supports heterogeneous data as object array containing string and numeric + data. See :ref:`sphx_glr_auto_examples_plot_sampling_strategy_usage.py` and From 5539a72b7810292faa4187fe5cc6e5487a71d810 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 23 Aug 2018 14:57:47 +0200 Subject: [PATCH 7/7] additional tests --- imblearn/utils/tests/test_validation.py | 14 ++++++++++++++ imblearn/utils/validation.py | 10 +++++----- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/imblearn/utils/tests/test_validation.py b/imblearn/utils/tests/test_validation.py index 99424d12b..b09b3b03c 100644 --- a/imblearn/utils/tests/test_validation.py +++ b/imblearn/utils/tests/test_validation.py @@ -375,6 +375,20 @@ def test_hash_X_y(): assert hash_X_y(X, y) == (joblib.hash(X), joblib.hash(y)) +def test_hash_X_y_pandas(): + pd = pytest.importorskip("pandas") + rng = check_random_state(0) + X = pd.DataFrame(rng.randn(2000, 20)) + y = pd.Series([0] * 500 + [1] * 1500) + assert hash_X_y(X, y, 10, 10) == (joblib.hash(X.iloc[::200, ::2]), + joblib.hash(y.iloc[::200])) + + X = pd.DataFrame(rng.randn(5, 2)) + y = pd.Series([0] * 2 + [1] * 3) + # all data will be used in this case + assert hash_X_y(X, y) == (joblib.hash(X), joblib.hash(y)) + + @pytest.mark.parametrize( "sampling_strategy, sampling_type, expected_result", [({3: 25, 1: 25, 2: 25}, 'under-sampling', diff --git a/imblearn/utils/validation.py b/imblearn/utils/validation.py index 89749a6f5..bc6d9ecc9 100644 --- a/imblearn/utils/validation.py +++ b/imblearn/utils/validation.py @@ -122,12 +122,12 @@ def hash_X_y(X, y, n_samples=10, n_features=5): row_idx = slice(None, None, max(1, X.shape[0] // n_samples)) col_idx = slice(None, None, max(1, X.shape[1] // n_features)) - if hasattr(X, 'iloc'): - X_hash = joblib.hash(X.iloc[row_idx, col_idx]) - else: - X_hash = joblib.hash(X[row_idx, col_idx]) + X_subset = (X.iloc[row_idx, col_idx] + if hasattr(X, 'iloc') else X[row_idx, col_idx]) + y_subset = (y.iloc[row_idx] + if hasattr(y, 'iloc') else y[row_idx]) - return X_hash, joblib.hash(y[row_idx]) + return joblib.hash(X_subset), joblib.hash(y_subset) def _sampling_strategy_all(y, sampling_type):