diff --git a/doc/over_sampling.rst b/doc/over_sampling.rst index a100750ee..296e44e4b 100644 --- a/doc/over_sampling.rst +++ b/doc/over_sampling.rst @@ -52,6 +52,22 @@ As a result, the majority class does not take over the other classes during the training process. Consequently, all classes are represented by the decision function. +In addition, :class:`RandomOverSampler` allows sampling heterogeneous data +(e.g. containing some strings):: + + >>> import numpy as np + >>> X_hetero = np.array([['xxx', 1, 1.0], ['yyy', 2, 2.0], ['zzz', 3, 3.0]], + ... dtype=np.object) + >>> y_hetero = np.array([0, 0, 1]) + >>> X_resampled, y_resampled = ros.fit_sample(X_hetero, y_hetero) + >>> print(X_resampled) + [['xxx' 1 1.0] + ['yyy' 2 2.0] + ['zzz' 3 3.0] + ['zzz' 3 3.0]] + >>> print(y_resampled) + [0 0 1 1] + See :ref:`sphx_glr_auto_examples_over-sampling_plot_random_over_sampling.py` for usage example. diff --git a/doc/under_sampling.rst b/doc/under_sampling.rst index f2412528e..a45375c4b 100644 --- a/doc/under_sampling.rst +++ b/doc/under_sampling.rst @@ -103,6 +103,19 @@ by considering independently each targeted class:: >>> print(np.vstack({tuple(row) for row in X_resampled}).shape) (181, 2) +In addition, :class:`RandomUnderSampler` allows sampling heterogeneous data +(e.g. containing some strings):: + + >>> X_hetero = np.array([['xxx', 1, 1.0], ['yyy', 2, 2.0], ['zzz', 3, 3.0]], + ... dtype=np.object) + >>> y_hetero = np.array([0, 0, 1]) + >>> X_resampled, y_resampled = rus.fit_sample(X_hetero, y_hetero) + >>> print(X_resampled) + [['xxx' 1 1.0] + ['zzz' 3 3.0]] + >>> print(y_resampled) + [0 1] + See :ref:`sphx_glr_auto_examples_plot_sampling_strategy_usage.py`., :ref:`sphx_glr_auto_examples_under-sampling_plot_comparison_under_sampling.py`, and :ref:`sphx_glr_auto_examples_under-sampling_plot_random_under_sampler.py`. 
diff --git a/doc/whats_new/v0.0.4.rst b/doc/whats_new/v0.0.4.rst index 4e9858f48..90bbe629c 100644 --- a/doc/whats_new/v0.0.4.rst +++ b/doc/whats_new/v0.0.4.rst @@ -45,6 +45,11 @@ Enhancement :issue:`439` by :user:`Hugo Gascon` and :user:`Guillaume Lemaitre <glemaitre>`. +- Allow :class:`imblearn.under_sampling.RandomUnderSampler` and + :class:`imblearn.over_sampling.RandomOverSampler` to sample object arrays + containing strings. + :issue:`448` by :user:`Guillaume Lemaitre <glemaitre>`. + Bug fixes ......... diff --git a/imblearn/base.py b/imblearn/base.py index dbfe08070..eb2800b01 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -31,13 +31,6 @@ class SamplerMixin(six.with_metaclass(ABCMeta, BaseEstimator)): _estimator_type = 'sampler' - def _check_X_y(self, X, y): - """Private function to check that the X and y in fitting are the same - than in sampling.""" - X_hash, y_hash = hash_X_y(X, y) - if self.X_hash_ != X_hash or self.y_hash_ != y_hash: - raise RuntimeError("X and y need to be same array earlier fitted.") - def sample(self, X, y): """Resample the dataset. 
@@ -60,11 +53,10 @@ def sample(self, X, y): """ # Check the consistency of X and y - y, binarize_y = check_target_type(y, indicate_one_vs_all=True) - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) + X, y, binarize_y = self._check_X_y(X, y) check_is_fitted(self, 'sampling_strategy_') - self._check_X_y(X, y) + self._check_X_y_hash(X, y) output = self._sample(X, y) @@ -151,6 +143,19 @@ def __init__(self, sampling_strategy='auto', ratio=None): self.ratio = ratio self.logger = logging.getLogger(self.__module__) + @staticmethod + def _check_X_y(X, y): + y, binarize_y = check_target_type(y, indicate_one_vs_all=True) + X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) + return X, y, binarize_y + + def _check_X_y_hash(self, X, y): + """Private function to check that the X and y in fitting are the same + than in sampling.""" + X_hash, y_hash = hash_X_y(X, y) + if self.X_hash_ != X_hash or self.y_hash_ != y_hash: + raise RuntimeError("X and y need to be same array earlier fitted.") + @property def ratio_(self): # FIXME: remove in 0.6 @@ -183,9 +188,9 @@ def fit(self, X, y): """ self._deprecate_ratio() - y = check_target_type(y) - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) + X, y, _ = self._check_X_y(X, y) self.X_hash_, self.y_hash_ = hash_X_y(X, y) + # _sampling_type is defined in the children base class self.sampling_strategy_ = check_sampling_strategy( self.sampling_strategy, y, self._sampling_type) diff --git a/imblearn/combine/smote_enn.py b/imblearn/combine/smote_enn.py index 21d8a0a57..942acec54 100644 --- a/imblearn/combine/smote_enn.py +++ b/imblearn/combine/smote_enn.py @@ -12,7 +12,7 @@ from sklearn.base import clone from sklearn.utils import check_X_y -from ..base import SamplerMixin +from ..base import BaseSampler from ..over_sampling import SMOTE from ..over_sampling.base import BaseOverSampler from ..under_sampling import EditedNearestNeighbours @@ -24,7 +24,7 @@ @Substitution( sampling_strategy=BaseOverSampler._sampling_strategy_docstring, 
random_state=_random_state_docstring) -class SMOTEENN(SamplerMixin): +class SMOTEENN(BaseSampler): """Class to perform over-sampling using SMOTE and cleaning using ENN. Combine over- and under-sampling using SMOTE and Edited Nearest Neighbours. @@ -125,14 +125,6 @@ def _validate_estimator(self): else: self.enn_ = EditedNearestNeighbours(sampling_strategy='all') - @property - def ratio_(self): - # FIXME: remove in 0.6 - warnings.warn("'ratio' and 'ratio_' are deprecated. Use " - "'sampling_strategy' and 'sampling_strategy_' instead.", - DeprecationWarning) - return self.sampling_strategy_ - def fit(self, X, y): """Find the classes statistics before to perform sampling. diff --git a/imblearn/combine/smote_tomek.py b/imblearn/combine/smote_tomek.py index 99001e814..0a53cdf2a 100644 --- a/imblearn/combine/smote_tomek.py +++ b/imblearn/combine/smote_tomek.py @@ -13,7 +13,7 @@ from sklearn.base import clone from sklearn.utils import check_X_y -from ..base import SamplerMixin +from ..base import BaseSampler from ..over_sampling import SMOTE from ..over_sampling.base import BaseOverSampler from ..under_sampling import TomekLinks @@ -25,7 +25,7 @@ @Substitution( sampling_strategy=BaseOverSampler._sampling_strategy_docstring, random_state=_random_state_docstring) -class SMOTETomek(SamplerMixin): +class SMOTETomek(BaseSampler): """Class to perform over-sampling using SMOTE and cleaning using Tomek links. @@ -133,14 +133,6 @@ def _validate_estimator(self): else: self.tomek_ = TomekLinks(sampling_strategy='all') - @property - def ratio_(self): - # FIXME: remove in 0.6 - warnings.warn("'ratio' and 'ratio_' are deprecated. Use " - "'sampling_strategy' and 'sampling_strategy_' instead.", - DeprecationWarning) - return self.sampling_strategy_ - def fit(self, X, y): """Find the classes statistics before to perform sampling. 
diff --git a/imblearn/ensemble/base.py b/imblearn/ensemble/base.py index 5e24c5d56..ed012d9db 100644 --- a/imblearn/ensemble/base.py +++ b/imblearn/ensemble/base.py @@ -60,7 +60,7 @@ def sample(self, X, y): X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) check_is_fitted(self, 'sampling_strategy_') - self._check_X_y(X, y) + self._check_X_y_hash(X, y) output = self._sample(X, y) diff --git a/imblearn/over_sampling/random_over_sampler.py b/imblearn/over_sampling/random_over_sampler.py index 35181e387..73cca1c66 100644 --- a/imblearn/over_sampling/random_over_sampler.py +++ b/imblearn/over_sampling/random_over_sampler.py @@ -8,9 +8,10 @@ from collections import Counter import numpy as np -from sklearn.utils import check_random_state, safe_indexing +from sklearn.utils import check_X_y, check_random_state, safe_indexing from .base import BaseOverSampler +from ..utils import check_target_type from ..utils import Substitution from ..utils._docstring import _random_state_docstring @@ -44,6 +45,8 @@ class RandomOverSampler(BaseOverSampler): Notes ----- Supports multi-class resampling by sampling each class independently. + Supports heterogeneous data as object array containing string and numeric + data. See :ref:`sphx_glr_auto_examples_over-sampling_plot_comparison_over_sampling.py`, @@ -79,6 +82,12 @@ def __init__(self, sampling_strategy='auto', self.return_indices = return_indices self.random_state = random_state + @staticmethod + def _check_X_y(X, y): + y, binarize_y = check_target_type(y, indicate_one_vs_all=True) + X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'], dtype=None) + return X, y, binarize_y + def _sample(self, X, y): """Resample the dataset. 
diff --git a/imblearn/over_sampling/tests/test_random_over_sampler.py b/imblearn/over_sampling/tests/test_random_over_sampler.py index 6b7ed686c..c9bd37a42 100644 --- a/imblearn/over_sampling/tests/test_random_over_sampler.py +++ b/imblearn/over_sampling/tests/test_random_over_sampler.py @@ -88,3 +88,16 @@ def test_multiclass_fit_sample(): assert count_y_res[0] == 5 assert count_y_res[1] == 5 assert count_y_res[2] == 5 + + +def test_random_over_sampling_heterogeneous_data(): + X_hetero = np.array([['xxx', 1, 1.0], ['yyy', 2, 2.0], ['zzz', 3, 3.0]], + dtype=np.object) + y = np.array([0, 0, 1]) + ros = RandomOverSampler(random_state=RND_SEED) + X_res, y_res = ros.fit_sample(X_hetero, y) + + assert X_res.shape[0] == 4 + assert y_res.shape[0] == 4 + assert X_res.dtype == object + assert X_res[-1, 0] in X_hetero[:, 0] diff --git a/imblearn/under_sampling/prototype_selection/random_under_sampler.py b/imblearn/under_sampling/prototype_selection/random_under_sampler.py index ee7c303e0..3b3c7691d 100644 --- a/imblearn/under_sampling/prototype_selection/random_under_sampler.py +++ b/imblearn/under_sampling/prototype_selection/random_under_sampler.py @@ -7,9 +7,11 @@ from __future__ import division import numpy as np -from sklearn.utils import check_random_state, safe_indexing + +from sklearn.utils import check_X_y, check_random_state, safe_indexing from ..base import BaseUnderSampler +from ...utils import check_target_type from ...utils import Substitution from ...utils._docstring import _random_state_docstring @@ -46,6 +48,8 @@ class RandomUnderSampler(BaseUnderSampler): Notes ----- Supports multi-class resampling by sampling each class independently. + Supports heterogeneous data as object array containing string and numeric + data. 
See :ref:`sphx_glr_auto_examples_plot_sampling_strategy_usage.py` and @@ -82,6 +86,12 @@ def __init__(self, self.return_indices = return_indices self.replacement = replacement + @staticmethod + def _check_X_y(X, y): + y, binarize_y = check_target_type(y, indicate_one_vs_all=True) + X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'], dtype=None) + return X, y, binarize_y + def _sample(self, X, y): """Resample the dataset. diff --git a/imblearn/under_sampling/prototype_selection/tests/test_random_under_sampler.py b/imblearn/under_sampling/prototype_selection/tests/test_random_under_sampler.py index 962cd12fb..109bf0235 100644 --- a/imblearn/under_sampling/prototype_selection/tests/test_random_under_sampler.py +++ b/imblearn/under_sampling/prototype_selection/tests/test_random_under_sampler.py @@ -63,7 +63,6 @@ def test_rus_fit_sample_half(): [0.15490546, 0.3130677], [0.20792588, 1.49407907], [0.15490546, 0.3130677], [0.12372842, 0.6536186]]) y_gt = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1]) - print(X_resampled) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) @@ -78,3 +77,15 @@ def test_multiclass_fit_sample(): assert count_y_res[0] == 2 assert count_y_res[1] == 2 assert count_y_res[2] == 2 + + +def test_random_under_sampling_heterogeneous_data(): + X_hetero = np.array([['xxx', 1, 1.0], ['yyy', 2, 2.0], ['zzz', 3, 3.0]], + dtype=np.object) + y = np.array([0, 0, 1]) + rus = RandomUnderSampler(random_state=RND_SEED) + X_res, y_res = rus.fit_sample(X_hetero, y) + + assert X_res.shape[0] == 2 + assert y_res.shape[0] == 2 + assert X_res.dtype == object diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index 3bb52d46d..5504ccfe3 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -16,6 +16,7 @@ import numpy as np from scipy import sparse +from sklearn.base import clone from sklearn.datasets import make_classification from sklearn.cluster import KMeans from 
sklearn.preprocessing import label_binarize @@ -23,6 +24,7 @@ as sklearn_check_estimator, check_parameters_default_constructible from sklearn.exceptions import NotFittedError from sklearn.utils.testing import assert_allclose +from sklearn.utils.testing import assert_raises_regex from sklearn.utils.testing import set_random_state from sklearn.utils.multiclass import type_of_target @@ -35,6 +37,32 @@ from imblearn.utils.testing import warns DONT_SUPPORT_RATIO = ['SVMSMOTE', 'BorderlineSMOTE'] +SUPPORT_STRING = ['RandomUnderSampler', 'RandomOverSampler'] + + +def monkey_patch_check_dtype_object(name, estimator_orig): + # check that estimators treat dtype object as numeric if possible + rng = np.random.RandomState(0) + X = rng.rand(40, 10).astype(object) + y = np.array([0] * 10 + [1] * 30, dtype=np.int) + estimator = clone(estimator_orig) + + estimator.fit(X, y) + if hasattr(estimator, "sample"): + estimator.sample(X, y) + + try: + estimator.fit(X, y.astype(object)) + except Exception as e: + if "Unknown label type" not in str(e): + raise + + if name not in SUPPORT_STRING: + X[0, 0] = {'foo': 'bar'} + msg = "argument must be a string or a number" + assert_raises_regex(TypeError, msg, estimator.fit, X, y) + else: + estimator.fit(X, y) def _yield_sampler_checks(name, Estimator): @@ -73,7 +101,11 @@ def check_estimator(Estimator): Class to check. Estimator is a class object (not an instance). 
""" name = Estimator.__name__ - # test scikit-learn compatibility + # monkey patch check_dtype_object for the sampler allowing strings + import sklearn.utils.estimator_checks + sklearn.utils.estimator_checks.check_dtype_object = \ + monkey_patch_check_dtype_object + # scikit-learn common tests sklearn_check_estimator(Estimator) check_parameters_default_constructible(name, Estimator) for check in _yield_all_checks(name, Estimator): diff --git a/imblearn/utils/tests/test_validation.py b/imblearn/utils/tests/test_validation.py index 99424d12b..b09b3b03c 100644 --- a/imblearn/utils/tests/test_validation.py +++ b/imblearn/utils/tests/test_validation.py @@ -375,6 +375,20 @@ def test_hash_X_y(): assert hash_X_y(X, y) == (joblib.hash(X), joblib.hash(y)) +def test_hash_X_y_pandas(): + pd = pytest.importorskip("pandas") + rng = check_random_state(0) + X = pd.DataFrame(rng.randn(2000, 20)) + y = pd.Series([0] * 500 + [1] * 1500) + assert hash_X_y(X, y, 10, 10) == (joblib.hash(X.iloc[::200, ::2]), + joblib.hash(y.iloc[::200])) + + X = pd.DataFrame(rng.randn(5, 2)) + y = pd.Series([0] * 2 + [1] * 3) + # all data will be used in this case + assert hash_X_y(X, y) == (joblib.hash(X), joblib.hash(y)) + + @pytest.mark.parametrize( "sampling_strategy, sampling_type, expected_result", [({3: 25, 1: 25, 2: 25}, 'under-sampling', diff --git a/imblearn/utils/validation.py b/imblearn/utils/validation.py index 1c5dc1a08..bc6d9ecc9 100644 --- a/imblearn/utils/validation.py +++ b/imblearn/utils/validation.py @@ -100,7 +100,7 @@ def hash_X_y(X, y, n_samples=10, n_features=5): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : array_like, shape (n_samples, n_features) The ``X`` array. 
y : ndarray, shape (n_samples) @@ -122,7 +122,12 @@ def hash_X_y(X, y, n_samples=10, n_features=5): row_idx = slice(None, None, max(1, X.shape[0] // n_samples)) col_idx = slice(None, None, max(1, X.shape[1] // n_features)) - return joblib.hash(X[row_idx, col_idx]), joblib.hash(y[row_idx]) + X_subset = (X.iloc[row_idx, col_idx] + if hasattr(X, 'iloc') else X[row_idx, col_idx]) + y_subset = (y.iloc[row_idx] + if hasattr(y, 'iloc') else y[row_idx]) + + return joblib.hash(X_subset), joblib.hash(y_subset) def _sampling_strategy_all(y, sampling_type):