Skip to content

EHN: random sampler can sample from heterogeneous data #451

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Aug 23, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions doc/over_sampling.rst
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,22 @@ As a result, the majority class does not take over the other classes during the
training process. Consequently, all classes are represented by the decision
function.

In addition, :class:`RandomOverSampler` allows sampling heterogeneous data
(e.g. containing some strings)::

>>> import numpy as np
>>> X_hetero = np.array([['xxx', 1, 1.0], ['yyy', 2, 2.0], ['zzz', 3, 3.0]],
... dtype=np.object)
>>> y_hetero = np.array([0, 0, 1])
>>> X_resampled, y_resampled = ros.fit_sample(X_hetero, y_hetero)
>>> print(X_resampled)
[['xxx' 1 1.0]
['yyy' 2 2.0]
['zzz' 3 3.0]
['zzz' 3 3.0]]
>>> print(y_resampled)
[0 0 1 1]

See :ref:`sphx_glr_auto_examples_over-sampling_plot_random_over_sampling.py`
for usage example.

Expand Down
13 changes: 13 additions & 0 deletions doc/under_sampling.rst
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,19 @@ by considering independently each targeted class::
>>> print(np.vstack({tuple(row) for row in X_resampled}).shape)
(181, 2)

In addition, :class:`RandomUnderSampler` allows sampling heterogeneous data
(e.g. containing some strings)::

>>> X_hetero = np.array([['xxx', 1, 1.0], ['yyy', 2, 2.0], ['zzz', 3, 3.0]],
... dtype=np.object)
>>> y_hetero = np.array([0, 0, 1])
>>> X_resampled, y_resampled = rus.fit_sample(X_hetero, y_hetero)
>>> print(X_resampled)
[['xxx' 1 1.0]
['zzz' 3 3.0]]
>>> print(y_resampled)
[0 1]

See :ref:`sphx_glr_auto_examples_plot_sampling_strategy_usage.py`,
:ref:`sphx_glr_auto_examples_under-sampling_plot_comparison_under_sampling.py`,
and :ref:`sphx_glr_auto_examples_under-sampling_plot_random_under_sampler.py`.
Expand Down
5 changes: 5 additions & 0 deletions doc/whats_new/v0.0.4.rst
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,11 @@ Enhancement
:issue:`439` by :user:`Hugo Gascon<hgascon>` and
:user:`Guillaume Lemaitre <glemaitre>`.

- Allow :class:`imblearn.under_sampling.RandomUnderSampler` and
:class:`imblearn.over_sampling.RandomOverSampler` to sample object arrays
containing strings.
:issue:`448` by :user:`Guillaume Lemaitre <glemaitre>`.

Bug fixes
.........

Expand Down
29 changes: 17 additions & 12 deletions imblearn/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,13 +31,6 @@ class SamplerMixin(six.with_metaclass(ABCMeta, BaseEstimator)):

_estimator_type = 'sampler'

def _check_X_y(self, X, y):
    """Ensure that ``X`` and ``y`` hash to the values stored at fit time.

    The hashes of the given arrays are computed with ``hash_X_y`` and
    compared with ``self.X_hash_`` and ``self.y_hash_``.

    Raises
    ------
    RuntimeError
        If either hash differs from the stored one.
    """
    current_X_hash, current_y_hash = hash_X_y(X, y)
    unchanged = (self.X_hash_ == current_X_hash and
                 self.y_hash_ == current_y_hash)
    if not unchanged:
        raise RuntimeError("X and y need to be same array earlier fitted.")

def sample(self, X, y):
"""Resample the dataset.

Expand All @@ -60,11 +53,10 @@ def sample(self, X, y):

"""
# Check the consistency of X and y
y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
X, y, binarize_y = self._check_X_y(X, y)

check_is_fitted(self, 'sampling_strategy_')
self._check_X_y(X, y)
self._check_X_y_hash(X, y)

output = self._sample(X, y)

Expand Down Expand Up @@ -151,6 +143,19 @@ def __init__(self, sampling_strategy='auto', ratio=None):
self.ratio = ratio
self.logger = logging.getLogger(self.__module__)

@staticmethod
def _check_X_y(X, y):
    """Validate the pair ``(X, y)``.

    The target goes through ``check_target_type`` (called with
    ``indicate_one_vs_all=True``) and then both arrays go through
    ``check_X_y``, which is asked to accept CSR and CSC sparse input.

    Returns
    -------
    X, y
        The two values returned by ``check_X_y``.
    binarize_y
        The second value returned by ``check_target_type``.
    """
    checked_y, binarize_flag = check_target_type(
        y, indicate_one_vs_all=True)
    checked_X, checked_y = check_X_y(
        X, checked_y, accept_sparse=['csr', 'csc'])
    return checked_X, checked_y, binarize_flag

def _check_X_y_hash(self, X, y):
    """Ensure that ``X`` and ``y`` hash to the values stored at fit time.

    The hashes of the given arrays are computed with ``hash_X_y`` and
    compared with ``self.X_hash_`` and ``self.y_hash_``.

    Raises
    ------
    RuntimeError
        If either hash differs from the stored one.
    """
    current_X_hash, current_y_hash = hash_X_y(X, y)
    unchanged = (self.X_hash_ == current_X_hash and
                 self.y_hash_ == current_y_hash)
    if not unchanged:
        raise RuntimeError("X and y need to be same array earlier fitted.")

@property
def ratio_(self):
# FIXME: remove in 0.6
Expand Down Expand Up @@ -183,9 +188,9 @@ def fit(self, X, y):

"""
self._deprecate_ratio()
y = check_target_type(y)
X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
X, y, _ = self._check_X_y(X, y)
self.X_hash_, self.y_hash_ = hash_X_y(X, y)
# _sampling_type is defined in the children base class
self.sampling_strategy_ = check_sampling_strategy(
self.sampling_strategy, y, self._sampling_type)

Expand Down
12 changes: 2 additions & 10 deletions imblearn/combine/smote_enn.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from sklearn.base import clone
from sklearn.utils import check_X_y

from ..base import SamplerMixin
from ..base import BaseSampler
from ..over_sampling import SMOTE
from ..over_sampling.base import BaseOverSampler
from ..under_sampling import EditedNearestNeighbours
Expand All @@ -24,7 +24,7 @@
@Substitution(
sampling_strategy=BaseOverSampler._sampling_strategy_docstring,
random_state=_random_state_docstring)
class SMOTEENN(SamplerMixin):
class SMOTEENN(BaseSampler):
"""Class to perform over-sampling using SMOTE and cleaning using ENN.

Combine over- and under-sampling using SMOTE and Edited Nearest Neighbours.
Expand Down Expand Up @@ -125,14 +125,6 @@ def _validate_estimator(self):
else:
self.enn_ = EditedNearestNeighbours(sampling_strategy='all')

@property
def ratio_(self):
    """Deprecated accessor returning ``sampling_strategy_``.

    A ``DeprecationWarning`` is emitted on every access.
    """
    # FIXME: remove in 0.6
    deprecation_msg = ("'ratio' and 'ratio_' are deprecated. Use "
                       "'sampling_strategy' and 'sampling_strategy_' instead.")
    warnings.warn(deprecation_msg, DeprecationWarning)
    return self.sampling_strategy_

def fit(self, X, y):
"""Find the classes statistics before to perform sampling.

Expand Down
12 changes: 2 additions & 10 deletions imblearn/combine/smote_tomek.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from sklearn.base import clone
from sklearn.utils import check_X_y

from ..base import SamplerMixin
from ..base import BaseSampler
from ..over_sampling import SMOTE
from ..over_sampling.base import BaseOverSampler
from ..under_sampling import TomekLinks
Expand All @@ -25,7 +25,7 @@
@Substitution(
sampling_strategy=BaseOverSampler._sampling_strategy_docstring,
random_state=_random_state_docstring)
class SMOTETomek(SamplerMixin):
class SMOTETomek(BaseSampler):
"""Class to perform over-sampling using SMOTE and cleaning using
Tomek links.

Expand Down Expand Up @@ -133,14 +133,6 @@ def _validate_estimator(self):
else:
self.tomek_ = TomekLinks(sampling_strategy='all')

@property
def ratio_(self):
    """Deprecated accessor returning ``sampling_strategy_``.

    A ``DeprecationWarning`` is emitted on every access.
    """
    # FIXME: remove in 0.6
    deprecation_msg = ("'ratio' and 'ratio_' are deprecated. Use "
                       "'sampling_strategy' and 'sampling_strategy_' instead.")
    warnings.warn(deprecation_msg, DeprecationWarning)
    return self.sampling_strategy_

def fit(self, X, y):
"""Find the classes statistics before to perform sampling.

Expand Down
2 changes: 1 addition & 1 deletion imblearn/ensemble/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def sample(self, X, y):
X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])

check_is_fitted(self, 'sampling_strategy_')
self._check_X_y(X, y)
self._check_X_y_hash(X, y)

output = self._sample(X, y)

Expand Down
11 changes: 10 additions & 1 deletion imblearn/over_sampling/random_over_sampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,10 @@
from collections import Counter

import numpy as np
from sklearn.utils import check_random_state, safe_indexing
from sklearn.utils import check_X_y, check_random_state, safe_indexing

from .base import BaseOverSampler
from ..utils import check_target_type
from ..utils import Substitution
from ..utils._docstring import _random_state_docstring

Expand Down Expand Up @@ -44,6 +45,8 @@ class RandomOverSampler(BaseOverSampler):
Notes
-----
Supports multi-class resampling by sampling each class independently.
Supports heterogeneous data as object array containing string and numeric
data.

See
:ref:`sphx_glr_auto_examples_over-sampling_plot_comparison_over_sampling.py`,
Expand Down Expand Up @@ -79,6 +82,12 @@ def __init__(self, sampling_strategy='auto',
self.return_indices = return_indices
self.random_state = random_state

@staticmethod
def _check_X_y(X, y):
    """Validate the pair ``(X, y)`` without forcing a dtype on ``X``.

    The target goes through ``check_target_type`` (called with
    ``indicate_one_vs_all=True``) and then both arrays go through
    ``check_X_y``, which is asked to accept CSR and CSC sparse input.
    ``dtype=None`` is forwarded to ``check_X_y``.
    NOTE(review): presumably this keeps object arrays holding strings
    as they are; confirm against the scikit-learn documentation.

    Returns
    -------
    X, y
        The two values returned by ``check_X_y``.
    binarize_y
        The second value returned by ``check_target_type``.
    """
    checked_y, binarize_flag = check_target_type(
        y, indicate_one_vs_all=True)
    checked_X, checked_y = check_X_y(
        X, checked_y, accept_sparse=['csr', 'csc'], dtype=None)
    return checked_X, checked_y, binarize_flag

def _sample(self, X, y):
"""Resample the dataset.

Expand Down
13 changes: 13 additions & 0 deletions imblearn/over_sampling/tests/test_random_over_sampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,3 +88,16 @@ def test_multiclass_fit_sample():
assert count_y_res[0] == 5
assert count_y_res[1] == 5
assert count_y_res[2] == 5


def test_random_over_sampling_heterogeneous_data():
    """Check that RandomOverSampler handles an object array mixing strings
    and numbers and returns an object array."""
    # Use the builtin ``object`` rather than the ``np.object`` alias, which
    # was deprecated in NumPy 1.20 and removed in NumPy 1.24.
    X_hetero = np.array([['xxx', 1, 1.0], ['yyy', 2, 2.0], ['zzz', 3, 3.0]],
                        dtype=object)
    y = np.array([0, 0, 1])
    ros = RandomOverSampler(random_state=RND_SEED)
    X_res, y_res = ros.fit_sample(X_hetero, y)

    # one extra sample is expected so that both classes count two samples
    assert X_res.shape[0] == 4
    assert y_res.shape[0] == 4
    assert X_res.dtype == object
    # the added row must come from the original data
    assert X_res[-1, 0] in X_hetero[:, 0]
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,11 @@
from __future__ import division

import numpy as np
from sklearn.utils import check_random_state, safe_indexing

from sklearn.utils import check_X_y, check_random_state, safe_indexing

from ..base import BaseUnderSampler
from ...utils import check_target_type
from ...utils import Substitution
from ...utils._docstring import _random_state_docstring

Expand Down Expand Up @@ -46,6 +48,8 @@ class RandomUnderSampler(BaseUnderSampler):
Notes
-----
Supports multi-class resampling by sampling each class independently.
Supports heterogeneous data as object array containing string and numeric
data.

See
:ref:`sphx_glr_auto_examples_plot_sampling_strategy_usage.py` and
Expand Down Expand Up @@ -82,6 +86,12 @@ def __init__(self,
self.return_indices = return_indices
self.replacement = replacement

@staticmethod
def _check_X_y(X, y):
    """Validate the pair ``(X, y)`` without forcing a dtype on ``X``.

    The target goes through ``check_target_type`` (called with
    ``indicate_one_vs_all=True``) and then both arrays go through
    ``check_X_y``, which is asked to accept CSR and CSC sparse input.
    ``dtype=None`` is forwarded to ``check_X_y``.
    NOTE(review): presumably this keeps object arrays holding strings
    as they are; confirm against the scikit-learn documentation.

    Returns
    -------
    X, y
        The two values returned by ``check_X_y``.
    binarize_y
        The second value returned by ``check_target_type``.
    """
    checked_y, binarize_flag = check_target_type(
        y, indicate_one_vs_all=True)
    checked_X, checked_y = check_X_y(
        X, checked_y, accept_sparse=['csr', 'csc'], dtype=None)
    return checked_X, checked_y, binarize_flag

def _sample(self, X, y):
"""Resample the dataset.

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,6 @@ def test_rus_fit_sample_half():
[0.15490546, 0.3130677], [0.20792588, 1.49407907],
[0.15490546, 0.3130677], [0.12372842, 0.6536186]])
y_gt = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1])
print(X_resampled)
assert_array_equal(X_resampled, X_gt)
assert_array_equal(y_resampled, y_gt)

Expand All @@ -78,3 +77,15 @@ def test_multiclass_fit_sample():
assert count_y_res[0] == 2
assert count_y_res[1] == 2
assert count_y_res[2] == 2


def test_random_under_sampling_heterogeneous_data():
    """Check that RandomUnderSampler handles an object array mixing strings
    and numbers and returns an object array."""
    # Use the builtin ``object`` rather than the ``np.object`` alias, which
    # was deprecated in NumPy 1.20 and removed in NumPy 1.24.
    X_hetero = np.array([['xxx', 1, 1.0], ['yyy', 2, 2.0], ['zzz', 3, 3.0]],
                        dtype=object)
    y = np.array([0, 0, 1])
    rus = RandomUnderSampler(random_state=RND_SEED)
    X_res, y_res = rus.fit_sample(X_hetero, y)

    # one sample per class is expected after under-sampling
    assert X_res.shape[0] == 2
    assert y_res.shape[0] == 2
    assert X_res.dtype == object
34 changes: 33 additions & 1 deletion imblearn/utils/estimator_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,15 @@
import numpy as np
from scipy import sparse

from sklearn.base import clone
from sklearn.datasets import make_classification
from sklearn.cluster import KMeans
from sklearn.preprocessing import label_binarize
from sklearn.utils.estimator_checks import check_estimator \
as sklearn_check_estimator, check_parameters_default_constructible
from sklearn.exceptions import NotFittedError
from sklearn.utils.testing import assert_allclose
from sklearn.utils.testing import assert_raises_regex
from sklearn.utils.testing import set_random_state
from sklearn.utils.multiclass import type_of_target

Expand All @@ -35,6 +37,32 @@
from imblearn.utils.testing import warns

DONT_SUPPORT_RATIO = ['SVMSMOTE', 'BorderlineSMOTE']
SUPPORT_STRING = ['RandomUnderSampler', 'RandomOverSampler']


def monkey_patch_check_dtype_object(name, estimator_orig):
    """Check how an estimator deals with object-dtype input.

    Estimators whose name is listed in ``SUPPORT_STRING`` are only
    required to fit on the object array; all others must raise a
    ``TypeError`` when the array holds a non-numeric, non-string value.

    Parameters
    ----------
    name : str
        Name of the estimator, looked up in ``SUPPORT_STRING``.
    estimator_orig : estimator instance
        Estimator to check; it is cloned before being used.
    """
    # check that estimators treat dtype object as numeric if possible
    rng = np.random.RandomState(0)
    X = rng.rand(40, 10).astype(object)
    # builtin ``int`` replaces the ``np.int`` alias (deprecated in
    # NumPy 1.20, removed in NumPy 1.24); both designate the same type
    y = np.array([0] * 10 + [1] * 30, dtype=int)
    estimator = clone(estimator_orig)

    estimator.fit(X, y)
    if hasattr(estimator, "sample"):
        estimator.sample(X, y)

    # an object-dtype target may be rejected, but only with the
    # "Unknown label type" message; anything else is re-raised
    try:
        estimator.fit(X, y.astype(object))
    except Exception as e:
        if "Unknown label type" not in str(e):
            raise

    if name not in SUPPORT_STRING:
        # a dict is neither a string nor a number: fitting has to fail
        X[0, 0] = {'foo': 'bar'}
        msg = "argument must be a string or a number"
        assert_raises_regex(TypeError, msg, estimator.fit, X, y)
    else:
        estimator.fit(X, y)


def _yield_sampler_checks(name, Estimator):
Expand Down Expand Up @@ -73,7 +101,11 @@ def check_estimator(Estimator):
Class to check. Estimator is a class object (not an instance).
"""
name = Estimator.__name__
# test scikit-learn compatibility
# monkey patch check_dtype_object for the sampler allowing strings
import sklearn.utils.estimator_checks
sklearn.utils.estimator_checks.check_dtype_object = \
monkey_patch_check_dtype_object
# scikit-learn common tests
sklearn_check_estimator(Estimator)
check_parameters_default_constructible(name, Estimator)
for check in _yield_all_checks(name, Estimator):
Expand Down
14 changes: 14 additions & 0 deletions imblearn/utils/tests/test_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -375,6 +375,20 @@ def test_hash_X_y():
assert hash_X_y(X, y) == (joblib.hash(X), joblib.hash(y))


def test_hash_X_y_pandas():
    """Check ``hash_X_y`` on a pandas DataFrame / Series pair."""
    pd = pytest.importorskip("pandas")
    rng = check_random_state(0)

    # large input: the expected hashes are those of every 200th row
    # (and every 2nd column of the frame)
    big_frame = pd.DataFrame(rng.randn(2000, 20))
    big_target = pd.Series([0] * 500 + [1] * 1500)
    observed = hash_X_y(big_frame, big_target, 10, 10)
    expected = (joblib.hash(big_frame.iloc[::200, ::2]),
                joblib.hash(big_target.iloc[::200]))
    assert observed == expected

    small_frame = pd.DataFrame(rng.randn(5, 2))
    small_target = pd.Series([0] * 2 + [1] * 3)
    # all data will be used in this case
    observed = hash_X_y(small_frame, small_target)
    expected = (joblib.hash(small_frame), joblib.hash(small_target))
    assert observed == expected


@pytest.mark.parametrize(
"sampling_strategy, sampling_type, expected_result",
[({3: 25, 1: 25, 2: 25}, 'under-sampling',
Expand Down
Loading