From be52de961c3d7d9204e5e9f0e733d003f050e64f Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 16 Nov 2019 23:31:32 +0100 Subject: [PATCH 1/3] ENH accept non finite values in random samplers --- doc/whats_new/v0.6.rst | 5 +++++ .../over_sampling/_random_over_sampler.py | 3 ++- .../tests/test_random_over_sampler.py | 20 +++++++++++++++++++ .../_random_under_sampler.py | 3 ++- .../tests/test_random_under_sampler.py | 20 +++++++++++++++++++ 5 files changed, 49 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v0.6.rst b/doc/whats_new/v0.6.rst index 17461c58d..efb0880a5 100644 --- a/doc/whats_new/v0.6.rst +++ b/doc/whats_new/v0.6.rst @@ -51,6 +51,11 @@ Enhancement to check or not the input ``X`` and ``y``. :pr:`637` by :user:`Guillaume Lemaitre `. +- :class:`imblearn.under_sampling.RandomUnderSampler`, + :class:`imblearn.over_sampling.RandomOverSampler` can resample when non + finite values are present in ``X``. + :pr:`xxx` by `Guillaume Lemaitre `. + Deprecation ........... 
diff --git a/imblearn/over_sampling/_random_over_sampler.py b/imblearn/over_sampling/_random_over_sampler.py index 7988b73fd..e36d20974 100644 --- a/imblearn/over_sampling/_random_over_sampler.py +++ b/imblearn/over_sampling/_random_over_sampler.py @@ -79,7 +79,8 @@ def _check_X_y(X, y): y, binarize_y = check_target_type(y, indicate_one_vs_all=True) if not hasattr(X, "loc"): # Do not convert dataframe - X = check_array(X, accept_sparse=["csr", "csc"], dtype=None) + X = check_array(X, accept_sparse=["csr", "csc"], dtype=None, + force_all_finite=False) y = check_array( y, accept_sparse=["csr", "csc"], dtype=None, ensure_2d=False ) diff --git a/imblearn/over_sampling/tests/test_random_over_sampler.py b/imblearn/over_sampling/tests/test_random_over_sampler.py index e93a9f205..ca58a8012 100644 --- a/imblearn/over_sampling/tests/test_random_over_sampler.py +++ b/imblearn/over_sampling/tests/test_random_over_sampler.py @@ -125,3 +125,23 @@ def test_random_over_sampling_heterogeneous_data(): assert y_res.shape[0] == 4 assert X_res.dtype == object assert X_res[-1, 0] in X_hetero[:, 0] + + +def test_random_over_sampling_nan_inf(): + # check that we can oversample even with missing or infinite data + # regression tests for #605 + rng = np.random.RandomState(42) + n_not_finite = X.shape[0] // 3 + row_indices = rng.choice(np.arange(X.shape[0]), size=n_not_finite) + col_indices = rng.randint(0, X.shape[1], size=n_not_finite) + not_finite_values = rng.choice([np.nan, np.inf], size=n_not_finite) + + X_ = X.copy() + X_[row_indices, col_indices] = not_finite_values + + ros = RandomOverSampler(random_state=0) + X_res, y_res = ros.fit_resample(X_, Y) + + assert y_res.shape == (14,) + assert X_res.shape == (14, 2) + assert np.any(~np.isfinite(X_res)) diff --git a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py index 0a5b582be..ba8ac215b 100644 --- 
a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py +++ b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py @@ -85,7 +85,8 @@ def _check_X_y(X, y): y, binarize_y = check_target_type(y, indicate_one_vs_all=True) if not hasattr(X, "loc"): # Do not convert dataframe - X = check_array(X, accept_sparse=["csr", "csc"], dtype=None) + X = check_array(X, accept_sparse=["csr", "csc"], dtype=None, + force_all_finite=False) y = check_array( y, accept_sparse=["csr", "csc"], dtype=None, ensure_2d=False ) diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_random_under_sampler.py b/imblearn/under_sampling/_prototype_selection/tests/test_random_under_sampler.py index 7e1998d20..945d31fec 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_random_under_sampler.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_random_under_sampler.py @@ -110,3 +110,23 @@ def test_random_under_sampling_heterogeneous_data(): assert X_res.shape[0] == 2 assert y_res.shape[0] == 2 assert X_res.dtype == object + + +def test_random_under_sampling_nan_inf(): + # check that we can undersample even with missing or infinite data + # regression tests for #605 + rng = np.random.RandomState(42) + n_not_finite = X.shape[0] // 3 + row_indices = rng.choice(np.arange(X.shape[0]), size=n_not_finite) + col_indices = rng.randint(0, X.shape[1], size=n_not_finite) + not_finite_values = rng.choice([np.nan, np.inf], size=n_not_finite) + + X_ = X.copy() + X_[row_indices, col_indices] = not_finite_values + + rus = RandomUnderSampler(random_state=0) + X_res, y_res = rus.fit_resample(X_, Y) + + assert y_res.shape == (6,) + assert X_res.shape == (6, 2) + assert np.any(~np.isfinite(X_res)) From 9678c2cd15f61b760f656ea55acb7d774fe9c1a9 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 16 Nov 2019 23:33:10 +0100 Subject: [PATCH 2/3] update pr number --- doc/whats_new/v0.6.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/doc/whats_new/v0.6.rst b/doc/whats_new/v0.6.rst index efb0880a5..e5aedaf72 100644 --- a/doc/whats_new/v0.6.rst +++ b/doc/whats_new/v0.6.rst @@ -54,7 +54,7 @@ Enhancement - :class:`imblearn.under_sampling.RandomUnderSampler`, :class:`imblearn.over_sampling.RandomOverSampler` can resample when non finite values are present in ``X``. - :pr:`xxx` by `Guillaume Lemaitre `. + :pr:`643` by :user:`Guillaume Lemaitre <glemaitre>`. Deprecation ........... From 4e110ef71c7aaddce54fbe432a782288e8993df7 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 16 Nov 2019 23:46:42 +0100 Subject: [PATCH 3/3] add allow_nan as an estimator tag --- imblearn/over_sampling/_random_over_sampler.py | 6 +++++- .../_prototype_selection/_random_under_sampler.py | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/imblearn/over_sampling/_random_over_sampler.py b/imblearn/over_sampling/_random_over_sampler.py index e36d20974..953f16641 100644 --- a/imblearn/over_sampling/_random_over_sampler.py +++ b/imblearn/over_sampling/_random_over_sampler.py @@ -109,4 +109,8 @@ def _fit_resample(self, X, y): ) def _more_tags(self): - return {"X_types": ["2darray", "string"], "sample_indices": True} + return { + "X_types": ["2darray", "string"], + "sample_indices": True, + "allow_nan": True, + } diff --git a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py index ba8ac215b..02f014f58 100644 --- a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py +++ b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py @@ -122,4 +122,8 @@ def _fit_resample(self, X, y): return _safe_indexing(X, idx_under), _safe_indexing(y, idx_under) def _more_tags(self): - return {"X_types": ["2darray", "string"], "sample_indices": True} + return { + "X_types": ["2darray", "string"], + "sample_indices": True, + "allow_nan": True, + }