From ce240b6b80e90f81e49c5bd99b9443ef40eb07df Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 23 Aug 2018 11:11:34 +0200 Subject: [PATCH] BUG: Preserve dtype of X and y when generating samples --- doc/whats_new/v0.0.4.rst | 4 ++++ imblearn/over_sampling/adasyn.py | 10 ++++---- imblearn/over_sampling/smote.py | 23 +++++++++++++------ .../prototype_generation/cluster_centroids.py | 6 ++--- imblearn/utils/estimator_checks.py | 18 +++++++++++++++ 5 files changed, 47 insertions(+), 14 deletions(-) diff --git a/doc/whats_new/v0.0.4.rst b/doc/whats_new/v0.0.4.rst index 4e9858f48..6ee5af80f 100644 --- a/doc/whats_new/v0.0.4.rst +++ b/doc/whats_new/v0.0.4.rst @@ -63,6 +63,10 @@ Bug fixes - Force to clone scikit-learn estimator passed as attributes to samplers. :issue:`446` by :user:`Guillaume Lemaitre `. +- Fix bug which was not preserving the dtype of X and y when generating + samples. + :issue:`448` by :user:`Guillaume Lemaitre `. + Maintenance ........... diff --git a/imblearn/over_sampling/adasyn.py b/imblearn/over_sampling/adasyn.py index f22351240..dab9ed40f 100644 --- a/imblearn/over_sampling/adasyn.py +++ b/imblearn/over_sampling/adasyn.py @@ -185,8 +185,9 @@ def _sample(self, X, y): n_samples_generated += 1 X_new = (sparse.csr_matrix( (samples, (row_indices, col_indices)), - [np.sum(n_samples_generate), X.shape[1]])) - y_new = np.array([class_sample] * np.sum(n_samples_generate)) + [np.sum(n_samples_generate), X.shape[1]], dtype=X.dtype)) + y_new = np.array([class_sample] * np.sum(n_samples_generate), + dtype=y.dtype) else: x_class_gen = [] for x_i, x_i_nn, num_sample_i in zip(X_class, nn_index, @@ -201,8 +202,9 @@ def _sample(self, X, y): for step, nn_z in zip(steps, nn_zs) ]) - X_new = np.concatenate(x_class_gen) - y_new = np.array([class_sample] * np.sum(n_samples_generate)) + X_new = np.concatenate(x_class_gen).astype(X.dtype) + y_new = np.array([class_sample] * np.sum(n_samples_generate), + dtype=y.dtype) if sparse.issparse(X_new): X_resampled = 
sparse.vstack([X_resampled, X_new]) diff --git a/imblearn/over_sampling/smote.py b/imblearn/over_sampling/smote.py index a9418c0b9..189fc56bd 100644 --- a/imblearn/over_sampling/smote.py +++ b/imblearn/over_sampling/smote.py @@ -52,6 +52,7 @@ def _validate_estimator(self): def _make_samples(self, X, + y_dtype, y_type, nn_data, nn_num, @@ -65,6 +66,9 @@ def _make_samples(self, X : {array-like, sparse matrix}, shape (n_samples, n_features) Points from which the points will be created. + y_dtype : dtype + The data type of the targets. + y_type : str or int The minority target value, just so the function can return the target values for the synthetic variables with correct length in @@ -108,15 +112,16 @@ def _make_samples(self, col_indices += sample.indices.tolist() samples += sample.data.tolist() else: - X_new = np.zeros((n_samples, X.shape[1])) + X_new = np.zeros((n_samples, X.shape[1]), dtype=X.dtype) for i, (row, col, step) in enumerate(zip(rows, cols, steps)): X_new[i] = X[row] - step * (X[row] - nn_data[nn_num[row, col]]) - y_new = np.array([y_type] * len(samples_indices)) + y_new = np.array([y_type] * len(samples_indices), dtype=y_dtype) if sparse.issparse(X): return (sparse.csr_matrix((samples, (row_indices, col_indices)), - [len(samples_indices), X.shape[1]]), + [len(samples_indices), X.shape[1]], + dtype=X.dtype), y_new) else: return X_new, y_new @@ -301,8 +306,8 @@ def _sample(self, X, y): if self.kind == 'borderline-1': # Create synthetic samples for borderline points. 
X_new, y_new = self._make_samples( - safe_indexing(X_class, danger_index), class_sample, - X_class, nns, n_samples) + safe_indexing(X_class, danger_index), y.dtype, + class_sample, X_class, nns, n_samples) if sparse.issparse(X_new): X_resampled = sparse.vstack([X_resampled, X_new]) else: @@ -316,6 +321,7 @@ def _sample(self, X, y): # only minority X_new_1, y_new_1 = self._make_samples( safe_indexing(X_class, danger_index), + y.dtype, class_sample, X_class, nns, @@ -327,6 +333,7 @@ def _sample(self, X, y): # class but all over classes. X_new_2, y_new_2 = self._make_samples( safe_indexing(X_class, danger_index), + y.dtype, class_sample, safe_indexing(X, np.flatnonzero(y != class_sample)), nns, @@ -490,6 +497,7 @@ def _sample(self, X, y): X_new_1, y_new_1 = self._make_samples( safe_indexing(support_vector, np.flatnonzero(danger_bool)), + y.dtype, class_sample, X_class, nns, @@ -503,6 +511,7 @@ def _sample(self, X, y): X_new_2, y_new_2 = self._make_samples( safe_indexing(support_vector, np.flatnonzero(safety_bool)), + y.dtype, class_sample, X_class, nns, @@ -738,8 +747,8 @@ def _sample(self, X, y): self.nn_k_.fit(X_class) nns = self.nn_k_.kneighbors(X_class, return_distance=False)[:, 1:] - X_new, y_new = self._make_samples(X_class, class_sample, X_class, - nns, n_samples, 1.0) + X_new, y_new = self._make_samples(X_class, y.dtype, class_sample, + X_class, nns, n_samples, 1.0) if sparse.issparse(X_new): X_resampled = sparse.vstack([X_resampled, X_new]) diff --git a/imblearn/under_sampling/prototype_generation/cluster_centroids.py b/imblearn/under_sampling/prototype_generation/cluster_centroids.py index 888a39561..ed84cab79 100644 --- a/imblearn/under_sampling/prototype_generation/cluster_centroids.py +++ b/imblearn/under_sampling/prototype_generation/cluster_centroids.py @@ -128,10 +128,10 @@ def _generate_sample(self, X, y, centroids, target_class): X_new = safe_indexing(X, np.squeeze(indices)) else: if sparse.issparse(X): - X_new = sparse.csr_matrix(centroids) + X_new 
= sparse.csr_matrix(centroids, dtype=X.dtype) else: X_new = centroids - y_new = np.array([target_class] * centroids.shape[0]) + y_new = np.array([target_class] * centroids.shape[0], dtype=y.dtype) return X_new, y_new @@ -191,4 +191,4 @@ def _sample(self, X, y): X_resampled = np.vstack(X_resampled) y_resampled = np.hstack(y_resampled) - return X_resampled, np.array(y_resampled) + return X_resampled, np.array(y_resampled, dtype=y.dtype) diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index 3bb52d46d..8f1a5bfd9 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -49,6 +49,7 @@ def _yield_sampler_checks(name, Estimator): yield check_samplers_sparse yield check_samplers_pandas yield check_samplers_multiclass_ova + yield check_samplers_preserve_dtype def _yield_all_checks(name, estimator): @@ -333,3 +334,20 @@ def check_samplers_multiclass_ova(name, Sampler): else: assert type_of_target(y_res_ova) == type_of_target(y_ova) assert_allclose(y_res, y_res_ova.argmax(axis=1)) + + +def check_samplers_preserve_dtype(name, Sampler): + X, y = make_classification( + n_samples=1000, + n_classes=3, + n_informative=4, + weights=[0.2, 0.3, 0.5], + random_state=0) + # Cast X and y to not default dtype + X = X.astype(np.float32) + y = y.astype(np.int32) + sampler = Sampler() + set_random_state(sampler) + X_res, y_res = sampler.fit_sample(X, y) + assert X.dtype == X_res.dtype + assert y.dtype == y_res.dtype