BUG: Preserve dtype of X and y when generating samples (#450)

glemaitre · web-flow · commit 41cd9a6b42bd · 2018-08-23T11:25:24.000+02:00
diff --git a/doc/whats_new/v0.0.4.rst b/doc/whats_new/v0.0.4.rst
@@ -63,6 +63,10 @@ Bug fixes
 - Force to clone scikit-learn estimator passed as attributes to samplers.
   :issue:`446` by :user:`Guillaume Lemaitre <glemaitre>`.
 
+- Fix bug which was not preserving the dtype of X and y when generating
+  samples.
+  :issue:`448` by :user:`Guillaume Lemaitre <glemaitre>`.
+
 Maintenance
 ...........
 
diff --git a/imblearn/over_sampling/adasyn.py b/imblearn/over_sampling/adasyn.py
@@ -185,8 +185,9 @@ def _sample(self, X, y):
                             n_samples_generated += 1
                 X_new = (sparse.csr_matrix(
                     (samples, (row_indices, col_indices)),
-                    [np.sum(n_samples_generate), X.shape[1]]))
-                y_new = np.array([class_sample] * np.sum(n_samples_generate))
+                    [np.sum(n_samples_generate), X.shape[1]], dtype=X.dtype))
+                y_new = np.array([class_sample] * np.sum(n_samples_generate),
+                                 dtype=y.dtype)
             else:
                 x_class_gen = []
                 for x_i, x_i_nn, num_sample_i in zip(X_class, nn_index,
@@ -201,8 +202,9 @@ def _sample(self, X, y):
                         for step, nn_z in zip(steps, nn_zs)
                     ])
 
-                X_new = np.concatenate(x_class_gen)
-                y_new = np.array([class_sample] * np.sum(n_samples_generate))
+                X_new = np.concatenate(x_class_gen).astype(X.dtype)
+                y_new = np.array([class_sample] * np.sum(n_samples_generate),
+                                 dtype=y.dtype)
 
             if sparse.issparse(X_new):
                 X_resampled = sparse.vstack([X_resampled, X_new])
diff --git a/imblearn/over_sampling/smote.py b/imblearn/over_sampling/smote.py
@@ -52,6 +52,7 @@ def _validate_estimator(self):
 
     def _make_samples(self,
                       X,
+                      y_dtype,
                       y_type,
                       nn_data,
                       nn_num,
@@ -65,6 +66,9 @@ def _make_samples(self,
         X : {array-like, sparse matrix}, shape (n_samples, n_features)
             Points from which the points will be created.
 
+        y_dtype : dtype
+            The data type of the targets.
+
         y_type : str or int
             The minority target value, just so the function can return the
             target values for the synthetic variables with correct length in
@@ -108,15 +112,16 @@ def _make_samples(self,
                     col_indices += sample.indices.tolist()
                     samples += sample.data.tolist()
         else:
-            X_new = np.zeros((n_samples, X.shape[1]))
+            X_new = np.zeros((n_samples, X.shape[1]), dtype=X.dtype)
             for i, (row, col, step) in enumerate(zip(rows, cols, steps)):
                 X_new[i] = X[row] - step * (X[row] - nn_data[nn_num[row, col]])
 
-        y_new = np.array([y_type] * len(samples_indices))
+        y_new = np.array([y_type] * len(samples_indices), dtype=y_dtype)
 
         if sparse.issparse(X):
             return (sparse.csr_matrix((samples, (row_indices, col_indices)),
-                                      [len(samples_indices), X.shape[1]]),
+                                      [len(samples_indices), X.shape[1]],
+                                      dtype=X.dtype),
                     y_new)
         else:
             return X_new, y_new
@@ -301,8 +306,8 @@ def _sample(self, X, y):
             if self.kind == 'borderline-1':
                 # Create synthetic samples for borderline points.
                 X_new, y_new = self._make_samples(
-                    safe_indexing(X_class, danger_index), class_sample,
-                    X_class, nns, n_samples)
+                    safe_indexing(X_class, danger_index), y.dtype,
+                    class_sample, X_class, nns, n_samples)
                 if sparse.issparse(X_new):
                     X_resampled = sparse.vstack([X_resampled, X_new])
                 else:
@@ -316,6 +321,7 @@ def _sample(self, X, y):
                 # only minority
                 X_new_1, y_new_1 = self._make_samples(
                     safe_indexing(X_class, danger_index),
+                    y.dtype,
                     class_sample,
                     X_class,
                     nns,
@@ -327,6 +333,7 @@ def _sample(self, X, y):
                 # class but all over classes.
                 X_new_2, y_new_2 = self._make_samples(
                     safe_indexing(X_class, danger_index),
+                    y.dtype,
                     class_sample,
                     safe_indexing(X, np.flatnonzero(y != class_sample)),
                     nns,
@@ -490,6 +497,7 @@ def _sample(self, X, y):
 
                 X_new_1, y_new_1 = self._make_samples(
                     safe_indexing(support_vector, np.flatnonzero(danger_bool)),
+                    y.dtype,
                     class_sample,
                     X_class,
                     nns,
@@ -503,6 +511,7 @@ def _sample(self, X, y):
 
                 X_new_2, y_new_2 = self._make_samples(
                     safe_indexing(support_vector, np.flatnonzero(safety_bool)),
+                    y.dtype,
                     class_sample,
                     X_class,
                     nns,
@@ -738,8 +747,8 @@ def _sample(self, X, y):
 
             self.nn_k_.fit(X_class)
             nns = self.nn_k_.kneighbors(X_class, return_distance=False)[:, 1:]
-            X_new, y_new = self._make_samples(X_class, class_sample, X_class,
-                                              nns, n_samples, 1.0)
+            X_new, y_new = self._make_samples(X_class, y.dtype, class_sample,
+                                              X_class, nns, n_samples, 1.0)
 
             if sparse.issparse(X_new):
                 X_resampled = sparse.vstack([X_resampled, X_new])
diff --git a/imblearn/under_sampling/prototype_generation/cluster_centroids.py b/imblearn/under_sampling/prototype_generation/cluster_centroids.py
@@ -128,10 +128,10 @@ def _generate_sample(self, X, y, centroids, target_class):
             X_new = safe_indexing(X, np.squeeze(indices))
         else:
             if sparse.issparse(X):
-                X_new = sparse.csr_matrix(centroids)
+                X_new = sparse.csr_matrix(centroids, dtype=X.dtype)
             else:
                 X_new = centroids
-        y_new = np.array([target_class] * centroids.shape[0])
+        y_new = np.array([target_class] * centroids.shape[0], dtype=y.dtype)
 
         return X_new, y_new
 
@@ -191,4 +191,4 @@ def _sample(self, X, y):
             X_resampled = np.vstack(X_resampled)
         y_resampled = np.hstack(y_resampled)
 
-        return X_resampled, np.array(y_resampled)
+        return X_resampled, np.array(y_resampled, dtype=y.dtype)
diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py
@@ -49,6 +49,7 @@ def _yield_sampler_checks(name, Estimator):
     yield check_samplers_sparse
     yield check_samplers_pandas
     yield check_samplers_multiclass_ova
+    yield check_samplers_preserve_dtype
 
 
 def _yield_all_checks(name, estimator):
@@ -333,3 +334,20 @@ def check_samplers_multiclass_ova(name, Sampler):
     else:
         assert type_of_target(y_res_ova) == type_of_target(y_ova)
         assert_allclose(y_res, y_res_ova.argmax(axis=1))
+
+
+def check_samplers_preserve_dtype(name, Sampler):
+    X, y = make_classification(
+        n_samples=1000,
+        n_classes=3,
+        n_informative=4,
+        weights=[0.2, 0.3, 0.5],
+        random_state=0)
+    # Cast X and y to non-default dtypes
+    X = X.astype(np.float32)
+    y = y.astype(np.int32)
+    sampler = Sampler()
+    set_random_state(sampler)
+    X_res, y_res = sampler.fit_sample(X, y)
+    assert X.dtype == X_res.dtype
+    assert y.dtype == y_res.dtype