Skip to content

BUG: Preserve dtype of X and y when generating samples #450

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Aug 23, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions doc/whats_new/v0.0.4.rst
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,10 @@ Bug fixes
- Force to clone scikit-learn estimator passed as attributes to samplers.
:issue:`446` by :user:`Guillaume Lemaitre <glemaitre>`.

- Fix bug which was not preserving the dtype of X and y when generating
  samples.
  :issue:`448` by :user:`Guillaume Lemaitre <glemaitre>`.

Maintenance
...........

Expand Down
10 changes: 6 additions & 4 deletions imblearn/over_sampling/adasyn.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,8 +185,9 @@ def _sample(self, X, y):
n_samples_generated += 1
X_new = (sparse.csr_matrix(
(samples, (row_indices, col_indices)),
[np.sum(n_samples_generate), X.shape[1]]))
y_new = np.array([class_sample] * np.sum(n_samples_generate))
[np.sum(n_samples_generate), X.shape[1]], dtype=X.dtype))
y_new = np.array([class_sample] * np.sum(n_samples_generate),
dtype=y.dtype)
else:
x_class_gen = []
for x_i, x_i_nn, num_sample_i in zip(X_class, nn_index,
Expand All @@ -201,8 +202,9 @@ def _sample(self, X, y):
for step, nn_z in zip(steps, nn_zs)
])

X_new = np.concatenate(x_class_gen)
y_new = np.array([class_sample] * np.sum(n_samples_generate))
X_new = np.concatenate(x_class_gen).astype(X.dtype)
y_new = np.array([class_sample] * np.sum(n_samples_generate),
dtype=y.dtype)

if sparse.issparse(X_new):
X_resampled = sparse.vstack([X_resampled, X_new])
Expand Down
23 changes: 16 additions & 7 deletions imblearn/over_sampling/smote.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ def _validate_estimator(self):

def _make_samples(self,
X,
y_dtype,
y_type,
nn_data,
nn_num,
Expand All @@ -65,6 +66,9 @@ def _make_samples(self,
X : {array-like, sparse matrix}, shape (n_samples, n_features)
Points from which the points will be created.

y_dtype : dtype
The data type of the targets.

y_type : str or int
The minority target value, just so the function can return the
target values for the synthetic variables with correct length in
Expand Down Expand Up @@ -108,15 +112,16 @@ def _make_samples(self,
col_indices += sample.indices.tolist()
samples += sample.data.tolist()
else:
X_new = np.zeros((n_samples, X.shape[1]))
X_new = np.zeros((n_samples, X.shape[1]), dtype=X.dtype)
for i, (row, col, step) in enumerate(zip(rows, cols, steps)):
X_new[i] = X[row] - step * (X[row] - nn_data[nn_num[row, col]])

y_new = np.array([y_type] * len(samples_indices))
y_new = np.array([y_type] * len(samples_indices), dtype=y_dtype)

if sparse.issparse(X):
return (sparse.csr_matrix((samples, (row_indices, col_indices)),
[len(samples_indices), X.shape[1]]),
[len(samples_indices), X.shape[1]],
dtype=X.dtype),
y_new)
else:
return X_new, y_new
Expand Down Expand Up @@ -301,8 +306,8 @@ def _sample(self, X, y):
if self.kind == 'borderline-1':
# Create synthetic samples for borderline points.
X_new, y_new = self._make_samples(
safe_indexing(X_class, danger_index), class_sample,
X_class, nns, n_samples)
safe_indexing(X_class, danger_index), y.dtype,
class_sample, X_class, nns, n_samples)
if sparse.issparse(X_new):
X_resampled = sparse.vstack([X_resampled, X_new])
else:
Expand All @@ -316,6 +321,7 @@ def _sample(self, X, y):
# only minority
X_new_1, y_new_1 = self._make_samples(
safe_indexing(X_class, danger_index),
y.dtype,
class_sample,
X_class,
nns,
Expand All @@ -327,6 +333,7 @@ def _sample(self, X, y):
# class but all over classes.
X_new_2, y_new_2 = self._make_samples(
safe_indexing(X_class, danger_index),
y.dtype,
class_sample,
safe_indexing(X, np.flatnonzero(y != class_sample)),
nns,
Expand Down Expand Up @@ -490,6 +497,7 @@ def _sample(self, X, y):

X_new_1, y_new_1 = self._make_samples(
safe_indexing(support_vector, np.flatnonzero(danger_bool)),
y.dtype,
class_sample,
X_class,
nns,
Expand All @@ -503,6 +511,7 @@ def _sample(self, X, y):

X_new_2, y_new_2 = self._make_samples(
safe_indexing(support_vector, np.flatnonzero(safety_bool)),
y.dtype,
class_sample,
X_class,
nns,
Expand Down Expand Up @@ -738,8 +747,8 @@ def _sample(self, X, y):

self.nn_k_.fit(X_class)
nns = self.nn_k_.kneighbors(X_class, return_distance=False)[:, 1:]
X_new, y_new = self._make_samples(X_class, class_sample, X_class,
nns, n_samples, 1.0)
X_new, y_new = self._make_samples(X_class, y.dtype, class_sample,
X_class, nns, n_samples, 1.0)

if sparse.issparse(X_new):
X_resampled = sparse.vstack([X_resampled, X_new])
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -128,10 +128,10 @@ def _generate_sample(self, X, y, centroids, target_class):
X_new = safe_indexing(X, np.squeeze(indices))
else:
if sparse.issparse(X):
X_new = sparse.csr_matrix(centroids)
X_new = sparse.csr_matrix(centroids, dtype=X.dtype)
else:
X_new = centroids
y_new = np.array([target_class] * centroids.shape[0])
y_new = np.array([target_class] * centroids.shape[0], dtype=y.dtype)

return X_new, y_new

Expand Down Expand Up @@ -191,4 +191,4 @@ def _sample(self, X, y):
X_resampled = np.vstack(X_resampled)
y_resampled = np.hstack(y_resampled)

return X_resampled, np.array(y_resampled)
return X_resampled, np.array(y_resampled, dtype=y.dtype)
18 changes: 18 additions & 0 deletions imblearn/utils/estimator_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ def _yield_sampler_checks(name, Estimator):
yield check_samplers_sparse
yield check_samplers_pandas
yield check_samplers_multiclass_ova
yield check_samplers_preserve_dtype


def _yield_all_checks(name, estimator):
Expand Down Expand Up @@ -333,3 +334,20 @@ def check_samplers_multiclass_ova(name, Sampler):
else:
assert type_of_target(y_res_ova) == type_of_target(y_ova)
assert_allclose(y_res, y_res_ova.argmax(axis=1))


def check_samplers_preserve_dtype(name, Sampler):
    """Check that resampling keeps the dtypes of both X and y intact."""
    X, y = make_classification(
        n_samples=1000,
        n_classes=3,
        n_informative=4,
        weights=[0.2, 0.3, 0.5],
        random_state=0)
    # Cast X and y away from the default dtypes (float64/int64) so that
    # dtype preservation is actually observable after resampling.
    X, y = X.astype(np.float32), y.astype(np.int32)
    sampler = Sampler()
    set_random_state(sampler)
    X_res, y_res = sampler.fit_sample(X, y)
    assert X_res.dtype == X.dtype
    assert y_res.dtype == y.dtype