@@ -100,39 +100,17 @@ def _make_samples(
100
100
samples_indices = random_state .randint (
101
101
low = 0 , high = len (nn_num .flatten ()), size = n_samples
102
102
)
103
- steps = step_size * random_state .uniform (size = n_samples )
103
+
104
+ # np.newaxis for backwards compatibility with random_state
105
+ steps = step_size * random_state .uniform (size = n_samples )[:, np .newaxis ]
104
106
rows = np .floor_divide (samples_indices , nn_num .shape [1 ])
105
107
cols = np .mod (samples_indices , nn_num .shape [1 ])
106
108
107
- y_new = np .array ([y_type ] * len (samples_indices ), dtype = y_dtype )
109
+ X_new = self ._generate_samples (X , nn_data , nn_num , rows , cols , steps )
110
+ y_new = np .full (n_samples , fill_value = y_type , dtype = y_dtype )
111
+ return X_new , y_new
108
112
109
- if sparse .issparse (X ):
110
- row_indices , col_indices , samples = [], [], []
111
- for i , (row , col , step ) in enumerate (zip (rows , cols , steps )):
112
- if X [row ].nnz :
113
- sample = self ._generate_sample (
114
- X , nn_data , nn_num , row , col , step
115
- )
116
- row_indices += [i ] * len (sample .indices )
117
- col_indices += sample .indices .tolist ()
118
- samples += sample .data .tolist ()
119
- return (
120
- sparse .csr_matrix (
121
- (samples , (row_indices , col_indices )),
122
- [len (samples_indices ), X .shape [1 ]],
123
- dtype = X .dtype ,
124
- ),
125
- y_new ,
126
- )
127
- else :
128
- X_new = np .zeros ((n_samples , X .shape [1 ]), dtype = X .dtype )
129
- for i , (row , col , step ) in enumerate (zip (rows , cols , steps )):
130
- X_new [i ] = self ._generate_sample (
131
- X , nn_data , nn_num , row , col , step
132
- )
133
- return X_new , y_new
134
-
135
- def _generate_sample (self , X , nn_data , nn_num , row , col , step ):
113
+ def _generate_samples (self , X , nn_data , nn_num , rows , cols , steps ):
136
114
r"""Generate a synthetic sample.
137
115
138
116
The rule for the generation is:
@@ -156,23 +134,32 @@ def _generate_sample(self, X, nn_data, nn_num, row, col, step):
156
134
nn_num : ndarray of shape (n_samples_all, k_nearest_neighbours)
157
135
The nearest neighbours of each sample in `nn_data`.
158
136
159
- row : int
160
- Index pointing at feature vector in X which will be used
161
- as a base for creating new sample .
137
+ rows : ndarray of shape (n_samples,), dtype= int
138
+ Indices pointing at feature vector in X which will be used
139
+ as a base for creating new samples .
162
140
163
- col : int
164
- Index pointing at which nearest neighbor of base feature vector
165
- will be used when creating new sample .
141
+ cols : ndarray of shape (n_samples,), dtype= int
142
+ Indices pointing at which nearest neighbor of base feature vector
143
+ will be used when creating new samples .
166
144
167
- step : float
168
- Step size for new sample .
145
+ steps : ndarray of shape (n_samples,), dtype= float
146
+ Step sizes for new samples .
169
147
170
148
Returns
171
149
-------
172
- X_new : {ndarray, sparse matrix} of shape (n_features, )
173
- Single synthetically generated sample .
150
+ X_new : {ndarray, sparse matrix} of shape (n_samples, n_features )
151
+ Synthetically generated samples .
174
152
"""
175
- return X [row ] - step * (X [row ] - nn_data [nn_num [row , col ]])
153
+ diffs = nn_data [nn_num [rows , cols ]] - X [rows ]
154
+
155
+ if sparse .issparse (X ):
156
+ sparse_func = type (X ).__name__
157
+ steps = getattr (sparse , sparse_func )(steps )
158
+ X_new = X [rows ] + steps .multiply (diffs )
159
+ else :
160
+ X_new = X [rows ] + steps * diffs
161
+
162
+ return X_new .astype (X .dtype )
176
163
177
164
def _in_danger_noise (
178
165
self , nn_estimator , samples , target_class , y , kind = "danger"
@@ -727,8 +714,8 @@ def __init__(
727
714
def _fit_resample (self , X , y ):
728
715
self ._validate_estimator ()
729
716
730
- X_resampled = X .copy ()
731
- y_resampled = y .copy ()
717
+ X_resampled = [ X .copy ()]
718
+ y_resampled = [ y .copy ()]
732
719
733
720
for class_sample , n_samples in self .sampling_strategy_ .items ():
734
721
if n_samples == 0 :
@@ -741,14 +728,15 @@ def _fit_resample(self, X, y):
741
728
X_new , y_new = self ._make_samples (
742
729
X_class , y .dtype , class_sample , X_class , nns , n_samples , 1.0
743
730
)
731
+ X_resampled .append (X_new )
732
+ y_resampled .append (y_new )
733
+
734
+ if sparse .issparse (X_new ):
735
+ X_resampled = sparse .vstack (X_resampled , format = X .format )
736
+ else :
737
+ X_resampled = np .vstack (X_resampled )
738
+ y_resampled = np .hstack (y_resampled )
744
739
745
- if sparse .issparse (X_new ):
746
- X_resampled = sparse .vstack ([X_resampled , X_new ])
747
- sparse_func = "tocsc" if X .format == "csc" else "tocsr"
748
- X_resampled = getattr (X_resampled , sparse_func )()
749
- else :
750
- X_resampled = np .vstack ((X_resampled , X_new ))
751
- y_resampled = np .hstack ((y_resampled , y_new ))
752
740
753
741
return X_resampled , y_resampled
754
742
@@ -1015,7 +1003,7 @@ def _fit_resample(self, X, y):
1015
1003
1016
1004
return X_resampled , y_resampled
1017
1005
1018
- def _generate_sample (self , X , nn_data , nn_num , row , col , step ):
1006
+ def _generate_samples (self , X , nn_data , nn_num , rows , cols , steps ):
1019
1007
"""Generate a synthetic sample with an additional steps for the
1020
1008
categorical features.
1021
1009
@@ -1024,35 +1012,34 @@ def _generate_sample(self, X, nn_data, nn_num, row, col, step):
1024
1012
of the majority class.
1025
1013
"""
1026
1014
rng = check_random_state (self .random_state )
1027
- sample = super ()._generate_sample (X , nn_data , nn_num , row , col , step )
1028
- # To avoid conversion and since there is only few samples used, we
1029
- # convert those samples to dense array.
1030
- sample = (
1031
- sample .toarray ().squeeze () if sparse .issparse (sample ) else sample
1032
- )
1033
- all_neighbors = nn_data [nn_num [row ]]
1034
- all_neighbors = (
1035
- all_neighbors .toarray ()
1036
- if sparse .issparse (all_neighbors )
1037
- else all_neighbors
1015
+ X_new = super ()._generate_samples (
1016
+ X , nn_data , nn_num , rows , cols , steps
1038
1017
)
1018
+ # changing the sparsity structure is more efficient with LIL than with CSR
1019
+ X_new = (X_new .tolil () if sparse .issparse (X_new ) else X_new )
1020
+
1021
+ # convert to dense array since scipy.sparse doesn't handle 3D
1022
+ nn_data = (nn_data .toarray () if sparse .issparse (nn_data ) else nn_data )
1023
+ all_neighbors = nn_data [nn_num [rows ]]
1039
1024
1040
1025
categories_size = [self .continuous_features_ .size ] + [
1041
1026
cat .size for cat in self .ohe_ .categories_
1042
1027
]
1043
1028
1044
- for start_idx , end_idx in zip (
1045
- np .cumsum (categories_size )[:- 1 ], np .cumsum (categories_size )[1 :]
1046
- ):
1047
- col_max = all_neighbors [:, start_idx :end_idx ].sum (axis = 0 )
1029
+ for start_idx , end_idx in zip (np .cumsum (categories_size )[:- 1 ],
1030
+ np .cumsum (categories_size )[1 :]):
1031
+ col_maxs = all_neighbors [:, :, start_idx :end_idx ].sum (axis = 1 )
1048
1032
# tie breaking argmax
1049
- col_sel = rng .choice (
1050
- np .flatnonzero (np .isclose (col_max , col_max .max ()))
1051
- )
1052
- sample [start_idx :end_idx ] = 0
1053
- sample [start_idx + col_sel ] = 1
1033
+ is_max = np .isclose (col_maxs , col_maxs .max (axis = 1 , keepdims = True ))
1034
+ max_idxs = rng .permutation (np .argwhere (is_max ))
1035
+ xs , idx_sels = np .unique (max_idxs [:, 0 ], return_index = True )
1036
+ col_sels = max_idxs [idx_sels , 1 ]
1037
+
1038
+ ys = start_idx + col_sels
1039
+ X_new [:, start_idx :end_idx ] = 0
1040
+ X_new [xs , ys ] = 1
1054
1041
1055
- return sparse . csr_matrix ( sample ) if sparse . issparse ( X ) else sample
1042
+ return X_new
1056
1043
1057
1044
1058
1045
@Substitution (
0 commit comments