Closed
Description
When running the method `fit()` on the following `Pipeline`, the code fails with a `TypeError`. If I remove the `RandomOverSampler` from the `Pipeline`, then I face no error. Is it a bug or a wrong initialization?
Steps/Code to Reproduce
from imblearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils import shuffle
import numpy as np
import pandas as pd
# Reproduction script: builds an imblearn Pipeline containing a
# RandomOverSampler and fits it on a small MNIST subset.
#
# NOTE(review): `train_test_split` is called below but is not among the
# imports shown with this snippet; it lives in `sklearn.model_selection`
# and must be imported for the script to run as written.

# Load the MNIST CSV files (no header row) as plain NumPy arrays.
mnist_train = pd.read_csv("https://www.python-course.eu/data/mnist/mnist_train.csv", header=None).values
mnist_test = pd.read_csv("https://www.python-course.eu/data/mnist/mnist_test.csv", header=None).values
# Stack the train and test rows into a single array.
mnist = np.concatenate((mnist_train, mnist_test), axis=0)
# Every column except the first is used as features.
xfull = mnist[:, 1:]
# The first column is used as the target. The slice `:1` (rather than the
# integer index `0`) keeps the column axis, so `yfull` is 2-D with shape
# (n_samples, 1) instead of 1-D with shape (n_samples,).
# NOTE(review): this is the likely root cause of the reported TypeError --
# iterating a 2-D array yields 1-D row arrays, which are unhashable, and the
# traceback fails in `Counter(y)`. Presumably `mnist[:, 0]` or
# `y_train.ravel()` avoids it -- confirm.
yfull = mnist[:, :1]
# Shuffle features and targets together with a fixed seed.
sdata, starget = shuffle(xfull, yfull, random_state=36)
samples = 1000
# The slice stops at `samples-1`, so 999 rows are kept, not 1000.
X = sdata[0:samples-1,:]
y = starget[0:samples-1]
# Hold out 33% of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.33, random_state=20176)
# Pipeline components, all with default parameters except the classifier's
# n_jobs.
selector = VarianceThreshold()
scaler = StandardScaler()
ros = RandomOverSampler()
pca = PCA()
clf = KNeighborsClassifier(n_jobs=-1)
# Step order: feature selection -> scaling -> over-sampling -> PCA -> kNN.
pipe = Pipeline(
steps=[
('selector', selector), ('scaler', scaler), ('sampler', ros),
('pca', pca), ('kNN', clf)])
# X_train.shape is (669, 784)
# Y_train.shape is (669, 1)
# The trailing dimension of size 1 above shows that y_train is a 2-D column.
# NOTE(review): the traceback in this report passes through
# sklearn/model_selection/_search.py, which this snippet never calls;
# presumably the original run wrapped `pipe` in a search estimator -- confirm.
pipe.fit(X_train, y_train)
Expected Results
No error is thrown.
Actual Results
TypeError Traceback (most recent call last)
<ipython-input-51-da2a32d7cb52> in <module>()
----> pipe.fit(X_train, y_train)
10 frames
/usr/local/lib/python3.6/dist-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
737 refit_start_time = time.time()
738 if y is not None:
--> 739 self.best_estimator_.fit(X, y, **fit_params)
740 else:
741 self.best_estimator_.fit(X, **fit_params)
/usr/local/lib/python3.6/dist-packages/imblearn/pipeline.py in fit(self, X, y, **fit_params)
285
286 """
--> 287 Xt, yt, fit_params = self._fit(X, y, **fit_params)
288 with _print_elapsed_time('Pipeline',
289 self._log_message(len(self.steps) - 1)):
/usr/local/lib/python3.6/dist-packages/imblearn/pipeline.py in _fit(self, X, y, **fit_params)
247 message_clsname='Pipeline',
248 message=self._log_message(step_idx),
--> 249 **fit_params_steps[name]
250 )
251 # Replace the transformer of the step with the fitted
/usr/local/lib/python3.6/dist-packages/joblib/memory.py in __call__(self, *args, **kwargs)
566
567 def __call__(self, *args, **kwargs):
--> 568 return self._cached_call(args, kwargs)[0]
569
570 def __getstate__(self):
/usr/local/lib/python3.6/dist-packages/joblib/memory.py in _cached_call(self, args, kwargs, shelving)
532
533 if must_call:
--> 534 out, metadata = self.call(*args, **kwargs)
535 if self.mmap_mode is not None:
536 # Memmap the output at the first call to be consistent with
/usr/local/lib/python3.6/dist-packages/joblib/memory.py in call(self, *args, **kwargs)
732 if self._verbose > 0:
733 print(format_call(self.func, args, kwargs))
--> 734 output = self.func(*args, **kwargs)
735 self.store_backend.dump_item(
736 [func_id, args_id], output, verbose=self._verbose)
/usr/local/lib/python3.6/dist-packages/imblearn/pipeline.py in _fit_resample_one(sampler, X, y, message_clsname, message, **fit_params)
412 **fit_params):
413 with _print_elapsed_time(message_clsname, message):
--> 414 X_res, y_res = sampler.fit_resample(X, y, **fit_params)
415
416 return X_res, y_res, sampler
/usr/local/lib/python3.6/dist-packages/imblearn/base.py in fit_resample(self, X, y)
79 )
80
---> 81 output = self._fit_resample(X, y)
82
83 if self._X_columns is not None or self._y_name is not None:
/usr/local/lib/python3.6/dist-packages/imblearn/over_sampling/_random_over_sampler.py in _fit_resample(self, X, y)
102 def _fit_resample(self, X, y):
103 random_state = check_random_state(self.random_state)
--> 104 target_stats = Counter(y)
105
106 sample_indices = range(X.shape[0])
/usr/lib/python3.6/collections/__init__.py in __init__(*args, **kwds)
533 raise TypeError('expected at most 1 arguments, got %d' % len(args))
534 super(Counter, self).__init__()
--> 535 self.update(*args, **kwds)
536
537 def __missing__(self, key):
/usr/lib/python3.6/collections/__init__.py in update(*args, **kwds)
620 super(Counter, self).update(iterable) # fast path when counter is empty
621 else:
--> 622 _count_elements(self, iterable)
623 if kwds:
624 self.update(kwds)
TypeError: unhashable type: 'numpy.ndarray'
Versions
Linux-4.14.137+-x86_64-with-Ubuntu-18.04-bionic
Python 3.6.9 (default, Nov 7 2019, 10:44:02)
[GCC 8.3.0]
NumPy 1.18.0
SciPy 1.3.3
Scikit-Learn 0.22.1
Imbalanced-Learn 0.6.1