Skip to content

Error with Pipeline and imblearn.over_sampling.RandomOverSampler #671

Closed
@skatsaounis

Description

@skatsaounis

Description

Calling fit() on the following Pipeline fails with a TypeError. If I remove the RandomOverSampler from the Pipeline, no error occurs. Is this a bug, or am I initializing something incorrectly?

Steps/Code to Reproduce

from imblearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils import shuffle
import numpy as np
import pandas as pd

# Reproduction script: download the MNIST train/test CSVs, stack them, take a
# shuffled subset, and fit an imblearn Pipeline that contains a
# RandomOverSampler step.
mnist_train = pd.read_csv("https://www.python-course.eu/data/mnist/mnist_train.csv", header=None).values
mnist_test = pd.read_csv("https://www.python-course.eu/data/mnist/mnist_test.csv", header=None).values
mnist = np.concatenate((mnist_train, mnist_test), axis=0)

# Column 0 is used as the target; the remaining columns are the features.
xfull = mnist[:, 1:]
# `[:, :1]` is a slice, so yfull keeps two dimensions: shape (n_samples, 1),
# not (n_samples,).
# NOTE(review): this 2-D target is the likely cause of the reported TypeError.
# The traceback ends in Counter(y), and iterating a 2-D array yields ndarray
# rows, which are unhashable. Indexing with `mnist[:, 0]` or passing
# `y_train.ravel()` presumably avoids it; confirm.
yfull = mnist[:, :1]
sdata, starget = shuffle(xfull, yfull, random_state=36)
samples = 1000
# The slice end is exclusive, so `0:samples-1` keeps 999 rows, not 1000.
X = sdata[0:samples-1,:]
y = starget[0:samples-1]
# NOTE(review): train_test_split is not among the imports listed with this
# script (it lives in sklearn.model_selection); as posted this line would
# raise NameError. Confirm against the reporter's actual session.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=20176)
selector = VarianceThreshold()
scaler = StandardScaler()
ros = RandomOverSampler()
pca = PCA()
clf = KNeighborsClassifier(n_jobs=-1)
# Step order: feature selection -> scaling -> over-sampling -> PCA -> kNN.
pipe = Pipeline(
    steps=[
           ('selector', selector), ('scaler', scaler), ('sampler', ros),
           ('pca', pca), ('kNN', clf)])
# X_train.shape is (669, 784)
# y_train.shape is (669, 1), i.e. a 2-D column rather than a 1-D label vector
pipe.fit(X_train, y_train)

Expected Results

No error is thrown.

Actual Results

TypeError                                 Traceback (most recent call last)
<ipython-input-51-da2a32d7cb52> in <module>()
----> pipe.fit(X_train, y_train)

10 frames
/usr/local/lib/python3.6/dist-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
    737             refit_start_time = time.time()
    738             if y is not None:
--> 739                 self.best_estimator_.fit(X, y, **fit_params)
    740             else:
    741                 self.best_estimator_.fit(X, **fit_params)

/usr/local/lib/python3.6/dist-packages/imblearn/pipeline.py in fit(self, X, y, **fit_params)
    285 
    286         """
--> 287         Xt, yt, fit_params = self._fit(X, y, **fit_params)
    288         with _print_elapsed_time('Pipeline',
    289                                  self._log_message(len(self.steps) - 1)):

/usr/local/lib/python3.6/dist-packages/imblearn/pipeline.py in _fit(self, X, y, **fit_params)
    247                     message_clsname='Pipeline',
    248                     message=self._log_message(step_idx),
--> 249                     **fit_params_steps[name]
    250                 )
    251             # Replace the transformer of the step with the fitted

/usr/local/lib/python3.6/dist-packages/joblib/memory.py in __call__(self, *args, **kwargs)
    566 
    567     def __call__(self, *args, **kwargs):
--> 568         return self._cached_call(args, kwargs)[0]
    569 
    570     def __getstate__(self):

/usr/local/lib/python3.6/dist-packages/joblib/memory.py in _cached_call(self, args, kwargs, shelving)
    532 
    533         if must_call:
--> 534             out, metadata = self.call(*args, **kwargs)
    535             if self.mmap_mode is not None:
    536                 # Memmap the output at the first call to be consistent with

/usr/local/lib/python3.6/dist-packages/joblib/memory.py in call(self, *args, **kwargs)
    732         if self._verbose > 0:
    733             print(format_call(self.func, args, kwargs))
--> 734         output = self.func(*args, **kwargs)
    735         self.store_backend.dump_item(
    736             [func_id, args_id], output, verbose=self._verbose)

/usr/local/lib/python3.6/dist-packages/imblearn/pipeline.py in _fit_resample_one(sampler, X, y, message_clsname, message, **fit_params)
    412                       **fit_params):
    413     with _print_elapsed_time(message_clsname, message):
--> 414         X_res, y_res = sampler.fit_resample(X, y, **fit_params)
    415 
    416         return X_res, y_res, sampler

/usr/local/lib/python3.6/dist-packages/imblearn/base.py in fit_resample(self, X, y)
     79         )
     80 
---> 81         output = self._fit_resample(X, y)
     82 
     83         if self._X_columns is not None or self._y_name is not None:

/usr/local/lib/python3.6/dist-packages/imblearn/over_sampling/_random_over_sampler.py in _fit_resample(self, X, y)
    102     def _fit_resample(self, X, y):
    103         random_state = check_random_state(self.random_state)
--> 104         target_stats = Counter(y)
    105 
    106         sample_indices = range(X.shape[0])

/usr/lib/python3.6/collections/__init__.py in __init__(*args, **kwds)
    533             raise TypeError('expected at most 1 arguments, got %d' % len(args))
    534         super(Counter, self).__init__()
--> 535         self.update(*args, **kwds)
    536 
    537     def __missing__(self, key):

/usr/lib/python3.6/collections/__init__.py in update(*args, **kwds)
    620                     super(Counter, self).update(iterable) # fast path when counter is empty
    621             else:
--> 622                 _count_elements(self, iterable)
    623         if kwds:
    624             self.update(kwds)

TypeError: unhashable type: 'numpy.ndarray'

Versions

Linux-4.14.137+-x86_64-with-Ubuntu-18.04-bionic
Python 3.6.9 (default, Nov 7 2019, 10:44:02)
[GCC 8.3.0]
NumPy 1.18.0
SciPy 1.3.3
Scikit-Learn 0.22.1
Imbalanced-Learn 0.6.1

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type: Bug — Indicates an unexpected problem or unintended behavior

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions