From 6e37a6081a474cb9b0785c75a1badfb68b128816 Mon Sep 17 00:00:00 2001 From: Shihab Shahriar Khan Date: Sat, 7 Sep 2019 19:28:20 +0600 Subject: [PATCH] FIX reproducibility and parallelization of InstanceHardnessThreshold --- .../_instance_hardness_threshold.py | 20 ++++++------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py b/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py index 5991a2785..6795a381d 100644 --- a/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py +++ b/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py @@ -12,7 +12,7 @@ from sklearn.base import ClassifierMixin, clone from sklearn.ensemble import RandomForestClassifier -from sklearn.model_selection import StratifiedKFold +from sklearn.model_selection import StratifiedKFold,cross_val_predict from sklearn.utils import safe_indexing from ..base import BaseUnderSampler @@ -126,6 +126,7 @@ def _validate_estimator(self): isinstance(self.estimator, ClassifierMixin) and hasattr(self.estimator, 'predict_proba')): self.estimator_ = clone(self.estimator) + self.estimator_.set_params(n_jobs=1,random_state=self.random_state) elif self.estimator is None: self.estimator_ = RandomForestClassifier( n_estimators=100, random_state=self.random_state, @@ -143,19 +144,10 @@ def _fit_resample(self, X, y): target_stats = Counter(y) skf = StratifiedKFold( n_splits=self.cv, shuffle=False, - random_state=self.random_state).split(X, y) - probabilities = np.zeros(y.shape[0], dtype=float) - - for train_index, test_index in skf: - X_train = safe_indexing(X, train_index) - X_test = safe_indexing(X, test_index) - y_train = safe_indexing(y, train_index) - y_test = safe_indexing(y, test_index) - - self.estimator_.fit(X_train, y_train) - - probs = self.estimator_.predict_proba(X_test) - probabilities[test_index] = probs[range(len(y_test)), y_test] + random_state=self.random_state) + probabilities = cross_val_predict(self.estimator_, X, y, cv=skf, + n_jobs=self.n_jobs, method='predict_proba') + probabilities = probabilities[range(len(y)), y] idx_under = np.empty((0, ), dtype=int)