diff --git a/.travis.yml b/.travis.yml index 92882337b..2d06929da 100644 --- a/.travis.yml +++ b/.travis.yml @@ -35,7 +35,7 @@ matrix: - env: DISTRIB="conda" PYTHON_VERSION="2.7" NUMPY_VERSION="1.13.1" SCIPY_VERSION="0.19.1" SKLEARN_VERSION="0.20rc" - env: DISTRIB="conda" PYTHON_VERSION="3.6" - NUMPY_VERSION="1.13.1" SCIPY_VERSION="0.19.1" SKLEARN_VERSION="0.20rc" + NUMPY_VERSION="*" SCIPY_VERSION="*" SKLEARN_VERSION="0.20rc" - env: DISTRIB="conda" PYTHON_VERSION="3.7" NUMPY_VERSION="1.13.1" SCIPY_VERSION="0.19.1" SKLEARN_VERSION="0.20rc" - env: DISTRIB="conda" PYTHON_VERSION="3.7" diff --git a/appveyor.yml b/appveyor.yml index 06a12a322..b7e6298b5 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -42,7 +42,7 @@ install: - activate testenv - conda install scipy numpy -y -q - pip install --pre scikit-learn - - "conda install %OPTIONAL_DEP% -y -q" + - conda install %OPTIONAL_DEP% -y -q - conda install pytest pytest-cov -y -q - pip install codecov - pip install . diff --git a/build_tools/circle/build_doc.sh b/build_tools/circle/build_doc.sh index 4b4ff915e..3f30c27d4 100755 --- a/build_tools/circle/build_doc.sh +++ b/build_tools/circle/build_doc.sh @@ -88,11 +88,12 @@ conda update --yes --quiet conda # Configure the conda environment and put it in the path using the # provided versions -conda create -n $CONDA_ENV_NAME --yes --quiet python=3 +conda create -n $CONDA_ENV_NAME --yes --quiet python=3.6 source activate $CONDA_ENV_NAME -conda install --yes pip numpy scipy scikit-learn pillow matplotlib sphinx \ +conda install --yes pip numpy scipy pillow matplotlib sphinx \ sphinx_rtd_theme numpydoc pandas keras +pip install --pre scikit-learn pip install -U git+https://github.com/sphinx-gallery/sphinx-gallery.git # Build and install imbalanced-learn in dev mode diff --git a/build_tools/travis/install.sh b/build_tools/travis/install.sh index 6f89bc10f..3d931455e 100755 --- a/build_tools/travis/install.sh +++ b/build_tools/travis/install.sh @@ -40,9 +40,9 @@ if [[ "$DISTRIB" == "conda" ]]; then source activate testenv conda install --yes numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION - if [[ $PYTHON_VERSION == "3.7" ]]; then - conda install --yes pandas - conda install --yes -c conda-forge keras + if [[ $PYTHON_VERSION == "3.6" ]]; then + # Tensorflow is not available in Python 3.7 yet. + conda install --yes pandas keras tensorflow KERAS_BACKEND=tensorflow python -c "import keras.backend" sed -i -e 's/"backend":[[:space:]]*"[^"]*/"backend":\ "'$KERAS_BACKEND'/g' ~/.keras/keras.json; diff --git a/doc/ensemble.rst b/doc/ensemble.rst index 814afb2c1..b714ad007 100644 --- a/doc/ensemble.rst +++ b/doc/ensemble.rst @@ -32,11 +32,11 @@ under-sampling the original set:: >>> print(sorted(Counter(y).items())) [(0, 64), (1, 262), (2, 4674)] >>> from imblearn.ensemble import EasyEnsemble - >>> ee = EasyEnsemble(random_state=0, n_subsets=10) - >>> X_resampled, y_resampled = ee.fit_resample(X, y) - >>> print(X_resampled.shape) + >>> ee = EasyEnsemble(random_state=0, n_subsets=10) # doctest: +SKIP + >>> X_resampled, y_resampled = ee.fit_resample(X, y) # doctest: +SKIP + >>> print(X_resampled.shape) # doctest: +SKIP (10, 192, 2) - >>> print(sorted(Counter(y_resampled[0]).items())) + >>> print(sorted(Counter(y_resampled[0]).items())) # doctest: +SKIP [(0, 64), (1, 64), (2, 64)] :class:`EasyEnsemble` has two important parameters: (i) ``n_subsets`` will be @@ -53,7 +53,9 @@ parameter ``n_max_subset`` and an additional bootstraping can be activated with >>> from imblearn.ensemble import BalanceCascade >>> from sklearn.linear_model import LogisticRegression >>> bc = BalanceCascade(random_state=0, - ... estimator=LogisticRegression(random_state=0), + ... estimator=LogisticRegression(solver='lbfgs', + ... multi_class='auto', + ... random_state=0), ... n_max_subset=4) >>> X_resampled, y_resampled = bc.fit_resample(X, y) >>> print(X_resampled.shape) diff --git a/doc/under_sampling.rst b/doc/under_sampling.rst index c621d40cb..50f7a474e 100644 --- a/doc/under_sampling.rst +++ b/doc/under_sampling.rst @@ -340,7 +340,7 @@ used as:: >>> oss = OneSidedSelection(random_state=0) >>> X_resampled, y_resampled = oss.fit_resample(X, y) >>> print(sorted(Counter(y_resampled).items())) - [(0, 64), (1, 174), (2, 4403)] + [(0, 64), (1, 174), (2, 4404)] Our implementation offer to set the number of seeds to put in the set :math:`C` originally by setting the parameter ``n_seeds_S``. @@ -379,7 +379,8 @@ removed. The class can be used as:: >>> from sklearn.linear_model import LogisticRegression >>> from imblearn.under_sampling import InstanceHardnessThreshold >>> iht = InstanceHardnessThreshold(random_state=0, - ... estimator=LogisticRegression()) + ... estimator=LogisticRegression( + ... solver='lbfgs', multi_class='auto')) >>> X_resampled, y_resampled = iht.fit_resample(X, y) >>> print(sorted(Counter(y_resampled).items())) [(0, 64), (1, 64), (2, 64)] diff --git a/doc/whats_new/v0.0.4.rst b/doc/whats_new/v0.0.4.rst index c4e2901b0..db7ad156c 100644 --- a/doc/whats_new/v0.0.4.rst +++ b/doc/whats_new/v0.0.4.rst @@ -97,6 +97,9 @@ Maintenance - Upgrade requirements to scikit-learn 0.20. :issue:`379` by :user:`Guillaume Lemaitre `. +- Catch deprecation warning in testing. + :issue:`441` by :user:`Guillaume Lemaitre `. + Documentation ............. diff --git a/examples/plot_outlier_rejections.py b/examples/plot_outlier_rejections.py index 686cf449d..6bdd749a4 100644 --- a/examples/plot_outlier_rejections.py +++ b/examples/plot_outlier_rejections.py @@ -37,6 +37,7 @@ def plot_scatter(X, y, title): plt.legend() plt.title(title) + ############################################################################## # Toy data generation ############################################################################## @@ -82,11 +83,13 @@ def plot_scatter(X, y, title): # :class:`imblearn.FunctionSampler` will be called when using the method # ``fit_resample``. + def outlier_rejection(X, y): """This will be our function used to resample our dataset.""" model = IsolationForest(max_samples=100, contamination=0.4, - random_state=rng) + random_state=rng, + behaviour='new') model.fit(X) y_pred = model.predict(X) return X[y_pred == 1], y[y_pred == 1] @@ -105,11 +108,12 @@ def outlier_rejection(X, y): # affected during the prediction. pipe = make_pipeline(FunctionSampler(func=outlier_rejection), - LogisticRegression(random_state=rng)) + LogisticRegression(solver='lbfgs', multi_class='auto', + random_state=rng)) y_pred = pipe.fit(X_train, y_train).predict(X_test) print(classification_report(y_test, y_pred)) -clf = LogisticRegression(random_state=rng) +clf = LogisticRegression(solver='lbfgs', multi_class='auto', random_state=rng) y_pred = clf.fit(X_train, y_train).predict(X_test) print(classification_report(y_test, y_pred)) diff --git a/examples/under-sampling/plot_comparison_under_sampling.py b/examples/under-sampling/plot_comparison_under_sampling.py index a175d6193..c9977c92a 100644 --- a/examples/under-sampling/plot_comparison_under_sampling.py +++ b/examples/under-sampling/plot_comparison_under_sampling.py @@ -235,8 +235,9 @@ def plot_decision_function(X, y, clf, ax): clf = LinearSVC().fit(X, y) plot_decision_function(X, y, clf, ax1) ax1.set_title('Linear SVC with y={}'.format(Counter(y))) -sampler = InstanceHardnessThreshold(random_state=0, - estimator=LogisticRegression()) +sampler = InstanceHardnessThreshold( + random_state=0, estimator=LogisticRegression(solver='lbfgs', + multi_class='auto')) clf = make_pipeline(sampler, LinearSVC()) clf.fit(X, y) plot_decision_function(X, y, clf, ax2) diff --git a/examples/under-sampling/plot_instance_hardness_threshold.py b/examples/under-sampling/plot_instance_hardness_threshold.py index 9d2456b99..1f821b58f 100644 --- a/examples/under-sampling/plot_instance_hardness_threshold.py +++ b/examples/under-sampling/plot_instance_hardness_threshold.py @@ -60,7 +60,9 @@ def plot_resampling(ax, X, y, title): c0, c1 = plot_resampling(ax, X_vis, y, 'Original set') else: iht = InstanceHardnessThreshold(sampling_strategy=sampling_strategy, - estimator=LogisticRegression(), + estimator=LogisticRegression( + solver='lbfgs', + multi_class='auto'), return_indices=True) X_res, y_res, idx_res = iht.fit_resample(X, y) X_res_vis = pca.transform(X_res) diff --git a/imblearn/combine/tests/test_smote_enn.py b/imblearn/combine/tests/test_smote_enn.py index 4cd921868..39a16f10e 100644 --- a/imblearn/combine/tests/test_smote_enn.py +++ b/imblearn/combine/tests/test_smote_enn.py @@ -48,8 +48,7 @@ def test_sample_regular(): def test_sample_regular_pass_smote_enn(): smote = SMOTEENN( smote=SMOTE(sampling_strategy='auto', random_state=RND_SEED), - enn=EditedNearestNeighbours( - sampling_strategy='all', random_state=RND_SEED), + enn=EditedNearestNeighbours(sampling_strategy='all'), random_state=RND_SEED) X_resampled, y_resampled = smote.fit_resample(X, Y) @@ -77,8 +76,7 @@ def test_sample_regular_half(): def test_validate_estimator_init(): smote = SMOTE(random_state=RND_SEED) - enn = EditedNearestNeighbours( - random_state=RND_SEED, sampling_strategy='all') + enn = EditedNearestNeighbours(sampling_strategy='all') smt = SMOTEENN(smote=smote, enn=enn, random_state=RND_SEED) X_resampled, y_resampled = smt.fit_resample(X, Y) X_gt = np.array([[1.52091956, -0.49283504], [0.84976473, -0.15570176], [ diff --git a/imblearn/combine/tests/test_smote_tomek.py b/imblearn/combine/tests/test_smote_tomek.py index 06a519c51..2221bc463 100644 --- a/imblearn/combine/tests/test_smote_tomek.py +++ b/imblearn/combine/tests/test_smote_tomek.py @@ -70,7 +70,7 @@ def test_sample_regular_half(): def test_validate_estimator_init(): smote = SMOTE(random_state=RND_SEED) - tomek = TomekLinks(random_state=RND_SEED, sampling_strategy='all') + tomek = TomekLinks(sampling_strategy='all') smt = SMOTETomek(smote=smote, tomek=tomek, random_state=RND_SEED) X_resampled, y_resampled = smt.fit_resample(X, Y) X_gt = np.array([[0.68481731, 0.51935141], [1.34192108, -0.13367336], [ diff --git a/imblearn/datasets/tests/test_imbalance.py b/imblearn/datasets/tests/test_imbalance.py index 3ba99cd19..6776a6daf 100644 --- a/imblearn/datasets/tests/test_imbalance.py +++ b/imblearn/datasets/tests/test_imbalance.py @@ -7,6 +7,7 @@ from collections import Counter +import pytest import numpy as np from pytest import raises @@ -53,6 +54,7 @@ def test_make_imbalance_dict(): assert Counter(y_) == {0: 10, 1: 20, 2: 50} +@pytest.mark.filterwarnings("ignore:'ratio' has been deprecated in 0.4") def test_make_imbalance_ratio(): # check that using 'ratio' is working sampling_strategy = {0: 10, 1: 20, 2: 30} diff --git a/imblearn/ensemble/_balance_cascade.py b/imblearn/ensemble/_balance_cascade.py index 81b3a7f80..3539e972f 100644 --- a/imblearn/ensemble/_balance_cascade.py +++ b/imblearn/ensemble/_balance_cascade.py @@ -179,7 +179,7 @@ def _fit_resample(self, X, y): # fit and predict using cross validation X_subset = safe_indexing(X, subset_indices) y_subset = safe_indexing(y, subset_indices) - pred = cross_val_predict(self.estimator_, X_subset, y_subset) + pred = cross_val_predict(self.estimator_, X_subset, y_subset, cv=3) # extract the prediction about the targeted classes only pred_target = pred[:index_under_sample.size] index_classified = index_under_sample[pred_target == safe_indexing( diff --git a/imblearn/ensemble/_easy_ensemble.py b/imblearn/ensemble/_easy_ensemble.py index 1a343d05c..959193484 100644 --- a/imblearn/ensemble/_easy_ensemble.py +++ b/imblearn/ensemble/_easy_ensemble.py @@ -93,9 +93,9 @@ class EasyEnsemble(BaseEnsembleSampler): ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) >>> print('Original dataset shape %s' % Counter(y)) Original dataset shape Counter({{1: 900, 0: 100}}) - >>> ee = EasyEnsemble(random_state=42) - >>> X_res, y_res = ee.fit_resample(X, y) - >>> print('Resampled dataset shape %s' % Counter(y_res[0])) + >>> ee = EasyEnsemble(random_state=42) # doctest: +SKIP + >>> X_res, y_res = ee.fit_resample(X, y) # doctest: +SKIP + >>> print('Resampled dataset shape %s' % Counter(y_res[0])) # doctest: +SKIP Resampled dataset shape Counter({{0: 100, 1: 100}}) """ diff --git a/imblearn/ensemble/tests/test_bagging.py b/imblearn/ensemble/tests/test_bagging.py index 2a77cd276..da7d9b793 100644 --- a/imblearn/ensemble/tests/test_bagging.py +++ b/imblearn/ensemble/tests/test_bagging.py @@ -47,10 +47,10 @@ def test_balanced_bagging_classifier(): for base_estimator in [ None, DummyClassifier(), - Perceptron(), + Perceptron(max_iter=1000, tol=1e-3), DecisionTreeClassifier(), KNeighborsClassifier(), - SVC() + SVC(gamma='scale') ]: for params in grid: BalancedBaggingClassifier( @@ -155,8 +155,10 @@ def test_probability(): # Degenerate case, where some classes are missing ensemble = BalancedBaggingClassifier( - base_estimator=LogisticRegression(), random_state=0, - max_samples=5).fit(X_train, y_train) + base_estimator=LogisticRegression(solver='lbfgs', + multi_class='auto'), + random_state=0, max_samples=5) + ensemble.fit(X_train, y_train) assert_array_almost_equal( np.sum(ensemble.predict_proba(X_test), axis=1), @@ -179,7 +181,7 @@ def test_oob_score_classification(): random_state=0) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) - for base_estimator in [DecisionTreeClassifier(), SVC()]: + for base_estimator in [DecisionTreeClassifier(), SVC(gamma='scale')]: clf = BalancedBaggingClassifier( base_estimator=base_estimator, n_estimators=100, @@ -282,8 +284,8 @@ def test_gridsearch(): parameters = {'n_estimators': (1, 2), 'base_estimator__C': (1, 2)} GridSearchCV( - BalancedBaggingClassifier(SVC()), parameters, scoring="roc_auc").fit( - X, y) + BalancedBaggingClassifier(SVC(gamma='scale')), parameters, cv=3, + scoring="roc_auc").fit(X, y) def test_base_estimator(): @@ -311,7 +313,8 @@ def test_base_estimator(): DecisionTreeClassifier) ensemble = BalancedBaggingClassifier( - Perceptron(), n_jobs=3, random_state=0).fit(X_train, y_train) + Perceptron(max_iter=1000, tol=1e-3), n_jobs=3, random_state=0).fit( + X_train, y_train) assert isinstance(ensemble.base_estimator_.steps[-1][1], Perceptron) @@ -445,7 +448,8 @@ def test_estimators_samples(): # remap the y outside of the BalancedBaggingclassifier # _, y = np.unique(y, return_inverse=True) - bagging = BalancedBaggingClassifier(LogisticRegression(), + bagging = BalancedBaggingClassifier(LogisticRegression(solver='lbfgs', + multi_class='auto'), max_samples=0.5, max_features=0.5, random_state=1, bootstrap=False) diff --git a/imblearn/ensemble/tests/test_balance_cascade.py b/imblearn/ensemble/tests/test_balance_cascade.py index 3ffc71672..334c6cd20 100644 --- a/imblearn/ensemble/tests/test_balance_cascade.py +++ b/imblearn/ensemble/tests/test_balance_cascade.py @@ -118,7 +118,7 @@ def test_fit_resample_auto_early_stop(): def test_give_classifier_obj(): sampling_strategy = 'auto' - estimator = RandomForestClassifier(random_state=RND_SEED) + estimator = RandomForestClassifier(n_estimators=10, random_state=RND_SEED) bc = BalanceCascade( sampling_strategy=sampling_strategy, random_state=RND_SEED, diff --git a/imblearn/ensemble/tests/test_easy_ensemble.py b/imblearn/ensemble/tests/test_easy_ensemble.py index 06599e473..158ac174e 100644 --- a/imblearn/ensemble/tests/test_easy_ensemble.py +++ b/imblearn/ensemble/tests/test_easy_ensemble.py @@ -292,5 +292,5 @@ def test_easy_ensemble_classifier_grid_search(): 'base_estimator__n_estimators': [3, 4]} grid_search = GridSearchCV( EasyEnsembleClassifier(base_estimator=AdaBoostClassifier()), - parameters) + parameters, cv=5, iid=False) grid_search.fit(X, y) diff --git a/imblearn/keras/_generator.py b/imblearn/keras/_generator.py index a92e8ea9d..35e44eb61 100644 --- a/imblearn/keras/_generator.py +++ b/imblearn/keras/_generator.py @@ -24,6 +24,10 @@ from ..utils._docstring import _random_state_docstring from ..tensorflow import balanced_batch_generator as tf_bbg +DONT_HAVE_RANDOM_STATE = ('NearMiss', 'EditedNearestNeighbours', + 'RepeatedEditedNearestNeighbours', 'AllKNN', + 'NeighbourhoodCleaningRule', 'TomekLinks') + class BalancedBatchGenerator(ParentClass): """Create balanced batches when training a keras model. @@ -122,7 +126,9 @@ def _sample(self): "which has an attribute 'return_indices'.") self.sampler_ = clone(self.sampler) self.sampler_.set_params(return_indices=True) - set_random_state(self.sampler_, random_state) + # FIXME: Remove in 0.6 + if self.sampler_.__class__.__name__ not in DONT_HAVE_RANDOM_STATE: + set_random_state(self.sampler_, random_state) _, _, self.indices_ = self.sampler_.fit_resample(self.X, self.y) # shuffle the indices since the sampler are packing them by class diff --git a/imblearn/keras/tests/test_generator.py b/imblearn/keras/tests/test_generator.py index cbab74864..4565652f9 100644 --- a/imblearn/keras/tests/test_generator.py +++ b/imblearn/keras/tests/test_generator.py @@ -13,6 +13,7 @@ from imblearn.datasets import make_imbalance from imblearn.under_sampling import ClusterCentroids from imblearn.under_sampling import NearMiss +from imblearn.over_sampling import RandomOverSampler from imblearn.keras import BalancedBatchGenerator from imblearn.keras import balanced_batch_generator @@ -38,6 +39,7 @@ def test_balanced_batch_generator_class_no_return_indices(): @pytest.mark.parametrize( "sampler, sample_weight", [(None, None), + (RandomOverSampler(), None), (NearMiss(), None), (None, np.random.uniform(size=(y.shape[0])))] ) @@ -75,6 +77,7 @@ def test_balanced_batch_generator_function_no_return_indices(): @pytest.mark.parametrize( "sampler, sample_weight", [(None, None), + (RandomOverSampler(), None), (NearMiss(), None), (None, np.random.uniform(size=(y.shape[0])))] ) diff --git a/imblearn/metrics/_classification.py b/imblearn/metrics/_classification.py index 3ab977c29..f55b4d2a9 100644 --- a/imblearn/metrics/_classification.py +++ b/imblearn/metrics/_classification.py @@ -632,11 +632,13 @@ class is unrecognized by the classifier, G-mean resolves to zero. To tp_sum = tp_sum[indices] true_sum = true_sum[indices] - recall = _prf_divide(tp_sum, true_sum, "recall", "true", None, - "recall") + with np.errstate(divide='ignore', invalid='ignore'): + recall = _prf_divide(tp_sum, true_sum, "recall", "true", None, + "recall") recall[recall == 0] = correction - gmean = sp.stats.gmean(recall) + with np.errstate(divide='ignore', invalid='ignore'): + gmean = sp.stats.gmean(recall) # old version of scipy return MaskedConstant instead of 0.0 if isinstance(gmean, np.ma.core.MaskedConstant): return 0.0 diff --git a/imblearn/metrics/tests/test_classification.py b/imblearn/metrics/tests/test_classification.py index 4dc583f8a..ae2585e13 100644 --- a/imblearn/metrics/tests/test_classification.py +++ b/imblearn/metrics/tests/test_classification.py @@ -10,6 +10,7 @@ import numpy as np +import pytest from pytest import approx, raises from sklearn import datasets @@ -20,7 +21,6 @@ from sklearn.utils.validation import check_random_state from sklearn.utils.testing import assert_allclose, assert_array_equal from sklearn.utils.testing import assert_no_warnings -from sklearn.utils.testing import ignore_warnings from sklearn.metrics import accuracy_score, average_precision_score from sklearn.metrics import brier_score_loss, cohen_kappa_score from sklearn.metrics import jaccard_similarity_score, precision_score @@ -113,6 +113,7 @@ def test_sensitivity_specificity_score_binary(): assert_allclose(spe, 0.88, rtol=R_TOL) +@pytest.mark.filterwarnings("ignore:Specificity is ill-defined") def test_sensitivity_specificity_f_binary_single_class(): # Such a case may occur with non-stratified cross-validation assert sensitivity_score([1, 1], [1, 1]) == 1. @@ -122,7 +123,6 @@ def test_sensitivity_specificity_f_binary_single_class(): assert specificity_score([-1, -1], [-1, -1]) == 0. -@ignore_warnings def test_sensitivity_specificity_extra_labels(): y_true = [1, 3, 3, 2] y_pred = [1, 1, 3, 2] @@ -148,7 +148,6 @@ def test_sensitivity_specificity_extra_labels(): assert_allclose(np.mean([1., 0.67, 1., 1., 1.]), actual, rtol=R_TOL) -@ignore_warnings def test_sensitivity_specificity_ignored_labels(): y_true = [1, 1, 2, 3] y_pred = [1, 3, 3, 3] @@ -181,7 +180,6 @@ def test_sensitivity_specificity_error_multilabels(): sensitivity_score(y_true_bin, y_pred_bin) -@ignore_warnings def test_sensitivity_specificity_support_errors(): y_true, y_pred, _ = make_prediction(binary=True) @@ -211,6 +209,7 @@ def test_geometric_mean_support_binary(): assert_allclose(geo_mean, 0.77, rtol=R_TOL) +@pytest.mark.filterwarnings("ignore:Recall is ill-defined") def test_geometric_mean_multiclass(): y_true = [0, 0, 1, 1] y_pred = [0, 0, 1, 1] diff --git a/imblearn/metrics/tests/test_score_objects.py b/imblearn/metrics/tests/test_score_objects.py index d244bf6e0..3e9dd3e20 100644 --- a/imblearn/metrics/tests/test_score_objects.py +++ b/imblearn/metrics/tests/test_score_objects.py @@ -3,27 +3,23 @@ # Christos Aridas # License: MIT -import sklearn +import pytest from sklearn.datasets import make_blobs from sklearn.metrics import make_scorer from sklearn.svm import LinearSVC from sklearn.utils.testing import assert_allclose +from sklearn.model_selection import train_test_split +from sklearn.model_selection import GridSearchCV from imblearn.metrics import (sensitivity_score, specificity_score, geometric_mean_score, make_index_balanced_accuracy) -# Get the version -sk_version = sklearn.__version__ -if sk_version < '0.18': - from sklearn.cross_validation import train_test_split - from sklearn.grid_search import GridSearchCV -else: - from sklearn.model_selection import train_test_split, GridSearchCV R_TOL = 1e-2 +@pytest.mark.filterwarnings("ignore:Liblinear failed to converge") def test_imblearn_classification_scorers(): X, y = make_blobs(random_state=0, centers=2) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) @@ -33,76 +29,88 @@ def test_imblearn_classification_scorers(): # sensitivity scorer scorer = make_scorer(sensitivity_score, pos_label=None, average='macro') grid = GridSearchCV( - LinearSVC(random_state=0), param_grid={'C': [1, 10]}, scoring=scorer) + LinearSVC(random_state=0), param_grid={'C': [1, 10]}, scoring=scorer, + cv=3, iid=False) grid.fit(X_train, y_train).predict(X_test) assert_allclose(grid.best_score_, 0.92, rtol=R_TOL) scorer = make_scorer(sensitivity_score, pos_label=None, average='weighted') grid = GridSearchCV( - LinearSVC(random_state=0), param_grid={'C': [1, 10]}, scoring=scorer) + LinearSVC(random_state=0), param_grid={'C': [1, 10]}, scoring=scorer, + cv=3, iid=False) grid.fit(X_train, y_train).predict(X_test) assert_allclose(grid.best_score_, 0.92, rtol=R_TOL) scorer = make_scorer(sensitivity_score, pos_label=None, average='micro') grid = GridSearchCV( - LinearSVC(random_state=0), param_grid={'C': [1, 10]}, scoring=scorer) + LinearSVC(random_state=0), param_grid={'C': [1, 10]}, scoring=scorer, + cv=3, iid=False) grid.fit(X_train, y_train).predict(X_test) assert_allclose(grid.best_score_, 0.92, rtol=R_TOL) scorer = make_scorer(sensitivity_score, pos_label=1) grid = GridSearchCV( - LinearSVC(random_state=0), param_grid={'C': [1, 10]}, scoring=scorer) + LinearSVC(random_state=0), param_grid={'C': [1, 10]}, scoring=scorer, + cv=3, iid=False) grid.fit(X_train, y_train).predict(X_test) assert_allclose(grid.best_score_, 0.92, rtol=R_TOL) # specificity scorer scorer = make_scorer(specificity_score, pos_label=None, average='macro') grid = GridSearchCV( - LinearSVC(random_state=0), param_grid={'C': [1, 10]}, scoring=scorer) + LinearSVC(random_state=0), param_grid={'C': [1, 10]}, scoring=scorer, + cv=3, iid=False) grid.fit(X_train, y_train).predict(X_test) assert_allclose(grid.best_score_, 0.92, rtol=R_TOL) scorer = make_scorer(specificity_score, pos_label=None, average='weighted') grid = GridSearchCV( - LinearSVC(random_state=0), param_grid={'C': [1, 10]}, scoring=scorer) + LinearSVC(random_state=0), param_grid={'C': [1, 10]}, scoring=scorer, + cv=3, iid=False) grid.fit(X_train, y_train).predict(X_test) assert_allclose(grid.best_score_, 0.92, rtol=R_TOL) scorer = make_scorer(specificity_score, pos_label=None, average='micro') grid = GridSearchCV( - LinearSVC(random_state=0), param_grid={'C': [1, 10]}, scoring=scorer) + LinearSVC(random_state=0), param_grid={'C': [1, 10]}, scoring=scorer, + cv=3, iid=False) grid.fit(X_train, y_train).predict(X_test) assert_allclose(grid.best_score_, 0.92, rtol=R_TOL) scorer = make_scorer(specificity_score, pos_label=1) grid = GridSearchCV( - LinearSVC(random_state=0), param_grid={'C': [1, 10]}, scoring=scorer) + LinearSVC(random_state=0), param_grid={'C': [1, 10]}, scoring=scorer, + cv=3, iid=False) grid.fit(X_train, y_train).predict(X_test) assert_allclose(grid.best_score_, 0.95, rtol=R_TOL) # geometric_mean scorer scorer = make_scorer(geometric_mean_score, pos_label=None, average='macro') grid = GridSearchCV( - LinearSVC(random_state=0), param_grid={'C': [1, 10]}, scoring=scorer) + LinearSVC(random_state=0), param_grid={'C': [1, 10]}, scoring=scorer, + cv=3, iid=False) grid.fit(X_train, y_train).predict(X_test) assert_allclose(grid.best_score_, 0.92, rtol=R_TOL) scorer = make_scorer( geometric_mean_score, pos_label=None, average='weighted') grid = GridSearchCV( - LinearSVC(random_state=0), param_grid={'C': [1, 10]}, scoring=scorer) + LinearSVC(random_state=0), param_grid={'C': [1, 10]}, scoring=scorer, + cv=3, iid=False) grid.fit(X_train, y_train).predict(X_test) assert_allclose(grid.best_score_, 0.92, rtol=R_TOL) scorer = make_scorer(geometric_mean_score, pos_label=None, average='micro') grid = GridSearchCV( - LinearSVC(random_state=0), param_grid={'C': [1, 10]}, scoring=scorer) + LinearSVC(random_state=0), param_grid={'C': [1, 10]}, scoring=scorer, + cv=3, iid=False) grid.fit(X_train, y_train).predict(X_test) assert_allclose(grid.best_score_, 0.92, rtol=R_TOL) scorer = make_scorer(geometric_mean_score, pos_label=1) grid = GridSearchCV( - LinearSVC(random_state=0), param_grid={'C': [1, 10]}, scoring=scorer) + LinearSVC(random_state=0), param_grid={'C': [1, 10]}, scoring=scorer, + cv=3, iid=False) grid.fit(X_train, y_train).predict(X_test) assert_allclose(grid.best_score_, 0.92, rtol=R_TOL) @@ -110,24 +118,28 @@ def test_imblearn_classification_scorers(): geo_mean_iba = make_index_balanced_accuracy()(geometric_mean_score) scorer = make_scorer(geo_mean_iba, pos_label=None, average='macro') grid = GridSearchCV( - LinearSVC(random_state=0), param_grid={'C': [1, 10]}, scoring=scorer) + LinearSVC(random_state=0), param_grid={'C': [1, 10]}, scoring=scorer, + cv=3, iid=False) grid.fit(X_train, y_train).predict(X_test) assert_allclose(grid.best_score_, 0.85, rtol=R_TOL) scorer = make_scorer(geo_mean_iba, pos_label=None, average='weighted') grid = GridSearchCV( - LinearSVC(random_state=0), param_grid={'C': [1, 10]}, scoring=scorer) + LinearSVC(random_state=0), param_grid={'C': [1, 10]}, scoring=scorer, + cv=3, iid=False) grid.fit(X_train, y_train).predict(X_test) assert_allclose(grid.best_score_, 0.85, rtol=R_TOL) scorer = make_scorer(geo_mean_iba, pos_label=None, average='micro') grid = GridSearchCV( - LinearSVC(random_state=0), param_grid={'C': [1, 10]}, scoring=scorer) + LinearSVC(random_state=0), param_grid={'C': [1, 10]}, scoring=scorer, + cv=3, iid=False) grid.fit(X_train, y_train).predict(X_test) assert_allclose(grid.best_score_, 0.85, rtol=R_TOL) scorer = make_scorer(geo_mean_iba, pos_label=1) grid = GridSearchCV( - LinearSVC(random_state=0), param_grid={'C': [1, 10]}, scoring=scorer) + LinearSVC(random_state=0), param_grid={'C': [1, 10]}, scoring=scorer, + cv=3, iid=False) grid.fit(X_train, y_train).predict(X_test) assert_allclose(grid.best_score_, 0.84, rtol=R_TOL) diff --git a/imblearn/over_sampling/_smote.py b/imblearn/over_sampling/_smote.py index 34c56a560..95db0b44f 100644 --- a/imblearn/over_sampling/_smote.py +++ b/imblearn/over_sampling/_smote.py @@ -458,7 +458,8 @@ def _validate_estimator(self): self.nn_m_.set_params(**{'n_jobs': self.n_jobs}) if self.svm_estimator is None: - self.svm_estimator_ = SVC(random_state=self.random_state) + self.svm_estimator_ = SVC(gamma='scale', + random_state=self.random_state) elif isinstance(self.svm_estimator, SVC): self.svm_estimator_ = clone(self.svm_estimator) else: @@ -714,7 +715,8 @@ def _validate_estimator(self): 'instead.', DeprecationWarning) if (self.svm_estimator is None or self.svm_estimator == 'deprecated'): - self.svm_estimator_ = SVC(random_state=self.random_state) + self.svm_estimator_ = SVC(gamma='scale', + random_state=self.random_state) elif isinstance(self.svm_estimator, SVC): self.svm_estimator_ = clone(self.svm_estimator) else: diff --git a/imblearn/over_sampling/tests/test_smote.py b/imblearn/over_sampling/tests/test_smote.py index 4f42795df..674eb7021 100644 --- a/imblearn/over_sampling/tests/test_smote.py +++ b/imblearn/over_sampling/tests/test_smote.py @@ -264,7 +264,7 @@ def test_wrong_nn(): def test_sample_with_nn_svm(): kind = 'svm' nn_k = NearestNeighbors(n_neighbors=6) - svm = SVC(random_state=RND_SEED) + svm = SVC(gamma='scale', random_state=RND_SEED) smote = SMOTE( random_state=RND_SEED, kind=kind, k_neighbors=nn_k, svm_estimator=svm) X_resampled, y_resampled = smote.fit_resample(X, Y) @@ -336,7 +336,7 @@ def test_svm_smote(): svm_smote_nn = SVMSMOTE(random_state=42, k_neighbors=NearestNeighbors(n_neighbors=6), m_neighbors=NearestNeighbors(n_neighbors=11), - svm_estimator=SVC(random_state=42)) + svm_estimator=SVC(gamma='scale', random_state=42)) X_res_1, y_res_1 = svm_smote.fit_resample(X, Y) X_res_2, y_res_2 = svm_smote_nn.fit_resample(X, Y) diff --git a/imblearn/pipeline.py b/imblearn/pipeline.py index dccb8d324..a40499eda 100644 --- a/imblearn/pipeline.py +++ b/imblearn/pipeline.py @@ -18,8 +18,8 @@ from sklearn import pipeline from sklearn.base import clone from sklearn.externals import six -from sklearn.externals.joblib import Memory from sklearn.utils.metaestimators import if_delegate_has_method +from sklearn.utils.validation import check_memory __all__ = ['Pipeline', 'make_pipeline'] @@ -157,15 +157,7 @@ def _validate_steps(self): def _fit(self, X, y=None, **fit_params): self._validate_steps() # Setup the memory - memory = self.memory - if memory is None: - memory = Memory(cachedir=None, verbose=0) - elif isinstance(memory, six.string_types): - memory = Memory(cachedir=memory, verbose=0) - elif not isinstance(memory, Memory): - raise ValueError("'memory' should either be a string or" - " a joblib.Memory instance, got" - " 'memory={!r}' instead.".format(memory)) + memory = check_memory(self.memory) fit_transform_one_cached = memory.cache(_fit_transform_one) fit_resample_one_cached = memory.cache(_fit_resample_one) @@ -181,10 +173,20 @@ def _fit(self, X, y=None, **fit_params): if transformer is None: pass else: - if memory.cachedir is None: - # we do not clone when caching is disabled to preserve - # backward compatibility - cloned_transformer = transformer + if hasattr(memory, 'location'): + # joblib >= 0.12 + if memory.location is None: + # we do not clone when caching is disabled to + # preserve backward compatibility + cloned_transformer = transformer + else: + cloned_transformer = clone(transformer) + elif hasattr(memory, 'cachedir'): + # joblib < 0.11 + if memory.cachedir is None: + # we do not clone when caching is disabled to + # preserve backward compatibility + cloned_transformer = transformer else: cloned_transformer = clone(transformer) # Fit or load from cache the current transfomer diff --git a/imblearn/tensorflow/_generator.py b/imblearn/tensorflow/_generator.py index 0f124e88c..f388e4d3d 100644 --- a/imblearn/tensorflow/_generator.py +++ b/imblearn/tensorflow/_generator.py @@ -13,6 +13,10 @@ from ..utils import Substitution from ..utils._docstring import _random_state_docstring +DONT_HAVE_RANDOM_STATE = ('NearMiss', 'EditedNearestNeighbours', + 'RepeatedEditedNearestNeighbours', 'AllKNN', + 'NeighbourhoodCleaningRule', 'TomekLinks') + @Substitution(random_state=_random_state_docstring) def balanced_batch_generator(X, y, sample_weight=None, sampler=None, @@ -127,7 +131,9 @@ def balanced_batch_generator(X, y, sample_weight=None, sampler=None, "which has an attribute 'return_indices'.") sampler_ = clone(sampler) sampler_.set_params(return_indices=True) - set_random_state(sampler_, random_state) + # FIXME: Remove in 0.6 + if sampler_.__class__.__name__ not in DONT_HAVE_RANDOM_STATE: + set_random_state(sampler_, random_state) _, _, indices = sampler_.fit_resample(X, y) # shuffle the indices since the sampler are packing them by class diff --git a/imblearn/tensorflow/tests/test_generator.py b/imblearn/tensorflow/tests/test_generator.py index 78eda3b1d..b22d17615 100644 --- a/imblearn/tensorflow/tests/test_generator.py +++ b/imblearn/tensorflow/tests/test_generator.py @@ -8,13 +8,14 @@ from imblearn.datasets import make_imbalance from imblearn.under_sampling import NearMiss +from imblearn.over_sampling import RandomOverSampler from imblearn.tensorflow import balanced_batch_generator tf = pytest.importorskip('tensorflow') -@pytest.mark.parametrize("sampler", [None, NearMiss()]) +@pytest.mark.parametrize("sampler", [None, NearMiss(), RandomOverSampler()]) def test_balanced_batch_generator(sampler): X, y = load_iris(return_X_y=True) X, y = make_imbalance(X, y, {0: 30, 1: 50, 2: 40}) diff --git a/imblearn/tests/test_common.py b/imblearn/tests/test_common.py index 5ed953f28..2233c8f34 100644 --- a/imblearn/tests/test_common.py +++ b/imblearn/tests/test_common.py @@ -20,12 +20,22 @@ def test_all_estimator_no_base_class(name, Estimator): assert not name.lower().startswith('base'), msg +@pytest.mark.filterwarnings("ignore:'ratio' is deprecated from 0.4") +@pytest.mark.filterwarnings("ignore:'sampling_strategy' as a dict for") +@pytest.mark.filterwarnings("ignore:Class EasyEnsemble is deprecated") +@pytest.mark.filterwarnings('ignore:"kind" is deprecated in 0.4 and will be') +@pytest.mark.filterwarnings('ignore:"svm_estimator" is deprecated in 0.4 and') +@pytest.mark.filterwarnings('ignore:"out_step" is deprecated in 0.4 and') +@pytest.mark.filterwarnings('ignore:"m_neighbors" is deprecated in 0.4 and') +@pytest.mark.filterwarnings("ignore:'y' should be of types") @pytest.mark.parametrize( 'name, Estimator', all_estimators(include_meta_estimators=True) ) def test_all_estimators(name, Estimator): - check_estimator(Estimator) + # don't run twice the sampler tests. Meta-estimator do not have a + # fit_resample method. + check_estimator(Estimator, run_sampler_tests=False) def _tested_non_meta_estimators(): @@ -42,11 +52,19 @@ def _generate_checks_per_estimator(check_generator, estimators): yield name, Estimator, check +@pytest.mark.filterwarnings("ignore:'ratio' is deprecated from 0.4") +@pytest.mark.filterwarnings("ignore:'sampling_strategy' as a dict for") +@pytest.mark.filterwarnings("ignore:Class EasyEnsemble is deprecated") +@pytest.mark.filterwarnings('ignore:"kind" is deprecated in 0.4 and will be') +@pytest.mark.filterwarnings('ignore:"svm_estimator" is deprecated in 0.4 and') +@pytest.mark.filterwarnings('ignore:"out_step" is deprecated in 0.4 and') +@pytest.mark.filterwarnings('ignore:"m_neighbors" is deprecated in 0.4 and') +@pytest.mark.filterwarnings("ignore:'y' should be of types") @pytest.mark.parametrize( 'name, Estimator, check', _generate_checks_per_estimator(_yield_all_checks, _tested_non_meta_estimators()) ) -def test_non_meta_estimators(name, Estimator, check): +def test_samplers(name, Estimator, check): # input validation etc for non-meta estimators check(name, Estimator) diff --git a/imblearn/tests/test_pipeline.py b/imblearn/tests/test_pipeline.py index 6033f0617..855383be6 100644 --- a/imblearn/tests/test_pipeline.py +++ b/imblearn/tests/test_pipeline.py @@ -191,7 +191,7 @@ def test_pipeline_init(): repr(pipe) # Test with two objects - clf = SVC() + clf = SVC(gamma='scale') filter1 = SelectKBest(f_classif) pipe = Pipeline([('anova', filter1), ('svc', clf)]) @@ -239,7 +239,7 @@ def test_pipeline_methods_anova(): X = iris.data y = iris.target # Test with Anova + LogisticRegression - clf = LogisticRegression() + clf = LogisticRegression(solver='lbfgs', multi_class='auto') filter1 = SelectKBest(f_classif, k=2) pipe = Pipeline([('anova', filter1), ('logistic', clf)]) pipe.fit(X, y) @@ -302,7 +302,7 @@ def test_pipeline_methods_pca_svm(): X = iris.data y = iris.target # Test with PCA + SVC - clf = SVC(probability=True, random_state=0) + clf = SVC(gamma='scale', probability=True, random_state=0) pca = PCA(svd_solver='full', n_components='mle', whiten=True) pipe = Pipeline([('pca', pca), ('svc', clf)]) pipe.fit(X, y) @@ -321,7 +321,8 @@ def test_pipeline_methods_preprocessing_svm(): n_classes = len(np.unique(y)) scaler = StandardScaler() pca = PCA(n_components=2, svd_solver='randomized', whiten=True) - clf = SVC(probability=True, random_state=0, decision_function_shape='ovr') + clf = SVC(gamma='scale', probability=True, random_state=0, + decision_function_shape='ovr') for preprocessing in [scaler, pca]: pipe = Pipeline([('preprocess', preprocessing), ('svc', clf)]) @@ -577,7 +578,9 @@ def test_classes_property(): with raises(AttributeError): getattr(reg, "classes_") - clf = make_pipeline(SelectKBest(k=1), LogisticRegression(random_state=0)) + clf = make_pipeline(SelectKBest(k=1), + LogisticRegression(solver='lbfgs', multi_class='auto', + random_state=0)) with raises(AttributeError): getattr(clf, "classes_") clf.fit(X, y) @@ -593,9 +596,9 @@ def test_pipeline_wrong_memory(): # Define memory as an integer memory = 1 cached_pipe = Pipeline( - [('transf', DummyTransf()), ('svc', SVC())], memory=memory) - error_regex = ("'memory' should either be a string or a joblib.Memory" - " instance, got 'memory=1' instead.") + [('transf', DummyTransf()), ('svc', SVC(gamma='scale'))], + memory=memory) + error_regex = ("string or have the same interface as sklearn.utils.Memory") with raises(ValueError, match=error_regex): cached_pipe.fit(X, y) @@ -606,9 +609,9 @@ def test_pipeline_memory_transformer(): y = iris.target cachedir = mkdtemp() try: - memory = Memory(cachedir=cachedir, verbose=10) + memory = Memory(cachedir, verbose=10) # Test with Transformer + SVC - clf = SVC(probability=True, random_state=0) + clf = SVC(gamma='scale', probability=True, random_state=0) transf = DummyTransf() pipe = Pipeline([('transf', clone(transf)), ('svc', clf)]) cached_pipe = Pipeline( @@ -642,7 +645,7 @@ def test_pipeline_memory_transformer(): assert cached_pipe.named_steps['transf'].timestamp_ == expected_ts # Create a new pipeline with cloned estimators # Check that even changing the name step does not affect the cache hit - clf_2 = SVC(probability=True, random_state=0) + clf_2 = SVC(gamma='scale', probability=True, random_state=0) transf_2 = DummyTransf() cached_pipe_2 = Pipeline( [('transf_2', transf_2), ('svc', clf_2)], memory=memory) @@ -676,9 +679,9 @@ def test_pipeline_memory_sampler(): random_state=0) cachedir = mkdtemp() try: - memory = Memory(cachedir=cachedir, verbose=10) + memory = Memory(cachedir, verbose=10) # Test with Transformer + SVC - clf = SVC(probability=True, random_state=0) + clf = SVC(gamma='scale', probability=True, random_state=0) transf = DummySampler() pipe = Pipeline([('transf', clone(transf)), ('svc', clf)]) cached_pipe = Pipeline( @@ -712,7 +715,7 @@ def test_pipeline_memory_sampler(): assert cached_pipe.named_steps['transf'].timestamp_ == expected_ts # Create a new pipeline with cloned estimators # Check that even changing the name step does not affect the cache hit - clf_2 = SVC(probability=True, random_state=0) + clf_2 = SVC(gamma='scale', probability=True, random_state=0) transf_2 = DummySampler() cached_pipe_2 = Pipeline( [('transf_2', transf_2), ('svc', clf_2)], memory=memory) @@ -747,7 +750,7 @@ def test_pipeline_methods_pca_rus_svm(): random_state=0) # Test with PCA + SVC - clf = SVC(probability=True, random_state=0) + clf = SVC(gamma='scale', probability=True, random_state=0) pca = PCA() rus = RandomUnderSampler(random_state=0) pipe = Pipeline([('pca', pca), ('rus', rus), ('svc', clf)]) @@ -773,7 +776,7 @@ def test_pipeline_methods_rus_pca_svm(): random_state=0) # Test with PCA + SVC - clf = SVC(probability=True, random_state=0) + clf = SVC(gamma='scale', probability=True, random_state=0) pca = PCA() rus = RandomUnderSampler(random_state=0) pipe = Pipeline([('rus', rus), ('pca', pca), ('svc', clf)]) @@ -858,7 +861,7 @@ def test_pipeline_none_classifier(): n_clusters_per_class=1, n_samples=5000, random_state=0) - clf = LogisticRegression(random_state=0) + clf = LogisticRegression(solver='lbfgs', random_state=0) pipe = make_pipeline(None, clf) pipe.fit(X, y) pipe.predict(X) @@ -880,7 +883,7 @@ def test_pipeline_none_sampler_classifier(): n_clusters_per_class=1, n_samples=5000, random_state=0) - clf = LogisticRegression(random_state=0) + clf = LogisticRegression(solver='lbfgs', random_state=0) rus = RandomUnderSampler(random_state=0) pipe = make_pipeline(None, rus, clf) pipe.fit(X, y) @@ -903,7 +906,7 @@ def test_pipeline_sampler_none_classifier(): n_clusters_per_class=1, n_samples=5000, random_state=0) - clf = LogisticRegression(random_state=0) + clf = LogisticRegression(solver='lbfgs', random_state=0) rus = RandomUnderSampler(random_state=0) pipe = make_pipeline(rus, None, clf) pipe.fit(X, y) @@ -969,7 +972,7 @@ def test_pipeline_methods_anova_rus(): n_samples=5000, random_state=0) # Test with RandomUnderSampling + Anova + LogisticRegression - clf = LogisticRegression() + clf = LogisticRegression(solver='lbfgs') rus = RandomUnderSampler(random_state=0) filter1 = SelectKBest(f_classif, k=2) pipe = Pipeline([('rus', rus), ('anova', filter1), ('logistic', clf)]) @@ -994,7 +997,7 @@ def test_pipeline_with_step_that_implements_both_sample_and_transform(): n_samples=5000, random_state=0) - clf = LogisticRegression() + clf = LogisticRegression(solver='lbfgs') with raises(TypeError): Pipeline([('step', FitTransformSample()), ('logistic', clf)]) @@ -1013,7 +1016,7 @@ def test_pipeline_with_step_that_it_is_pipeline(): n_samples=5000, random_state=0) # Test with RandomUnderSampling + Anova + LogisticRegression - clf = LogisticRegression() + clf = LogisticRegression(solver='lbfgs') rus = RandomUnderSampler(random_state=0) filter1 = SelectKBest(f_classif, k=2) pipe1 = Pipeline([('rus', rus), ('anova', filter1)]) @@ -1074,10 +1077,11 @@ def test_pipeline_fit_then_sample_3_samplers_with_sampler_last_estimator(): def test_make_pipeline_memory(): cachedir = mkdtemp() try: - memory = Memory(cachedir=cachedir, verbose=10) - pipeline = make_pipeline(DummyTransf(), SVC(), memory=memory) + memory = Memory(cachedir, verbose=10) + pipeline = make_pipeline(DummyTransf(), SVC(gamma='scale'), + memory=memory) assert pipeline.memory is memory - pipeline = make_pipeline(DummyTransf(), SVC()) + pipeline = make_pipeline(DummyTransf(), SVC(gamma='scale')) assert pipeline.memory is None finally: shutil.rmtree(cachedir) diff --git a/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py b/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py index 69624db31..571cb3da8 100644 --- a/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py +++ b/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py @@ -17,15 +17,15 @@ from sklearn.model_selection import StratifiedKFold from sklearn.utils import safe_indexing -from ..base import BaseCleaningSampler +from ..base import BaseUnderSampler from ...utils import Substitution from ...utils._docstring import _random_state_docstring @Substitution( - sampling_strategy=BaseCleaningSampler._sampling_strategy_docstring, + sampling_strategy=BaseUnderSampler._sampling_strategy_docstring, random_state=_random_state_docstring) -class InstanceHardnessThreshold(BaseCleaningSampler): +class InstanceHardnessThreshold(BaseUnderSampler): """Class to perform under-sampling based on the instance hardness threshold. @@ -91,7 +91,7 @@ class InstanceHardnessThreshold(BaseCleaningSampler): >>> iht = InstanceHardnessThreshold(random_state=42) >>> X_res, y_res = iht.fit_resample(X, y) >>> print('Resampled dataset shape %s' % Counter(y_res)) - Resampled dataset shape Counter({{1: 840, 0: 100}}) + Resampled dataset shape Counter({{1: 574, 0: 100}}) """ @@ -120,7 +120,8 @@ def _validate_estimator(self): self.estimator_ = clone(self.estimator) elif self.estimator is None: self.estimator_ = RandomForestClassifier( - random_state=self.random_state, n_jobs=self.n_jobs) + n_estimators=100, random_state=self.random_state, + n_jobs=self.n_jobs) else: raise ValueError('Invalid parameter `estimator`. Got {}.'.format( type(self.estimator))) diff --git a/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py b/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py index c23f547e8..c3b1663ba 100644 --- a/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py +++ b/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py @@ -87,7 +87,7 @@ class OneSidedSelection(BaseCleaningSampler): >>> oss = OneSidedSelection(random_state=42) >>> X_res, y_res = oss.fit_resample(X, y) >>> print('Resampled dataset shape %s' % Counter(y_res)) - Resampled dataset shape Counter({{1: 495, 0: 100}}) + Resampled dataset shape Counter({{1: 496, 0: 100}}) """ @@ -119,7 +119,7 @@ def _validate_estimator(self): self.estimator_ = clone(self.n_neighbors) else: raise ValueError('`n_neighbors` has to be a int or an object' - ' inhereited from KNeighborsClassifier.' + ' inherited from KNeighborsClassifier.' ' Got {} instead.'.format(type(self.n_neighbors))) def _fit_resample(self, X, y): @@ -135,10 +135,10 @@ def _fit_resample(self, X, y): if target_class in self.sampling_strategy_.keys(): # select a sample from the current class idx_maj = np.flatnonzero(y == target_class) - idx_maj_sample = idx_maj[random_state.randint( - low=0, - high=target_stats[target_class], - size=self.n_seeds_S)] + sel_idx_maj = random_state.randint( + low=0, high=target_stats[target_class], + size=self.n_seeds_S) + idx_maj_sample = idx_maj[sel_idx_maj] minority_class_indices = np.flatnonzero(y == class_minority) C_indices = np.append(minority_class_indices, idx_maj_sample) @@ -150,7 +150,7 @@ def _fit_resample(self, X, y): # create the set S with removing the seed from S # since that it will be added anyway - idx_maj_extracted = np.delete(idx_maj, idx_maj_sample, axis=0) + idx_maj_extracted = np.delete(idx_maj, sel_idx_maj, axis=0) S_x = safe_indexing(X, idx_maj_extracted) S_y = safe_indexing(y, idx_maj_extracted) self.estimator_.fit(C_x, C_y) @@ -169,7 +169,8 @@ def _fit_resample(self, X, y): # apply Tomek cleaning tl = TomekLinks( - sampling_strategy=self.sampling_strategy_, return_indices=True) + sampling_strategy=list(self.sampling_strategy_.keys()), + return_indices=True) X_cleaned, y_cleaned, idx_cleaned = tl.fit_resample( X_resampled, y_resampled) diff --git a/imblearn/utils/deprecation.py b/imblearn/utils/deprecation.py index 5b470652e..3aa74b2cd 100644 --- a/imblearn/utils/deprecation.py +++ b/imblearn/utils/deprecation.py @@ -33,24 +33,22 @@ def deprecate_parameter(sampler, None """ - warnings.simplefilter("always", DeprecationWarning) x, y = version_deprecation.split('.') version_removed = x + '.' + str(int(y) + 2) if new_param is None: if getattr(sampler, param_deprecated) is not None: warnings.warn( - "In the estimator {}, the parameter '{}' is" - " deprecated from {} and will be removed in" - " {}.".format(sampler.__class__, param_deprecated, - version_deprecation, version_removed), + "'{}' is deprecated from {} and will be removed in" + " {} for the estimator {}." + .format(param_deprecated, version_deprecation, + version_removed, sampler.__class__), category=DeprecationWarning) else: if getattr(sampler, param_deprecated) is not None: warnings.warn( - "In the estimator {}, the parameter '{}' is" - "deprecated from {} and will be removed in" - " {}. Use '{}' instead.".format( - sampler.__class__, param_deprecated, version_deprecation, - version_removed, new_param), + "'{}' is deprecated from {} and will be removed in" + " {} for the estimator {}. Use '{}' instead." + .format(param_deprecated, version_deprecation, + version_removed, sampler.__class__, new_param), category=DeprecationWarning) setattr(sampler, new_param, getattr(sampler, param_deprecated)) diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index ac6cc2b21..26794638e 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -37,6 +37,10 @@ DONT_SUPPORT_RATIO = ['SVMSMOTE', 'BorderlineSMOTE'] SUPPORT_STRING = ['RandomUnderSampler', 'RandomOverSampler'] +# FIXME: remove in 0.6 +DONT_HAVE_RANDOM_STATE = ('NearMiss', 'EditedNearestNeighbours', + 'RepeatedEditedNearestNeighbours', 'AllKNN', + 'NeighbourhoodCleaningRule', 'TomekLinks') def monkey_patch_check_dtype_object(name, estimator_orig): @@ -82,7 +86,7 @@ def _yield_all_checks(name, estimator): yield check -def check_estimator(Estimator): +def check_estimator(Estimator, run_sampler_tests=True): """Check if estimator adheres to scikit-learn conventions and imbalanced-learn @@ -94,7 +98,10 @@ def check_estimator(Estimator): Parameters ---------- Estimator : class - Class to check. Estimator is a class object (not an instance). + Class to check. Estimator is a class object (not an instance) + + run_sampler_tests=True : bool, default=True + Will run or not the samplers tests. """ name = Estimator.__name__ # monkey patch check_dtype_object for the sampler allowing strings @@ -104,15 +111,18 @@ def check_estimator(Estimator): # scikit-learn common tests sklearn_check_estimator(Estimator) check_parameters_default_constructible(name, Estimator) - for check in _yield_all_checks(name, Estimator): - check(name, Estimator) + if run_sampler_tests: + for check in _yield_all_checks(name, Estimator): + check(name, Estimator) def check_target_type(name, Estimator): X = np.random.random((20, 2)) y = np.linspace(0, 1, 20) estimator = Estimator() - set_random_state(estimator) + # FIXME: in 0.6 set the random_state for all + if name not in DONT_HAVE_RANDOM_STATE: + set_random_state(estimator) with warns(UserWarning, match='should be of types'): estimator.fit(X, y) @@ -162,7 +172,13 @@ def check_samplers_fit_resample(name, Sampler): assert all(value >= n_samples for value in Counter(y_res).values()) elif isinstance(sampler, BaseUnderSampler): n_samples = min(target_stats.values()) - assert all(value == n_samples for value in Counter(y_res).values()) + if name == 'InstanceHardnessThreshold': + # IHT does not enforce the number of samples but provide a number + # of samples the closest to the desired target. + assert all(Counter(y_res)[k] <= target_stats[k] + for k in target_stats.keys()) + else: + assert all(value == n_samples for value in Counter(y_res).values()) elif isinstance(sampler, BaseCleaningSampler): target_stats_res = Counter(y_res) class_minority = min(target_stats, key=target_stats.get) @@ -232,7 +248,7 @@ def check_samplers_sampling_strategy_fit_resample(name, Sampler): X_res, y_res = sampler.fit_resample(X, y) assert Counter(y_res)[1] == expected_stat elif isinstance(sampler, BaseCleaningSampler): - sampling_strategy = {2: 201, 0: 201} + sampling_strategy = [2, 0] sampler.set_params(sampling_strategy=sampling_strategy) X_res, y_res = sampler.fit_resample(X, y) assert Counter(y_res)[1] == expected_stat @@ -273,7 +289,9 @@ def check_samplers_sparse(name, Sampler): samplers = [Sampler()] for sampler in samplers: - set_random_state(sampler) + # FIXME: in 0.6 set the random_state for all + if name not in DONT_HAVE_RANDOM_STATE: + set_random_state(sampler) X_res_sparse, y_res_sparse = sampler.fit_resample(X_sparse, y) X_res, y_res = sampler.fit_resample(X, y) if not isinstance(sampler, BaseEnsembleSampler): @@ -312,7 +330,9 @@ def check_samplers_pandas(name, Sampler): samplers = [Sampler()] for sampler in samplers: - set_random_state(sampler) + # FIXME: in 0.6 set the random_state for all + if name not in DONT_HAVE_RANDOM_STATE: + set_random_state(sampler) X_res_pd, y_res_pd = sampler.fit_resample(X_pd, y) X_res, y_res = sampler.fit_resample(X, y) assert_allclose(X_res_pd, X_res) @@ -329,7 +349,9 @@ def check_samplers_multiclass_ova(name, Sampler): random_state=0) y_ova = label_binarize(y, np.unique(y)) sampler = Sampler() - set_random_state(sampler) + # FIXME: in 0.6 set the random_state for all + if name not in DONT_HAVE_RANDOM_STATE: + set_random_state(sampler) X_res, y_res = sampler.fit_resample(X, y) X_res_ova, y_res_ova = sampler.fit_resample(X, y_ova) assert_allclose(X_res, X_res_ova) @@ -353,7 +375,9 @@ def check_samplers_preserve_dtype(name, Sampler): X = X.astype(np.float32) y = y.astype(np.int32) sampler = Sampler() - set_random_state(sampler) + # FIXME: in 0.6 set the random_state for all + if name not in DONT_HAVE_RANDOM_STATE: + set_random_state(sampler) X_res, y_res = sampler.fit_resample(X, y) assert X.dtype == X_res.dtype, "X dtype is not preserved" assert y.dtype == y_res.dtype, "y dtype is not preserved" diff --git a/imblearn/utils/tests/test_estimator_checks.py b/imblearn/utils/tests/test_estimator_checks.py index 50e25dfe8..134ee79c2 100644 --- a/imblearn/utils/tests/test_estimator_checks.py +++ b/imblearn/utils/tests/test_estimator_checks.py @@ -65,6 +65,8 @@ def _fit_resample(self, X, y): return X.astype(np.float64), y.astype(np.int64) +@pytest.mark.filterwarnings("ignore:'y' should be of types") +@pytest.mark.filterwarnings("ignore: Can't check dok sparse matrix for nan") @pytest.mark.parametrize( 'Estimator, err_type, err_msg', [(BaseBadSampler, AssertionError, "TypeError not raised by fit"), diff --git a/setup.cfg b/setup.cfg index 16c942c1a..c95cbb171 100644 --- a/setup.cfg +++ b/setup.cfg @@ -2,13 +2,13 @@ current_version = 0.4.0.dev0 tag = False parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? -serialize = +serialize = {major}.{minor}.{patch}.{release}{dev} {major}.{minor}.{patch} [bumpversion:part:release] optional_value = gamma -values = +values = dev gamma @@ -22,4 +22,9 @@ values = test = pytest [tool:pytest] -addopts = --doctest-modules +addopts = + --doctest-modules + +filterwarnings = + error::DeprecationWarning + error::FutureWarning \ No newline at end of file