scikit-learn-contrib · glemaitre · Jan 16, 2022 · Sep 2, 2021 · Sep 2, 2021 · Sep 9, 2021
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
@@ -207,6 +207,15 @@ jobs:
         TEST_DOCS: 'true'
         TEST_DOCSTRINGS: 'false'  # it is going to fail because of scikit-learn inheritance
         CHECK_WARNINGS: 'true'
+      pylatest_conda_cuml:
+        DISTRIB: 'mamba-cuml'
+        CONDA_CHANNEL: 'defaults'
+        PYTHON_VERSION: '3.8'
+        BLAZINGSQL_VERSION: 'min'
+        CUML_VERSION: 'min'
+        CUDATOOLKIT_VERSION: 'min'
+        TEST_DOCS: 'true'
+        TEST_DOCSTRINGS: 'true'
 
 - template: build_tools/azure/posix-docker.yml
   parameters:

diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh
@@ -6,8 +6,10 @@ set -x
 UNAMESTR=`uname`
 
 make_conda() {
+    conda update -yq conda
     TO_INSTALL="$@"
     if [[ "$DISTRIB" == *"mamba"* ]]; then
+        conda install -yq mamba -c conda-forge
         mamba create -n $VIRTUALENV --yes $TO_INSTALL
     else
         conda config --show
@@ -104,6 +106,16 @@ elif [[ "$DISTRIB" == "conda-minimum-keras" ]]; then
     TO_INSTALL="$TO_INSTALL $(get_dep keras $KERAS_VERSION)"
     make_conda $TO_INSTALL
 
+elif [[ "$DISTRIB" == "mamba-cuml" ]]; then
+    TO_INSTALL="-c rapidsai -c nvidia -c conda-forge python=$PYTHON_VERSION"
+    TO_INSTALL="$TO_INSTALL $(get_dep numpy $NUMPY_VERSION)"
+    TO_INSTALL="$TO_INSTALL $(get_dep scipy $SCIPY_VERSION)"
+    TO_INSTALL="$TO_INSTALL $(get_dep scikit-learn $SKLEARN_VERSION)"
+    TO_INSTALL="$TO_INSTALL $(get_dep blazingsql $BLAZINGSQL_VERSION)"
+    TO_INSTALL="$TO_INSTALL $(get_dep cuml $CUML_VERSION)"
+    TO_INSTALL="$TO_INSTALL $(get_dep cudatoolkit $CUDATOOLKIT_VERSION)"
+    make_conda $TO_INSTALL
+
 elif [[ "$DISTRIB" == "conda-pip-scipy-dev" ]]; then
     make_conda "python=$PYTHON_VERSION"
     python -m pip install -U pip

diff --git a/doc/developers_utils.rst b/doc/developers_utils.rst
@@ -29,7 +29,8 @@ which accepts arrays, matrices, or sparse matrices as arguments, the following
 should be used when applicable.
 
 - :func:`check_neighbors_object`: Check the objects is consistent to be a NN.
-- :func:`check_target_type`: Check the target types to be conform to the current sam  plers.
+- :func:`check_target_type`: Check the target types to be conform to the current
+  samplers.
 - :func:`check_sampling_strategy`: Checks that sampling target is onsistent with
   the type and return a dictionary containing each targeted class with its
   corresponding number of pixel.

diff --git a/imblearn/_min_dependencies.py b/imblearn/_min_dependencies.py
@@ -18,6 +18,9 @@
 JOBLIB_MIN_VERSION = "0.11"
 THREADPOOLCTL_MIN_VERSION = "2.0.0"
 PYTEST_MIN_VERSION = "5.0.1"
+CUML_MIN_VERSION = "21.10"
+BLAZINGSQL_MIN_VERSION = "21.10"
+CUDATOOLKIT_MIN_VERSION = "11.0"
 
 
 # 'build' and 'install' is included to have structured metadata for CI.
@@ -32,6 +35,9 @@
     "pandas": (PANDAS_MIN_VERSION, "optional, docs, examples, tests"),
     "tensorflow": (TENSORFLOW_MIN_VERSION, "optional, docs, examples, tests"),
     "keras": (KERAS_MIN_VERSION, "optional, docs, examples, tests"),
+    "cuml": (CUML_MIN_VERSION, "optional, docs, examples, tests"),
+    "blazingsql": (BLAZINGSQL_MIN_VERSION, "optional, docs, examples, tests"),
+    "cudatoolkit": (CUDATOOLKIT_MIN_VERSION, "optional, docs, examples, tests"),
     "matplotlib": ("2.2.3", "docs, examples"),
     "seaborn": ("0.9.0", "docs, examples"),
     "memory_profiler": ("0.57.0", "docs"),

diff --git a/imblearn/over_sampling/_smote/filter.py b/imblearn/over_sampling/_smote/filter.py
@@ -15,7 +15,6 @@
 from sklearn.utils import _safe_indexing
 
 from ..base import BaseOverSampler
-from ...exceptions import raise_isinstance_error
 from ...utils import check_neighbors_object
 from ...utils import Substitution
 from ...utils._docstring import _n_jobs_docstring
@@ -278,6 +277,8 @@ class SVMSMOTE(BaseSMOTE):
 
     svm_estimator : estimator object, default=SVC()
         A parametrized :class:`~sklearn.svm.SVC` classifier can be passed.
+        A scikit-learn compatible estimator can be passed but it is required
+        to expose a `support_` fitted attribute.
 
     out_step : float, default=0.5
         Step size when extrapolating.
@@ -385,10 +386,8 @@ def _validate_estimator(self):
 
         if self.svm_estimator is None:
             self.svm_estimator_ = SVC(gamma="scale", random_state=self.random_state)
-        elif isinstance(self.svm_estimator, SVC):
-            self.svm_estimator_ = clone(self.svm_estimator)
         else:
-            raise_isinstance_error("svm_estimator", [SVC], self.svm_estimator)
+            self.svm_estimator_ = clone(self.svm_estimator)
 
     def _fit_resample(self, X, y):
         self._validate_estimator()
@@ -403,6 +402,12 @@ def _fit_resample(self, X, y):
             X_class = _safe_indexing(X, target_class_indices)
 
             self.svm_estimator_.fit(X, y)
+            if not hasattr(self.svm_estimator_, "support_"):
+                raise RuntimeError(
+                    "`svm_estimator` is required to exposed a `support_` fitted "
+                    "attribute. Such estimator belongs to the familly of Support "
+                    "Vector Machine."
+                )
             support_index = self.svm_estimator_.support_[
                 y[self.svm_estimator_.support_] == class_sample
             ]

diff --git a/imblearn/over_sampling/_smote/tests/test_smote.py b/imblearn/over_sampling/_smote/tests/test_smote.py
@@ -164,3 +164,43 @@ def test_smote_m_neighbors(smote):
     _ = smote.fit_resample(X, Y)
     assert smote.nn_k_.n_neighbors == 6
     assert smote.nn_m_.n_neighbors == 11
+
+
+def test_sample_cuml_with_nn():
+    cuml = pytest.importorskip("cuml")
+    nn_k = cuml.neighbors.NearestNeighbors(n_neighbors=2)
+    smote = SMOTE(random_state=RND_SEED, k_neighbors=nn_k)
+    X_resampled, y_resampled = smote.fit_resample(X, Y)
+    X_gt = np.array(
+        [
+            [0.11622591, -0.0317206],
+            [0.77481731, 0.60935141],
+            [1.25192108, -0.22367336],
+            [0.53366841, -0.30312976],
+            [1.52091956, -0.49283504],
+            [-0.28162401, -2.10400981],
+            [0.83680821, 1.72827342],
+            [0.3084254, 0.33299982],
+            [0.70472253, -0.73309052],
+            [0.28893132, -0.38761769],
+            [1.15514042, 0.0129463],
+            [0.88407872, 0.35454207],
+            [1.31301027, -0.92648734],
+            [-1.11515198, -0.93689695],
+            [-0.18410027, -0.45194484],
+            [0.9281014, 0.53085498],
+            [-0.14374509, 0.27370049],
+            [-0.41635887, -0.38299653],
+            [0.08711622, 0.93259929],
+            [1.70580611, -0.11219234],
+            [1.10580062, 0.00601499],
+            [1.60506454, -0.31959815],
+            [1.40109204, -0.74276846],
+            [0.38584956, -0.20702218],
+        ]
+    )
+    y_gt = np.array(
+        [0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0]
+    )
+    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
+    assert_array_equal(y_resampled, y_gt)
diff --git a/imblearn/over_sampling/_smote/tests/test_svm_smote.py b/imblearn/over_sampling/_smote/tests/test_svm_smote.py
@@ -1,6 +1,7 @@
 import pytest
 import numpy as np
 
+from sklearn.linear_model import LogisticRegression
 from sklearn.neighbors import NearestNeighbors
 from sklearn.svm import SVC
 
@@ -54,3 +55,12 @@ def test_svm_smote(data):
 
     assert_allclose(X_res_1, X_res_2)
     assert_array_equal(y_res_1, y_res_2)
+
+
+def test_svm_smote_not_svm(data):
+    """Check that we raise a proper error if passing an estimator that does not
+    expose a `support_` fitted attribute."""
+
+    err_msg = "`svm_estimator` is required to exposed a `support_` fitted attribute."
+    with pytest.raises(RuntimeError, match=err_msg):
+        SVMSMOTE(svm_estimator=LogisticRegression()).fit_resample(*data)
diff --git a/imblearn/over_sampling/tests/test_adasyn.py b/imblearn/over_sampling/tests/test_adasyn.py
@@ -131,10 +131,57 @@ def test_ada_fit_resample_nn_obj():
             {"sampling_strategy": {0: 9, 1: 12}},
             "No samples will be generated.",
         ),
-        ({"n_neighbors": "rnd"}, "has to be one of"),
+        (
+            {"n_neighbors": "rnd"},
+            "n_neighbors must be an interger or an object compatible with the "
+            "KNeighborsMixin API of scikit-learn",
+        ),
     ],
 )
 def test_adasyn_error(adasyn_params, err_msg):
     adasyn = ADASYN(**adasyn_params)
     with pytest.raises(ValueError, match=err_msg):
         adasyn.fit_resample(X, Y)
+
+
+def test_ada_fit_resample_cuml_nn_obj():
+    cuml = pytest.importorskip("cuml")
+    nn = cuml.neighbors.NearestNeighbors(n_neighbors=2)
+    ada = ADASYN(random_state=RND_SEED, n_neighbors=nn)
+    X_resampled, y_resampled = ada.fit_resample(X, Y)
+    X_gt = np.array(
+        [
+            [0.11622591, -0.0317206],
+            [0.77481731, 0.60935141],
+            [1.25192108, -0.22367336],
+            [0.53366841, -0.30312976],
+            [1.52091956, -0.49283504],
+            [-0.28162401, -2.10400981],
+            [0.83680821, 1.72827342],
+            [0.3084254, 0.33299982],
+            [0.70472253, -0.73309052],
+            [0.28893132, -0.38761769],
+            [1.15514042, 0.0129463],
+            [0.88407872, 0.35454207],
+            [1.31301027, -0.92648734],
+            [-1.11515198, -0.93689695],
+            [-0.18410027, -0.45194484],
+            [0.9281014, 0.53085498],
+            [-0.14374509, 0.27370049],
+            [-0.41635887, -0.38299653],
+            [0.08711622, 0.93259929],
+            [1.70580611, -0.11219234],
+            [0.34532399, -0.18067361],
+            [1.44430593, -0.41617493],
+            [0.28204936, -0.13953426],
+            [1.08450984, 0.03948221],
+            [-0.19072677, -0.2341768],
+        ]
+    )
+
+    y_gt = np.array(
+        [0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0]
+    )
+
+    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
+    assert_array_equal(y_resampled, y_gt)
diff --git a/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py b/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py
@@ -49,7 +49,8 @@ class ClusterCentroids(BaseUnderSampler):
     {random_state}
 
     estimator : estimator object, default=None
-        Pass a :class:`~sklearn.cluster.KMeans` estimator. By default, it will
+        A scikit-learn compatible clustering method that exposes a `n_clusters`
+        parameter and a `cluster_centers_` fitted attribute. By default, it will
         be a default :class:`~sklearn.cluster.KMeans` estimator.
 
     voting : {{"hard", "soft", "auto"}}, default='auto'
@@ -141,13 +142,13 @@ def _validate_estimator(self):
             )
         if self.estimator is None:
             self.estimator_ = KMeans(random_state=self.random_state)
-        elif isinstance(self.estimator, KMeans):
-            self.estimator_ = clone(self.estimator)
         else:
-            raise ValueError(
-                f"`estimator` has to be a KMeans clustering."
-                f" Got {type(self.estimator)} instead."
-            )
+            self.estimator_ = clone(self.estimator)
+            if "n_clusters" not in self.estimator_.get_params():
+                raise ValueError(
+                    "`estimator` should be a clustering estimator exposing a parameter"
+                    " `n_clusters` and a fitted parameter `cluster_centers_`."
+                )
 
     def _generate_sample(self, X, y, centroids, target_class):
         if self.voting_ == "hard":
@@ -188,6 +189,11 @@ def _fit_resample(self, X, y):
                 n_samples = self.sampling_strategy_[target_class]
                 self.estimator_.set_params(**{"n_clusters": n_samples})
                 self.estimator_.fit(_safe_indexing(X, target_class_indices))
+                if not hasattr(self.estimator_, "cluster_centers_"):
+                    raise RuntimeError(
+                        "`estimator` should be a clustering estimator exposing a "
+                        "fitted parameter `cluster_centers_`."
+                    )
                 X_new, y_new = self._generate_sample(
                     _safe_indexing(X, target_class_indices),
                     _safe_indexing(y, target_class_indices),

diff --git a/imblearn/under_sampling/_prototype_generation/tests/test_cluster_centroids.py b/imblearn/under_sampling/_prototype_generation/tests/test_cluster_centroids.py
@@ -5,6 +5,8 @@
 import numpy as np
 from scipy import sparse
 
+from sklearn.base import BaseEstimator
+from sklearn.linear_model import LogisticRegression
 from sklearn.cluster import KMeans
 from sklearn.datasets import make_classification
 
@@ -101,7 +103,6 @@ def test_fit_hard_voting():
 @pytest.mark.parametrize(
     "cluster_centroids_params, err_msg",
     [
-        ({"estimator": "rnd"}, "has to be a KMeans clustering"),
         ({"voting": "unknown"}, "needs to be one of"),
     ],
 )
@@ -152,3 +153,31 @@ def test_cluster_centroids_hard_target_class():
         for minority_sample in X_minority_class
     ]
     assert sum(sample_from_minority_in_majority) == 0
+
+
+class FakeCluster(BaseEstimator):
+    """Class that mimics a cluster that does not expose `cluster_centers_`."""
+
+    def __init__(self, n_clusters=1):
+        self.n_clusters = n_clusters
+
+    def fit(self, X, y=None):
+        return self
+
+
+def test_cluster_centroids_error_estimator():
+    """Check that an error is raised when estimator does not have a cluster API."""
+
+    err_msg = (
+        "`estimator` should be a clustering estimator exposing a parameter "
+        "`n_clusters` and a fitted parameter `cluster_centers_`."
+    )
+    with pytest.raises(ValueError, match=err_msg):
+        ClusterCentroids(estimator=LogisticRegression()).fit_resample(X, Y)
+
+    err_msg = (
+        "`estimator` should be a clustering estimator exposing a fitted parameter "
+        "`cluster_centers_`."
+    )
+    with pytest.raises(RuntimeError, match=err_msg):
+        ClusterCentroids(estimator=FakeCluster()).fit_resample(X, Y)
diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_edited_nearest_neighbours.py b/imblearn/under_sampling/_prototype_selection/tests/test_edited_nearest_neighbours.py
@@ -126,7 +126,11 @@ def test_enn_fit_resample_with_nn_object():
 def test_enn_not_good_object():
     nn = "rnd"
     enn = EditedNearestNeighbours(n_neighbors=nn, kind_sel="mode")
-    with pytest.raises(ValueError, match="has to be one of"):
+    err_msg = (
+        "n_neighbors must be an interger or an object compatible with the "
+        "KNeighborsMixin API of scikit-learn"
+    )
+    with pytest.raises(ValueError, match=err_msg):
         enn.fit_resample(X, Y)
 
 

diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_nearmiss.py b/imblearn/under_sampling/_prototype_selection/tests/test_nearmiss.py
@@ -39,14 +39,17 @@
     "nearmiss_params, err_msg",
     [
         ({"version": 1000}, "must be 1, 2 or 3"),
-        ({"version": 1, "n_neighbors": "rnd"}, "has to be one of"),
+        (
+            {"version": 1, "n_neighbors": "rnd"},
+            "n_neighbors must be an interger or an object compatible",
+        ),
         (
             {
                 "version": 3,
                 "n_neighbors": NearestNeighbors(n_neighbors=3),
                 "n_neighbors_ver3": "rnd",
             },
-            "has to be one of",
+            "n_neighbors_ver3 must be an interger or an object compatible",
         ),
     ],
 )

diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_neighbourhood_cleaning_rule.py b/imblearn/under_sampling/_prototype_selection/tests/test_neighbourhood_cleaning_rule.py
@@ -37,7 +37,10 @@
     [
         ({"threshold_cleaning": -10}, "value between 0 and 1"),
         ({"threshold_cleaning": 10}, "value between 0 and 1"),
-        ({"n_neighbors": "rnd"}, "has to be one of"),
+        (
+            {"n_neighbors": "rnd"},
+            "n_neighbors must be an interger or an object compatible",
+        ),
     ],
 )
 def test_ncr_error(ncr_params, err_msg):