diff --git a/appveyor.yml b/appveyor.yml index a3f62b7e8..4fd636df6 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -36,7 +36,7 @@ install: - "python -c \"import struct; print(struct.calcsize('P') * 8)\"" # Installed prebuilt dependencies from conda - - "conda install pip numpy scipy scikit-learn=0.19.0 nose wheel matplotlib -y -q" + - "conda install pip numpy scipy scikit-learn=0.19.0 pandas nose wheel matplotlib -y -q" # Install other nilearn dependencies - "pip install coverage nose-timer pytest pytest-cov" diff --git a/build_tools/travis/install.sh b/build_tools/travis/install.sh index 843aa5088..2b590e860 100755 --- a/build_tools/travis/install.sh +++ b/build_tools/travis/install.sh @@ -38,7 +38,7 @@ if [[ "$DISTRIB" == "conda" ]]; then # provided versions conda create -n testenv --yes python=$PYTHON_VERSION pip source activate testenv - conda install --yes numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION + conda install --yes numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION pandas if [[ "$SKLEARN_VERSION" == "master" ]]; then conda install --yes cython @@ -59,7 +59,7 @@ elif [[ "$DISTRIB" == "ubuntu" ]]; then # Create a new virtualenv using system site packages for python, numpy virtualenv --system-site-packages testvenv source testvenv/bin/activate - pip install scikit-learn nose nose-timer pytest pytest-cov codecov + pip install scikit-learn pandas nose nose-timer pytest pytest-cov codecov fi diff --git a/doc/combine.rst b/doc/combine.rst index 73319004e..e4cdc0c23 100644 --- a/doc/combine.rst +++ b/doc/combine.rst @@ -29,18 +29,18 @@ than their former samplers:: ... n_clusters_per_class=1, ... weights=[0.01, 0.05, 0.94], ... class_sep=0.8, random_state=0) - >>> print(Counter(y)) - Counter({2: 4674, 1: 262, 0: 64}) + >>> print(sorted(Counter(y).items())) + [(0, 64), (1, 262), (2, 4674)] >>> from imblearn.combine import SMOTEENN >>> smote_enn = SMOTEENN(random_state=0) >>> X_resampled, y_resampled = smote_enn.fit_sample(X, y) - >>> print(Counter(y_resampled)) - Counter({1: 4381, 0: 4060, 2: 3502}) + >>> print(sorted(Counter(y_resampled).items())) + [(0, 4060), (1, 4381), (2, 3502)] >>> from imblearn.combine import SMOTETomek >>> smote_tomek = SMOTETomek(random_state=0) >>> X_resampled, y_resampled = smote_tomek.fit_sample(X, y) - >>> print(Counter(y_resampled)) - Counter({1: 4566, 0: 4499, 2: 4413}) + >>> print(sorted(Counter(y_resampled).items())) + [(0, 4499), (1, 4566), (2, 4413)] We can also see in the example below that :class:`SMOTEENN` tends to clean more noisy samples than :class:`SMOTETomek`. diff --git a/doc/datasets/index.rst b/doc/datasets/index.rst index 7e9ac08e4..47ed6c7c4 100644 --- a/doc/datasets/index.rst +++ b/doc/datasets/index.rst @@ -85,8 +85,8 @@ A specific data set can be selected as:: >>> ecoli = fetch_datasets()['ecoli'] >>> ecoli.data.shape (336, 7) - >>> print(Counter((ecoli.target))) - Counter({-1: 301, 1: 35}) + >>> print(sorted(Counter(ecoli.target).items())) + [(-1, 301), (1, 35)] .. 
_make_imbalanced: @@ -104,16 +104,16 @@ samples in the class:: >>> iris = load_iris() >>> ratio = {0: 20, 1: 30, 2: 40} >>> X_imb, y_imb = make_imbalance(iris.data, iris.target, ratio=ratio) - >>> Counter(y_imb) - Counter({2: 40, 1: 30, 0: 20}) + >>> sorted(Counter(y_imb).items()) + [(0, 20), (1, 30), (2, 40)] Note that all samples of a class are passed-through if the class is not mentioned in the dictionary:: >>> ratio = {0: 10} >>> X_imb, y_imb = make_imbalance(iris.data, iris.target, ratio=ratio) - >>> Counter(y_imb) - Counter({1: 50, 2: 50, 0: 10}) + >>> sorted(Counter(y_imb).items()) + [(0, 10), (1, 50), (2, 50)] Instead of a dictionary, a function can be defined and directly pass to ``ratio``:: @@ -126,9 +126,8 @@ Instead of a dictionary, a function can be defined and directly pass to ... return target_stats >>> X_imb, y_imb = make_imbalance(iris.data, iris.target, ... ratio=ratio_multiplier) - >>> Counter(y_imb) - Counter({2: 47, 1: 35, 0: 25}) - + >>> sorted(Counter(y_imb).items()) + [(0, 25), (1, 35), (2, 47)] See :ref:`sphx_glr_auto_examples_datasets_plot_make_imbalance.py` and :ref:`sphx_glr_auto_examples_plot_ratio_usage.py`. diff --git a/doc/ensemble.rst b/doc/ensemble.rst index 41d909567..01846e039 100644 --- a/doc/ensemble.rst +++ b/doc/ensemble.rst @@ -19,15 +19,15 @@ under-sampling the original set:: ... n_clusters_per_class=1, ... weights=[0.01, 0.05, 0.94], ... class_sep=0.8, random_state=0) - >>> print(Counter(y)) - Counter({2: 4674, 1: 262, 0: 64}) + >>> print(sorted(Counter(y).items())) + [(0, 64), (1, 262), (2, 4674)] >>> from imblearn.ensemble import EasyEnsemble >>> ee = EasyEnsemble(random_state=0, n_subsets=10) >>> X_resampled, y_resampled = ee.fit_sample(X, y) >>> print(X_resampled.shape) (10, 192, 2) - >>> print(Counter(y_resampled[0])) # doctest: +SKIP - Counter({0: 64, 1: 64, 2: 64}) + >>> print(sorted(Counter(y_resampled[0]).items())) + [(0, 64), (1, 64), (2, 64)] :class:`EasyEnsemble` has two important parameters: (i) ``n_subsets`` will be used to return number of subset and (ii) ``replacement`` to randomly sample @@ -48,8 +48,8 @@ parameter ``n_max_subset`` and an additional bootstraping can be activated with >>> X_resampled, y_resampled = bc.fit_sample(X, y) >>> print(X_resampled.shape) (4, 192, 2) - >>> print(Counter(y_resampled[0])) # doctest: +SKIP - Counter({2: 64, 1: 64, 0: 64}) + >>> print(sorted(Counter(y_resampled[0]).items())) + [(0, 64), (1, 64), (2, 64)] See :ref:`sphx_glr_auto_examples_ensemble_plot_easy_ensemble.py` and diff --git a/doc/introduction.rst b/doc/introduction.rst new file mode 100644 index 000000000..0612c6807 --- /dev/null +++ b/doc/introduction.rst @@ -0,0 +1,61 @@ +.. _introduction: + +============ +Introduction +============ + +.. 
_api_imblearn: + +APIs of imbalanced-learn samplers +---------------------------------- + +The available samplers follow the scikit-learn API, using the base estimator and adding a sampling functionality through the ``sample`` method: + +:Estimator: + + The base object implements a ``fit`` method to learn from data:: + + estimator = obj.fit(data, targets) + +:Sampler: + + To resample a data set, each sampler implements:: + + data_resampled, targets_resampled = obj.sample(data, targets) + + Fitting and sampling can also be done in one step:: + + data_resampled, targets_resampled = obj.fit_sample(data, targets) + +Imbalanced-learn samplers accept the same inputs as scikit-learn estimators: + +* ``data``: array-like (2-D list, pandas.DataFrame, numpy.array) or sparse + matrices; +* ``targets``: array-like (1-D list, pandas.Series, numpy.array). + +.. topic:: Sparse input + + For sparse input the data is **converted to the Compressed Sparse Rows + representation** (see ``scipy.sparse.csr_matrix``) before being fed to the + sampler. To avoid unnecessary memory copies, it is recommended to choose the + CSR representation upstream. + +.. _problem_statement: + +Problem statement regarding imbalanced data sets +------------------------------------------------ + +The learning phase and the subsequent prediction of machine learning algorithms +can be affected by the problem of imbalanced data sets. The balancing issue +corresponds to the difference in the number of samples across the different +classes. We illustrate the effect of training a linear SVM classifier with +different levels of class balancing. + +.. image:: ./auto_examples/over-sampling/images/sphx_glr_plot_comparison_over_sampling_001.png + :target: ./auto_examples/over-sampling/plot_comparison_over_sampling.html + :scale: 60 + :align: center + +As expected, the decision function of the linear SVM is highly impacted. With a +greater imbalanced ratio, the decision function favors the class with the larger +number of samples, usually referred to as the majority class. diff --git a/doc/over_sampling.rst b/doc/over_sampling.rst index a8efdfa03..650d904fe 100644 --- a/doc/over_sampling.rst +++ b/doc/over_sampling.rst @@ -29,15 +29,15 @@ randomly sampling with replacement the current available samples. The >>> ros = RandomOverSampler(random_state=0) >>> X_resampled, y_resampled = ros.fit_sample(X, y) >>> from collections import Counter - >>> print(Counter(y_resampled)) # doctest: +SKIP - Counter({2: 4674, 1: 4674, 0: 4674}) + >>> print(sorted(Counter(y_resampled).items())) + [(0, 4674), (1, 4674), (2, 4674)] The augmented data set should be used instead of the original data set to train a classifier:: >>> from sklearn.svm import LinearSVC >>> clf = LinearSVC() - >>> clf.fit(X_resampled, y_resampled) # doctest: +ELLIPSIS + >>> clf.fit(X_resampled, y_resampled) # doctest: +ELLIPSIS LinearSVC(...)
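As a complement to the doctests above, here is a standalone sketch of the ``fit``/``sample``/``fit_sample`` API described in the new ``introduction.rst``. It assumes the 0.3-era ``fit_sample`` interface and the sparse pass-through this changeset adds; the printed values are illustrative, not documented outputs::

    from collections import Counter

    from scipy import sparse
    from sklearn.datasets import make_classification
    from imblearn.under_sampling import RandomUnderSampler

    # Build a small imbalanced problem and hand it to the sampler as CSR,
    # the recommended upstream format per the "Sparse input" topic above.
    X, y = make_classification(n_samples=1000, weights=[0.1, 0.9],
                               random_state=0)
    X_sparse = sparse.csr_matrix(X)

    rus = RandomUnderSampler(random_state=0)
    # fit and sample in one step, as documented above
    X_res, y_res = rus.fit_sample(X_sparse, y)

    print(sparse.issparse(X_res))          # True: the CSR format is kept
    print(sorted(Counter(y_res).items()))  # classes are now balanced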
In the figure below, we compare the decision functions of a classifier trained @@ -67,12 +67,12 @@ can be used in the same manner:: >>> from imblearn.over_sampling import SMOTE, ADASYN >>> X_resampled, y_resampled = SMOTE().fit_sample(X, y) - >>> print(Counter(y_resampled)) # doctest: +SKIP - Counter({2: 4674, 1: 4674, 0: 4674}) + >>> print(sorted(Counter(y_resampled).items())) + [(0, 4674), (1, 4674), (2, 4674)] >>> clf_smote = LinearSVC().fit(X_resampled, y_resampled) >>> X_resampled, y_resampled = ADASYN().fit_sample(X, y) - >>> print(Counter(y_resampled)) - Counter({2: 4674, 0: 4673, 1: 4662}) + >>> print(sorted(Counter(y_resampled).items())) + [(0, 4673), (1, 4662), (2, 4674)] >>> clf_adasyn = LinearSVC().fit(X_resampled, y_resampled) The figure below illustrates the major difference of the different over-sampling @@ -132,8 +132,8 @@ available: (i) ``'borderline1'``, (ii) ``'borderline2'``, and (iii) ``'svm'``:: >>> from imblearn.over_sampling import SMOTE, ADASYN >>> X_resampled, y_resampled = SMOTE(kind='borderline1').fit_sample(X, y) - >>> print(Counter(y_resampled)) # doctest: +SKIP - Counter({2: 4674, 1: 4674, 0: 4674}) + >>> print(sorted(Counter(y_resampled).items())) + [(0, 4674), (1, 4674), (2, 4674)] See :ref:`sphx_glr_auto_examples_over-sampling_plot_comparison_over_sampling.py` to see a comparison between the different over-sampling methods. diff --git a/doc/problem_statement.rst b/doc/problem_statement.rst deleted file mode 100644 index 7b1a87e88..000000000 --- a/doc/problem_statement.rst +++ /dev/null @@ -1,20 +0,0 @@ -.. _problem_statement: - -================= -Problem statement -================= - -The learning phase and the subsequent prediction of machine learning algorithms -can be affected by the problem of imbalanced data set. The balancing issue -corresponds to the difference of the number of samples in the different -classes. We illustrate the effect of training a linear SVM classifier with -different level of class balancing. - -.. image:: ./auto_examples/over-sampling/images/sphx_glr_plot_comparison_over_sampling_001.png - :target: ./auto_examples/over-sampling/plot_comparison_over_sampling.html - :scale: 60 - :align: center - -As expected, the decision function of the linear SVM is highly impacted. With a -greater imbalanced ratio, the decision function favor the class with the larger -number of samples, usually referred as the majority class. diff --git a/doc/under_sampling.rst b/doc/under_sampling.rst index 290b14614..9a2d20301 100644 --- a/doc/under_sampling.rst +++ b/doc/under_sampling.rst @@ -28,13 +28,13 @@ K-means method instead of the original samples:: ... n_clusters_per_class=1, ... weights=[0.01, 0.05, 0.94], ... class_sep=0.8, random_state=0) - >>> print(Counter(y)) - Counter({2: 4674, 1: 262, 0: 64}) + >>> print(sorted(Counter(y).items())) + [(0, 64), (1, 262), (2, 4674)] >>> from imblearn.under_sampling import ClusterCentroids >>> cc = ClusterCentroids(random_state=0) >>> X_resampled, y_resampled = cc.fit_sample(X, y) - >>> print(Counter(y_resampled)) - Counter({0: 64, 1: 64, 2: 64}) + >>> print(sorted(Counter(y_resampled).items())) + [(0, 64), (1, 64), (2, 64)] The figure below illustrates such under-sampling. @@ -49,6 +49,12 @@ your data are grouped into clusters. In addition, the number of centroids should be set such that the under-sampled clusters are representative of the original one. +.. warning:: + + :class:`ClusterCentroids` supports sparse matrices. However, the new samples + generated are not specifically sparse. 
Therefore, even if the resulting + matrix is sparse, the algorithm will be inefficient in this regard. + See :ref:`sphx_glr_auto_examples_under-sampling_plot_cluster_centroids.py` and :ref:`sphx_glr_auto_examples_under-sampling_plot_comparison_under_sampling.py`. @@ -77,8 +83,8 @@ randomly selecting a subset of data for the targeted classes:: >>> from imblearn.under_sampling import RandomUnderSampler >>> rus = RandomUnderSampler(random_state=0) >>> X_resampled, y_resampled = rus.fit_sample(X, y) - >>> print(Counter(y_resampled)) - Counter({0: 64, 1: 64, 2: 64}) + >>> print(sorted(Counter(y_resampled).items())) + [(0, 64), (1, 64), (2, 64)] .. image:: ./auto_examples/under-sampling/images/sphx_glr_plot_comparison_under_sampling_002.png :target: ./auto_examples/under-sampling/plot_comparison_under_sampling.html @@ -108,8 +114,8 @@ be selected with the parameter ``version``:: >>> from imblearn.under_sampling import NearMiss >>> nm1 = NearMiss(random_state=0, version=1) >>> X_resampled_nm1, y_resampled = nm1.fit_sample(X, y) - >>> print(Counter(y_resampled)) - Counter({0: 64, 1: 64, 2: 64}) + >>> print(sorted(Counter(y_resampled).items())) + [(0, 64), (1, 64), (2, 64)] As later stated in the next section, :class:`NearMiss` heuristic rules are based on nearest neighbors algorithm. Therefore, the parameters ``n_neighbors`` @@ -238,13 +244,13 @@ available: (i) the majority (i.e., ``kind_sel='mode'``) or (ii) all (i.e., ``kind_sel='all'``) the nearest-neighbors have to belong to the same class than the sample inspected to keep it in the dataset:: - >>> Counter(y) - Counter({2: 4674, 1: 262, 0: 64}) + >>> sorted(Counter(y).items()) + [(0, 64), (1, 262), (2, 4674)] >>> from imblearn.under_sampling import EditedNearestNeighbours >>> enn = EditedNearestNeighbours(random_state=0) >>> X_resampled, y_resampled = enn.fit_sample(X, y) - >>> print(Counter(y_resampled)) - Counter({2: 4568, 1: 213, 0: 64}) + >>> print(sorted(Counter(y_resampled).items())) + [(0, 64), (1, 213), (2, 4568)] The parameter ``n_neighbors`` allows to give a classifier subclassed from ``KNeighborsMixin`` from scikit-learn to find the nearest neighbors and make @@ -257,8 +263,8 @@ Generally, repeating the algorithm will delete more data:: >>> from imblearn.under_sampling import RepeatedEditedNearestNeighbours >>> renn = RepeatedEditedNearestNeighbours(random_state=0) >>> X_resampled, y_resampled = renn.fit_sample(X, y) - >>> print(Counter(y_resampled)) - Counter({2: 4551, 1: 208, 0: 64}) + >>> print(sorted(Counter(y_resampled).items())) + [(0, 64), (1, 208), (2, 4551)] :class:`AllKNN` differs from the previous :class:`RepeatedEditedNearestNeighbours` since the number of neighbors of the @@ -267,8 +273,8 @@ internal nearest neighbors algorithm is increased at each iteration:: >>> from imblearn.under_sampling import AllKNN >>> allknn = AllKNN(random_state=0) >>> X_resampled, y_resampled = allknn.fit_sample(X, y) - >>> print(Counter(y_resampled)) - Counter({2: 4601, 1: 220, 0: 64}) + >>> print(sorted(Counter(y_resampled).items())) + [(0, 64), (1, 220), (2, 4601)] In the example below, it can be seen that the three algorithms have similar impact by cleaning noisy samples next to the boundaries of the classes.
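The three cleaning samplers covered in this section can also be compared side by side. A hedged sketch, reusing the synthetic dataset from these doctests (the exact counts depend on the library version)::

    from collections import Counter

    from sklearn.datasets import make_classification
    from imblearn.under_sampling import (AllKNN, EditedNearestNeighbours,
                                         RepeatedEditedNearestNeighbours)

    # Same generator call as in the doctests above.
    X, y = make_classification(n_samples=5000, n_features=2, n_informative=2,
                               n_redundant=0, n_repeated=0, n_classes=3,
                               n_clusters_per_class=1,
                               weights=[0.01, 0.05, 0.94],
                               class_sep=0.8, random_state=0)

    for sampler in (EditedNearestNeighbours(random_state=0),
                    RepeatedEditedNearestNeighbours(random_state=0),
                    AllKNN(random_state=0)):
        X_res, y_res = sampler.fit_sample(X, y)
        # Per the doctests above, RENN prunes the majority class hardest
        # (4551 samples kept) and AllKNN the least (4601 kept).
        print(type(sampler).__name__, sorted(Counter(y_res).items()))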
@@ -305,8 +311,8 @@ The :class:`CondensedNearestNeighbour` can be used in the following manner:: >>> from imblearn.under_sampling import CondensedNearestNeighbour >>> cnn = CondensedNearestNeighbour(random_state=0) >>> X_resampled, y_resampled = cnn.fit_sample(X, y) - >>> print(Counter(y_resampled)) - Counter({2: 116, 0: 64, 1: 25}) + >>> print(sorted(Counter(y_resampled).items())) + [(0, 64), (1, 24), (2, 115)] However as illustrated in the figure below, :class:`CondensedNearestNeighbour` is sensitive to noise and will add noisy samples. @@ -320,8 +326,8 @@ used as:: >>> from imblearn.under_sampling import OneSidedSelection >>> oss = OneSidedSelection(random_state=0) >>> X_resampled, y_resampled = oss.fit_sample(X, y) - >>> print(Counter(y_resampled)) - Counter({2: 4403, 1: 174, 0: 64}) + >>> print(sorted(Counter(y_resampled).items())) + [(0, 64), (1, 174), (2, 4403)] Our implementation offer to set the number of seeds to put in the set :math:`C` originally by setting the parameter ``n_seeds_S``. @@ -334,8 +340,8 @@ neighbors classifier. The class can be used as:: >>> from imblearn.under_sampling import NeighbourhoodCleaningRule >>> ncr = NeighbourhoodCleaningRule(random_state=0) >>> X_resampled, y_resampled = ncr.fit_sample(X, y) - >>> print(Counter(y_resampled)) - Counter({2: 4666, 1: 234, 0: 64}) + >>> print(sorted(Counter(y_resampled).items())) + [(0, 64), (1, 234), (2, 4666)] .. image:: ./auto_examples/under-sampling/images/sphx_glr_plot_comparison_under_sampling_005.png :target: ./auto_examples/under-sampling/plot_comparison_under_sampling.html @@ -362,8 +368,8 @@ removed. The class can be used as:: >>> iht = InstanceHardnessThreshold(random_state=0, ... estimator=LogisticRegression()) >>> X_resampled, y_resampled = iht.fit_sample(X, y) - >>> print(Counter(y_resampled)) - Counter({0: 64, 1: 64, 2: 64}) + >>> print(sorted(Counter(y_resampled).items())) + [(0, 64), (1, 64), (2, 64)] This class has 2 important parameters. ``estimator`` will accept any scikit-learn classifier which has a method ``predict_proba``. The classifier diff --git a/doc/user_guide.rst b/doc/user_guide.rst index c4beeb7b8..8bd86d336 100644 --- a/doc/user_guide.rst +++ b/doc/user_guide.rst @@ -9,7 +9,7 @@ User Guide .. toctree:: :numbered: - problem_statement.rst + introduction.rst over_sampling.rst under_sampling.rst combine.rst diff --git a/doc/whats_new.rst b/doc/whats_new.rst index f074cfa58..142668f27 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -49,6 +49,9 @@ New features Enhancement ~~~~~~~~~~~ +- All samplers accept sparse matrices, defaulting to the CSR type. By + `Guillaume Lemaitre`_. + - :func:`datasets.make_imbalance` take a ratio similarly to other samplers. It supports multiclass. By `Guillaume Lemaitre`_. diff --git a/examples/applications/plot_topic_classication.py b/examples/applications/plot_topic_classication.py index 90e48f0c3..e0af19ccf 100644 --- a/examples/applications/plot_topic_classication.py +++ b/examples/applications/plot_topic_classication.py @@ -16,7 +16,6 @@ from collections import Counter from sklearn.datasets import fetch_20newsgroups -from sklearn.preprocessing import FunctionTransformer from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.naive_bayes import MultinomialNB from sklearn.pipeline import make_pipeline @@ -82,22 +81,10 @@ # use a ``RandomUnderSampler`` to equalize the number of samples in all the # classes before the training.
# -# Currently, imbalanced-learn does not handle sparse matrices --- we are -# currently working on bringing this feature --- and an additional transformer -# to convert the sparse to dense matrices is required in the pipeline. -# # It is also important to note that we are using the ``make_pipeline`` function # implemented in imbalanced-learn to properly handle the samplers. - -def densify(X): - """Function to densify an array.""" - return X.toarray() - - pipe = make_pipeline_imb(TfidfVectorizer(), - FunctionTransformer(func=densify, - accept_sparse=True), RandomUnderSampler(), MultinomialNB()) diff --git a/imblearn/base.py b/imblearn/base.py index af3d0536d..05d79f35a 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -38,15 +38,16 @@ def sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. y_resampled : ndarray, shape (n_samples_new) @@ -55,7 +56,7 @@ def sample(self, X, y): """ # Check the consistency of X and y - X, y = check_X_y(X, y) + X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) check_is_fitted(self, 'ratio_') self._check_X_y(X, y) @@ -67,18 +68,19 @@ def fit_sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {array-like, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. - y_resampled : ndarray, shape (n_samples_new) + y_resampled : array-like, shape (n_samples_new,) The corresponding label of `X_resampled` """ @@ -91,19 +93,21 @@ def _sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. - y_resampled : ndarray, shape (n_samples_new) + y_resampled : ndarray, shape (n_samples_new,) The corresponding label of `X_resampled` + """ pass @@ -138,10 +142,10 @@ def fit(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns @@ -150,7 +154,7 @@ def fit(self, X, y): Return self. 
""" - X, y = check_X_y(X, y) + X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) y = check_target_type(y) self.X_hash_, self.y_hash_ = hash_X_y(X, y) # self.sampling_type is already checked in check_ratio diff --git a/imblearn/combine/smote_enn.py b/imblearn/combine/smote_enn.py index 32ce1e49d..e1e094c32 100644 --- a/imblearn/combine/smote_enn.py +++ b/imblearn/combine/smote_enn.py @@ -281,10 +281,10 @@ def fit(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns @@ -293,7 +293,7 @@ def fit(self, X, y): Return self. """ - X, y = check_X_y(X, y) + X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) y = check_target_type(y) self.ratio_ = self.ratio self.X_hash_, self.y_hash_ = hash_X_y(X, y) @@ -305,15 +305,16 @@ def _sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. y_resampled : ndarray, shape (n_samples_new) diff --git a/imblearn/combine/smote_tomek.py b/imblearn/combine/smote_tomek.py index 08c9f20fd..82821df0c 100644 --- a/imblearn/combine/smote_tomek.py +++ b/imblearn/combine/smote_tomek.py @@ -232,10 +232,10 @@ def fit(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns @@ -244,7 +244,7 @@ def fit(self, X, y): Return self. """ - X, y = check_X_y(X, y) + X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) y = check_target_type(y) self.ratio_ = self.ratio self.X_hash_, self.y_hash_ = hash_X_y(X, y) @@ -256,18 +256,19 @@ def _sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. 
- y_resampled : ndarray, shape (n_samples_new) + y_resampled : ndarray, shape (n_samples_new,) The corresponding label of `X_resampled` """ diff --git a/imblearn/ensemble/balance_cascade.py b/imblearn/ensemble/balance_cascade.py index f88c873ed..20276a79a 100644 --- a/imblearn/ensemble/balance_cascade.py +++ b/imblearn/ensemble/balance_cascade.py @@ -12,7 +12,7 @@ from sklearn.base import ClassifierMixin from sklearn.neighbors import KNeighborsClassifier -from sklearn.utils import check_random_state +from sklearn.utils import check_random_state, safe_indexing from sklearn.externals.six import string_types from sklearn.model_selection import cross_val_predict @@ -149,10 +149,10 @@ def fit(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns @@ -222,15 +222,16 @@ def _sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_subset, n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_subset, n_samples_new, n_features) The array containing the resampled data. y_resampled : ndarray, shape (n_subset, n_samples_new) @@ -249,22 +250,16 @@ def _sample(self, X, y): samples_mask = np.ones(y.shape, dtype=bool) # where the different set will be stored - X_resampled = [] - y_resampled = [] idx_under = [] n_subsets = 0 b_subset_search = True while b_subset_search: - target_stats = Counter(y[samples_mask]) - # build the data set to be classified - X_subset = np.empty((0, X.shape[1]), dtype=X.dtype) - y_subset = np.empty((0, ), dtype=y.dtype) + target_stats = Counter(safe_indexing( + y, np.flatnonzero(samples_mask))) # store the index of the data to under-sample index_under_sample = np.empty((0, ), dtype=y.dtype) # value which will be picked at each round - X_constant = np.empty((0, X.shape[1]), dtype=X.dtype) - y_constant = np.empty((0, ), dtype=y.dtype) index_constant = np.empty((0, ), dtype=y.dtype) for target_class in target_stats.keys(): if target_class in self.ratio_.keys(): @@ -274,29 +269,15 @@ def _sample(self, X, y): index_class = np.flatnonzero(y == target_class) index_class_interest = index_class[samples_mask[ y == target_class]] - X_class = X[index_class_interest] - y_class = y[index_class_interest] + y_class = safe_indexing(y, index_class_interest) # select randomly the desired features index_target_class = random_state.choice( range(y_class.size), size=n_samples, replace=False) - X_subset = np.concatenate((X_subset, - X_class[index_target_class]), - axis=0) - y_subset = np.concatenate((y_subset, - y_class[index_target_class]), - axis=0) - # index of the data index_under_sample = np.concatenate( (index_under_sample, index_class_interest[index_target_class]), axis=0) else: - X_constant = np.concatenate((X_constant, - X[y == target_class]), - axis=0) - y_constant = np.concatenate((y_constant, - y[y == target_class]), - axis=0) index_constant = np.concatenate( (index_constant, np.flatnonzero(y == target_class)), @@ -304,23 +285,19 @@ def _sample(self, X, y): # store the set created n_subsets += 1 - 
X_resampled.append(np.concatenate((X_subset, X_constant), - axis=0)) - y_resampled.append(np.concatenate((y_subset, y_constant), - axis=0)) - idx_under.append(np.concatenate((index_under_sample, - index_constant), - axis=0)) + subset_indices = np.concatenate((index_under_sample, + index_constant), axis=0) + idx_under.append(subset_indices) # fit and predict using cross validation - pred = cross_val_predict(self.estimator_, - np.concatenate((X_subset, X_constant), - axis=0), - np.concatenate((y_subset, y_constant), - axis=0)) + X_subset = safe_indexing(X, subset_indices) + y_subset = safe_indexing(y, subset_indices) + pred = cross_val_predict(self.estimator_, X_subset, y_subset) # extract the prediction about the targeted classes only - pred_target = pred[:y_subset.size] - index_classified = index_under_sample[pred_target == y_subset] + pred_target = pred[:index_under_sample.size] + index_classified = index_under_sample[ + pred_target == safe_indexing(y_subset, + range(index_under_sample.size))] samples_mask[index_classified] = False # check the stopping criterion @@ -328,11 +305,17 @@ def _sample(self, X, y): if n_subsets == self.n_max_subset: b_subset_search = False # check that there is enough samples for another round - target_stats = Counter(y[samples_mask]) + target_stats = Counter(safe_indexing( + y, np.flatnonzero(samples_mask))) for target_class in self.ratio_.keys(): if target_stats[target_class] < self.ratio_[target_class]: b_subset_search = False + X_resampled, y_resampled = [], [] + for indices in idx_under: + X_resampled.append(safe_indexing(X, indices)) + y_resampled.append(safe_indexing(y, indices)) + if self.return_indices: return (np.array(X_resampled), np.array(y_resampled), np.array(idx_under)) diff --git a/imblearn/ensemble/easy_ensemble.py b/imblearn/ensemble/easy_ensemble.py index 9a3fff860..5fc018167 100644 --- a/imblearn/ensemble/easy_ensemble.py +++ b/imblearn/ensemble/easy_ensemble.py @@ -112,15 +112,16 @@ def _sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_subset, n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_subset, n_samples_new, n_features) The array containing the resampled data. y_resampled : ndarray, shape (n_subset, n_samples_new) diff --git a/imblearn/over_sampling/adasyn.py b/imblearn/over_sampling/adasyn.py index 3f16d0d53..e15bfa62b 100644 --- a/imblearn/over_sampling/adasyn.py +++ b/imblearn/over_sampling/adasyn.py @@ -7,7 +7,9 @@ from __future__ import division import numpy as np -from sklearn.utils import check_random_state +from scipy import sparse + +from sklearn.utils import check_random_state, safe_indexing from .base import BaseOverSampler from ..utils import check_neighbors_object @@ -130,20 +132,22 @@ def _sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. 
Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. - y_resampled : ndarray, shape (n_samples_new) + y_resampled : ndarray, shape (n_samples_new,) The corresponding label of `X_resampled` + """ self._validate_estimator() random_state = check_random_state(self.random_state) @@ -154,7 +158,8 @@ def _sample(self, X, y): for class_sample, n_samples in self.ratio_.items(): if n_samples == 0: continue - X_class = X[y == class_sample] + target_class_indices = np.flatnonzero(y == class_sample) + X_class = safe_indexing(X, target_class_indices) self.nn_.fit(X) _, nn_index = self.nn_.kneighbors(X_class) @@ -171,27 +176,57 @@ def _sample(self, X, y): ' Use SMOTE instead.') ratio_nn /= np.sum(ratio_nn) n_samples_generate = np.rint(ratio_nn * n_samples).astype(int) + if not np.sum(n_samples_generate): + raise ValueError("No samples will be generated with the" + " provided ratio settings.") # the nearest neighbors need to be fitted only on the current class # to find the class NN to generate new samples self.nn_.fit(X_class) _, nn_index = self.nn_.kneighbors(X_class) - x_class_gen = [] - for x_i, x_i_nn, num_sample_i in zip(X_class, nn_index, - n_samples_generate): - if num_sample_i == 0: - continue - nn_zs = random_state.randint( - 1, high=self.nn_.n_neighbors, size=num_sample_i) - steps = random_state.uniform(size=len(nn_zs)) - x_class_gen.append([x_i + step * (X[x_i_nn[nn_z], :] - x_i) - for step, nn_z in zip(steps, nn_zs)]) - - if len(x_class_gen) > 0: - X_resampled = np.vstack((X_resampled, - np.concatenate(x_class_gen))) - y_resampled = np.hstack((y_resampled, [class_sample] * - np.sum(n_samples_generate))) + if sparse.issparse(X): + row_indices, col_indices, samples = [], [], [] + n_samples_generated = 0 + for x_i, x_i_nn, num_sample_i in zip(X_class, nn_index, + n_samples_generate): + if num_sample_i == 0: + continue + nn_zs = random_state.randint( + 1, high=self.nn_.n_neighbors, size=num_sample_i) + steps = random_state.uniform(size=len(nn_zs)) + if x_i.nnz: + for step, nn_z in zip(steps, nn_zs): + sample = x_i + step * (X[x_i_nn[nn_z], :] - x_i) + row_indices += ([n_samples_generated] * + len(sample.indices)) + col_indices += sample.indices.tolist() + samples += sample.data.tolist() + n_samples_generated += 1 + X_new = (sparse.csr_matrix((samples, + (row_indices, col_indices)), + [np.sum(n_samples_generate), + X.shape[1]])) + y_new = np.array([class_sample] * np.sum(n_samples_generate)) + else: + x_class_gen = [] + for x_i, x_i_nn, num_sample_i in zip(X_class, nn_index, + n_samples_generate): + if num_sample_i == 0: + continue + nn_zs = random_state.randint( + 1, high=self.nn_.n_neighbors, size=num_sample_i) + steps = random_state.uniform(size=len(nn_zs)) + x_class_gen.append([x_i + step * (X[x_i_nn[nn_z], :] - x_i) + for step, nn_z in zip(steps, nn_zs)]) + + X_new = np.concatenate(x_class_gen) + y_new = np.array([class_sample] * np.sum(n_samples_generate)) + + if sparse.issparse(X_new): + X_resampled = sparse.vstack([X_resampled, X_new]) + else: + X_resampled = np.vstack((X_resampled, X_new)) + y_resampled = np.hstack((y_resampled, y_new)) return X_resampled, y_resampled diff --git a/imblearn/over_sampling/base.py b/imblearn/over_sampling/base.py index 9c1f6d51b..883fd9be2 100644 --- a/imblearn/over_sampling/base.py +++ b/imblearn/over_sampling/base.py @@ -5,6 +5,8 @@ # Christos Aridas # License: MIT +from sklearn.utils import check_X_y + from ..base 
import BaseSampler diff --git a/imblearn/over_sampling/random_over_sampler.py b/imblearn/over_sampling/random_over_sampler.py index 9b164eee7..271f1f6e8 100644 --- a/imblearn/over_sampling/random_over_sampler.py +++ b/imblearn/over_sampling/random_over_sampler.py @@ -8,7 +8,7 @@ from collections import Counter import numpy as np -from sklearn.utils import check_random_state +from sklearn.utils import check_random_state, safe_indexing from .base import BaseOverSampler @@ -84,37 +84,34 @@ def _sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. - y_resampled : ndarray, shape (n_samples_new) + y_resampled : ndarray, shape (n_samples_new,) The corresponding label of `X_resampled` """ random_state = check_random_state(self.random_state) target_stats = Counter(y) - X_resampled = X.copy() - y_resampled = y.copy() + sample_indices = range(X.shape[0]) for class_sample, num_samples in self.ratio_.items(): - index_samples = random_state.randint( + target_class_indices = np.flatnonzero(y == class_sample) + indices = random_state.randint( low=0, high=target_stats[class_sample], size=num_samples) - X_resampled = np.concatenate((X_resampled, - X[y == class_sample][index_samples]), - axis=0) + sample_indices = np.append(sample_indices, + target_class_indices[indices]) - y_resampled = np.concatenate((y_resampled, - y[y == class_sample][index_samples]), - axis=0) - - return X_resampled, y_resampled + return (safe_indexing(X, sample_indices), + safe_indexing(y, sample_indices)) diff --git a/imblearn/over_sampling/smote.py b/imblearn/over_sampling/smote.py index 7902d178d..fabe63b42 100644 --- a/imblearn/over_sampling/smote.py +++ b/imblearn/over_sampling/smote.py @@ -8,8 +8,11 @@ from __future__ import division import numpy as np + +from scipy import sparse + from sklearn.svm import SVC -from sklearn.utils import check_random_state +from sklearn.utils import check_random_state, safe_indexing from .base import BaseOverSampler from ..exceptions import raise_isinstance_error @@ -175,13 +178,13 @@ def _in_danger_noise(self, samples, target_class, y, kind='danger'): Parameters ---------- - samples : ndarray, shape (n_samples, n_features) + samples : {array-like, sparse matrix}, shape (n_samples, n_features) The samples to check if either they are in danger or not. target_class : int or str, The target corresponding class being over-sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) The true label in order to check the neighbour labels. kind : str, optional (default='danger') @@ -192,7 +195,7 @@ def _in_danger_noise(self, samples, target_class, y, kind='danger'): Returns ------- - output : ndarray, shape (n_samples, ) + output : ndarray, shape (n_samples,) A boolean array where True refer to samples in danger or noise. """ @@ -223,7 +226,7 @@ def _make_samples(self, Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Points from which the points will be created. 
y_type : str or int @@ -245,26 +248,42 @@ def _make_samples(self, Returns ------- - X_new : ndarray, shape (n_samples_new, n_features) + X_new : {ndarray, sparse matrix}, shape (n_samples_new, n_features) Synthetically generated samples. - y_new : ndarray, shape (n_samples_new, ) + y_new : ndarray, shape (n_samples_new,) Target values for synthetic samples. """ random_state = check_random_state(self.random_state) - X_new = np.zeros((n_samples, X.shape[1])) - samples = random_state.randint( + samples_indices = random_state.randint( low=0, high=len(nn_num.flatten()), size=n_samples) steps = step_size * random_state.uniform(size=n_samples) - rows = np.floor_divide(samples, nn_num.shape[1]) - cols = np.mod(samples, nn_num.shape[1]) - for i, (sample, row, col, step) in enumerate(zip(samples, rows, - cols, steps)): - X_new[i] = X[row] - step * (X[row] - nn_data[nn_num[row, col]]) - y_new = np.array([y_type] * len(X_new)) + rows = np.floor_divide(samples_indices, nn_num.shape[1]) + cols = np.mod(samples_indices, nn_num.shape[1]) + + if sparse.issparse(X): + row_indices, col_indices, samples = [], [], [] + for i, (row, col, step) in enumerate(zip(rows, cols, steps)): + if X[row].nnz: + sample = X[row] - step * (X[row] - + nn_data[nn_num[row, col]]) + row_indices += [i] * len(sample.indices) + col_indices += sample.indices.tolist() + samples += sample.data.tolist() + else: + X_new = np.zeros((n_samples, X.shape[1])) + for i, (row, col, step) in enumerate(zip(rows, cols, steps)): + X_new[i] = X[row] - step * (X[row] - nn_data[nn_num[row, col]]) + + y_new = np.array([y_type] * len(samples_indices)) - return X_new, y_new + if sparse.issparse(X): + return (sparse.csr_matrix((samples, (row_indices, col_indices)), + [len(samples_indices), X.shape[1]]), + y_new) + else: + return X_new, y_new def _validate_estimator(self): """Create the necessary objects for SMOTE.""" @@ -305,19 +324,20 @@ def _sample_regular(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. - y_resampled : ndarray, shape (n_samples_new) - The corresponding label of `X_resampled`. + y_resampled : ndarray, shape (n_samples_new,) + The corresponding label of `X_resampled` References ---------- @@ -326,21 +346,26 @@ def _sample_regular(self, X, y): intelligence research, 321-357, 2002. 
""" + X_resampled = X.copy() y_resampled = y.copy() for class_sample, n_samples in self.ratio_.items(): if n_samples == 0: continue - X_class = X[y == class_sample] + target_class_indices = np.flatnonzero(y == class_sample) + X_class = safe_indexing(X, target_class_indices) self.nn_k_.fit(X_class) nns = self.nn_k_.kneighbors(X_class, return_distance=False)[:, 1:] X_new, y_new = self._make_samples(X_class, class_sample, X_class, nns, n_samples, 1.0) - X_resampled = np.concatenate((X_resampled, X_new), axis=0) - y_resampled = np.concatenate((y_resampled, y_new), axis=0) + if sparse.issparse(X_new): + X_resampled = sparse.vstack([X_resampled, X_new]) + else: + X_resampled = np.vstack((X_resampled, X_new)) + y_resampled = np.hstack((y_resampled, y_new)) return X_resampled, y_resampled @@ -354,19 +379,20 @@ def _sample_borderline(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. - y_resampled : ndarray, shape (n_samples_new) - The corresponding label of `X_resampled`. + y_resampled : ndarray, shape (n_samples_new,) + The corresponding label of `X_resampled` References ---------- @@ -381,7 +407,8 @@ def _sample_borderline(self, X, y): for class_sample, n_samples in self.ratio_.items(): if n_samples == 0: continue - X_class = X[y == class_sample] + target_class_indices = np.flatnonzero(y == class_sample) + X_class = safe_indexing(X, target_class_indices) self.nn_m_.fit(X) danger_index = self._in_danger_noise(X_class, class_sample, y, @@ -391,16 +418,21 @@ def _sample_borderline(self, X, y): self.nn_k_.fit(X_class) nns = self.nn_k_.kneighbors( - X_class[danger_index], return_distance=False)[:, 1:] + safe_indexing(X_class, danger_index), + return_distance=False)[:, 1:] # divergence between borderline-1 and borderline-2 if self.kind == 'borderline1': # Create synthetic samples for borderline points. - X_new, y_new = self._make_samples(X_class[danger_index], + X_new, y_new = self._make_samples(safe_indexing(X_class, + danger_index), class_sample, X_class, nns, n_samples) - X_resampled = np.concatenate((X_resampled, X_new), axis=0) - y_resampled = np.concatenate((y_resampled, y_new), axis=0) + if sparse.issparse(X_new): + X_resampled = sparse.vstack([X_resampled, X_new]) + else: + X_resampled = np.vstack((X_resampled, X_new)) + y_resampled = np.hstack((y_resampled, y_new)) else: random_state = check_random_state(self.random_state) @@ -408,22 +440,26 @@ def _sample_borderline(self, X, y): # only minority X_new_1, y_new_1 = self._make_samples( - X_class[danger_index], class_sample, X_class, nns, + safe_indexing(X_class, danger_index), class_sample, + X_class, nns, int(fractions * (n_samples + 1)), step_size=1.) # we use a one-vs-rest policy to handle the multiclass in which # new samples will be created considering not only the majority # class but all over classes. 
X_new_2, y_new_2 = self._make_samples( - X_class[danger_index], class_sample, X[y != class_sample], + safe_indexing(X_class, danger_index), class_sample, + safe_indexing(X, np.flatnonzero(y != class_sample)), nns, int((1 - fractions) * n_samples), step_size=0.5) - # Concatenate the newly generated samples to the original - # data set - X_resampled = np.concatenate((X_resampled, X_new_1, X_new_2), - axis=0) - y_resampled = np.concatenate((y_resampled, y_new_1, y_new_2), - axis=0) + if sparse.issparse(X_resampled): + X_resampled = sparse.vstack([X_resampled, + X_new_1, X_new_2]) + else: + X_resampled = np.vstack((X_resampled, + X_new_1, X_new_2)) + y_resampled = np.hstack((y_resampled, + y_new_1, y_new_2)) return X_resampled, y_resampled @@ -435,19 +471,20 @@ def _sample_svm(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. - y_resampled : ndarray, shape (n_samples_new) - The corresponding label of `X_resampled`. + y_resampled : ndarray, shape (n_samples_new,) + The corresponding label of `X_resampled` References ---------- @@ -463,17 +500,20 @@ def _sample_svm(self, X, y): for class_sample, n_samples in self.ratio_.items(): if n_samples == 0: continue - X_class = X[y == class_sample] + target_class_indices = np.flatnonzero(y == class_sample) + X_class = safe_indexing(X, target_class_indices) self.svm_estimator_.fit(X, y) support_index = self.svm_estimator_.support_[ y[self.svm_estimator_.support_] == class_sample] - support_vector = X[support_index] + support_vector = safe_indexing(X, support_index) self.nn_m_.fit(X) noise_bool = self._in_danger_noise(support_vector, class_sample, y, kind='noise') - support_vector = support_vector[np.logical_not(noise_bool)] + support_vector = safe_indexing( + support_vector, + np.flatnonzero(np.logical_not(noise_bool))) danger_bool = self._in_danger_noise(support_vector, class_sample, y, kind='danger') safety_bool = np.logical_not(danger_bool) @@ -481,33 +521,48 @@ def _sample_svm(self, X, y): self.nn_k_.fit(X_class) fractions = random_state.beta(10, 10) if np.count_nonzero(danger_bool) > 0: - nns = self.nn_k_.kneighbors(support_vector[danger_bool], + nns = self.nn_k_.kneighbors(safe_indexing( + support_vector, + np.flatnonzero(danger_bool)), return_distance=False)[:, 1:] X_new_1, y_new_1 = self._make_samples( - support_vector[danger_bool], class_sample, X_class, + safe_indexing(support_vector, np.flatnonzero(danger_bool)), + class_sample, X_class, nns, int(fractions * (n_samples + 1)), step_size=1.) 
if np.count_nonzero(safety_bool) > 0: - nns = self.nn_k_.kneighbors(support_vector[safety_bool], - return_distance=False)[:, 1:] + nns = self.nn_k_.kneighbors( + safe_indexing(support_vector, np.flatnonzero(safety_bool)), + return_distance=False)[:, 1:] X_new_2, y_new_2 = self._make_samples( - support_vector[safety_bool], class_sample, X_class, + safe_indexing(support_vector, np.flatnonzero(safety_bool)), + class_sample, X_class, nns, int((1 - fractions) * n_samples), step_size=-self.out_step) if (np.count_nonzero(danger_bool) > 0 and np.count_nonzero(safety_bool) > 0): - X_resampled = np.concatenate((X_resampled, X_new_1, X_new_2), - axis=0) + if sparse.issparse(X_resampled): + X_resampled = sparse.vstack([X_resampled, + X_new_1, X_new_2]) + else: + X_resampled = np.vstack((X_resampled, + X_new_1, X_new_2)) y_resampled = np.concatenate((y_resampled, y_new_1, y_new_2), axis=0) elif np.count_nonzero(danger_bool) == 0: - X_resampled = np.concatenate((X_resampled, X_new_2), axis=0) + if sparse.issparse(X_resampled): + X_resampled = sparse.vstack([X_resampled, X_new_2]) + else: + X_resampled = np.vstack((X_resampled, X_new_2)) y_resampled = np.concatenate((y_resampled, y_new_2), axis=0) elif np.count_nonzero(safety_bool) == 0: - X_resampled = np.concatenate((X_resampled, X_new_1), axis=0) + if sparse.issparse(X_resampled): + X_resampled = sparse.vstack([X_resampled, X_new_1]) + else: + X_resampled = np.vstack((X_resampled, X_new_1)) y_resampled = np.concatenate((y_resampled, y_new_1), axis=0) return X_resampled, y_resampled @@ -517,18 +572,19 @@ def _sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. 
- y_resampled : ndarray, shape (n_samples_new) + y_resampled : ndarray, shape (n_samples_new,) The corresponding label of `X_resampled` """ diff --git a/imblearn/over_sampling/tests/test_adasyn.py b/imblearn/over_sampling/tests/test_adasyn.py index c550d567f..dfe50ef4a 100644 --- a/imblearn/over_sampling/tests/test_adasyn.py +++ b/imblearn/over_sampling/tests/test_adasyn.py @@ -75,34 +75,11 @@ def test_ada_fit_sample(): assert_array_equal(y_resampled, y_gt) -def test_ada_fit_sample_half(): +def test_ada_fit_ratio_error(): ratio = 0.8 ada = ADASYN(ratio=ratio, random_state=RND_SEED) - X_resampled, y_resampled = ada.fit_sample(X, Y) - X_gt = np.array([[0.11622591, -0.0317206], - [0.77481731, 0.60935141], - [1.25192108, -0.22367336], - [0.53366841, -0.30312976], - [1.52091956, -0.49283504], - [-0.28162401, -2.10400981], - [0.83680821, 1.72827342], - [0.3084254, 0.33299982], - [0.70472253, -0.73309052], - [0.28893132, -0.38761769], - [1.15514042, 0.0129463], - [0.88407872, 0.35454207], - [1.31301027, -0.92648734], - [-1.11515198, -0.93689695], - [-0.18410027, -0.45194484], - [0.9281014, 0.53085498], - [-0.14374509, 0.27370049], - [-0.41635887, -0.38299653], - [0.08711622, 0.93259929], - [1.70580611, -0.11219234]]) - y_gt = np.array( - [0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0]) - assert_allclose(X_resampled, X_gt, rtol=R_TOL) - assert_array_equal(y_resampled, y_gt) + with raises(ValueError, match="No samples will be generated."): + ada.fit_sample(X, Y) def test_ada_fit_sample_nn_obj(): diff --git a/imblearn/under_sampling/prototype_generation/cluster_centroids.py b/imblearn/under_sampling/prototype_generation/cluster_centroids.py index 0eef20cde..0cfebb193 100644 --- a/imblearn/under_sampling/prototype_generation/cluster_centroids.py +++ b/imblearn/under_sampling/prototype_generation/cluster_centroids.py @@ -9,7 +9,10 @@ from __future__ import division, print_function import numpy as np +from scipy import sparse + from sklearn.cluster import KMeans +from sklearn.utils import safe_indexing from ..base import BaseUnderSampler @@ -79,7 +82,8 @@ class ClusterCentroids(BaseUnderSampler): >>> cc = ClusterCentroids(random_state=42) >>> X_res, y_res = cc.fit_sample(X, y) >>> print('Resampled dataset shape {}'.format(Counter(y_res))) - Resampled dataset shape Counter({0: 100, 1: 100}) + ... # doctest: +ELLIPSIS + Resampled dataset shape Counter({...}) """ @@ -109,42 +113,46 @@ def _sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. 
- y_resampled : ndarray, shape (n_samples_new) + y_resampled : ndarray, shape (n_samples_new,) The corresponding label of `X_resampled` """ self._validate_estimator() - X_resampled = np.empty((0, X.shape[1]), dtype=X.dtype) - y_resampled = np.empty((0, ), dtype=y.dtype) - + idx_under = np.empty((0, ), dtype=int) + centroids, y_resampled = [], [] for target_class in np.unique(y): if target_class in self.ratio_.keys(): n_samples = self.ratio_[target_class] self.estimator_.set_params(**{'n_clusters': n_samples}) self.estimator_.fit(X[y == target_class]) - centroids = self.estimator_.cluster_centers_ + centroids.append(self.estimator_.cluster_centers_) + y_resampled += [target_class] * n_samples - X_resampled = np.concatenate((X_resampled, centroids), axis=0) - y_resampled = np.concatenate( - (y_resampled, np.array([target_class] * n_samples)), - axis=0) else: + target_class_indices = np.flatnonzero(y == target_class) + idx_under = np.concatenate( + (idx_under, target_class_indices), axis=0) - X_resampled = np.concatenate( - (X_resampled, X[y == target_class]), axis=0) - y_resampled = np.concatenate( - (y_resampled, y[y == target_class]), axis=0) + X_resampled = np.concatenate((centroids)) + + if sparse.issparse(X): + X_resampled = sparse.vstack([sparse.csr_matrix(X_resampled), + safe_indexing(X, idx_under)]) + else: + X_resampled = np.vstack((X_resampled, safe_indexing(X, idx_under))) + y_resampled = np.hstack((y_resampled, safe_indexing(y, idx_under))) - return X_resampled, y_resampled + return X_resampled, np.array(y_resampled) diff --git a/imblearn/under_sampling/prototype_generation/tests/test_cluster_centroids.py b/imblearn/under_sampling/prototype_generation/tests/test_cluster_centroids.py index 160e0df69..09e634832 100644 --- a/imblearn/under_sampling/prototype_generation/tests/test_cluster_centroids.py +++ b/imblearn/under_sampling/prototype_generation/tests/test_cluster_centroids.py @@ -27,10 +27,13 @@ def test_fit_sample_auto(): ratio = 'auto' cc = ClusterCentroids(ratio=ratio, random_state=RND_SEED) X_resampled, y_resampled = cc.fit_sample(X, Y) - X_gt = np.array([[0.92923648, 0.76103773], [0.47104475, 0.44386323], - [0.13347175, 0.12167502], [0.06738818, -0.529627], - [0.17901516, 0.69860992], [0.094035, -2.55298982]]) - y_gt = np.array([0, 0, 0, 1, 1, 1]) + X_gt = np.array([[0.06738818, -0.529627], + [0.17901516, 0.69860992], + [0.094035, -2.55298982], + [0.92923648, 0.76103773], + [0.47104475, 0.44386323], + [0.13347175, 0.12167502]]) + y_gt = np.array([1, 1, 1, 0, 0, 0]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) @@ -39,12 +42,16 @@ def test_fit_sample_half(): ratio = .5 cc = ClusterCentroids(ratio=ratio, random_state=RND_SEED) X_resampled, y_resampled = cc.fit_sample(X, Y) - X_gt = np.array([[0.92923648, 0.76103773], [0.47104475, 0.44386323], - [0.13347175, 0.12167502], [0.09125309, -0.85409574], - [0.19220316, 0.32337101], [0.094035, -2.55298982], - [0.20792588, 1.49407907], [0.04352327, -0.20515826], - [0.12372842, 0.6536186]]) - y_gt = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1]) + X_gt = np.array([[0.09125309, -0.85409574], + [0.19220316, 0.32337101], + [0.094035, -2.55298982], + [0.20792588, 1.49407907], + [0.04352327, -0.20515826], + [0.12372842, 0.6536186], + [0.92923648, 0.76103773], + [0.47104475, 0.44386323], + [0.13347175, 0.12167502]]) + y_gt = np.array([1, 1, 1, 1, 1, 1, 0, 0, 0]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) @@ -68,10 +75,13 @@ def test_fit_sample_object(): ratio=ratio, 
random_state=RND_SEED, estimator=cluster) X_resampled, y_resampled = cc.fit_sample(X, Y) - X_gt = np.array([[0.92923648, 0.76103773], [0.47104475, 0.44386323], - [0.13347175, 0.12167502], [0.06738818, -0.529627], - [0.17901516, 0.69860992], [0.094035, -2.55298982]]) - y_gt = np.array([0, 0, 0, 1, 1, 1]) + X_gt = np.array([[0.06738818, -0.529627], + [0.17901516, 0.69860992], + [0.094035, -2.55298982], + [0.92923648, 0.76103773], + [0.47104475, 0.44386323], + [0.13347175, 0.12167502]]) + y_gt = np.array([1, 1, 1, 0, 0, 0]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) diff --git a/imblearn/under_sampling/prototype_selection/condensed_nearest_neighbour.py b/imblearn/under_sampling/prototype_selection/condensed_nearest_neighbour.py index f7115176f..1d03eba9a 100644 --- a/imblearn/under_sampling/prototype_selection/condensed_nearest_neighbour.py +++ b/imblearn/under_sampling/prototype_selection/condensed_nearest_neighbour.py @@ -10,8 +10,11 @@ from collections import Counter import numpy as np + +from scipy.sparse import issparse + from sklearn.neighbors import KNeighborsClassifier -from sklearn.utils import check_random_state +from sklearn.utils import check_random_state, safe_indexing from ..base import BaseCleaningSampler from ...utils.deprecation import deprecate_parameter @@ -155,18 +158,19 @@ def _sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. 
- y_resampled : ndarray, shape (n_samples_new) + y_resampled : ndarray, shape (n_samples_new,) The corresponding label of `X_resampled` idx_under : ndarray, shape (n_samples, ) @@ -179,29 +183,27 @@ def _sample(self, X, y): random_state = check_random_state(self.random_state) target_stats = Counter(y) class_minority = min(target_stats, key=target_stats.get) - - X_resampled = np.empty((0, X.shape[1]), dtype=X.dtype) - y_resampled = np.empty((0, ), dtype=y.dtype) - if self.return_indices: - idx_under = np.empty((0, ), dtype=int) + idx_under = np.empty((0, ), dtype=int) for target_class in np.unique(y): if target_class in self.ratio_.keys(): # Randomly get one sample from the majority class # Generate the index to select - idx_maj_sample = random_state.randint( - low=0, high=target_stats[target_class], - size=self.n_seeds_S) - maj_sample = X[y == target_class][idx_maj_sample] + idx_maj = np.flatnonzero(y == target_class) + idx_maj_sample = idx_maj[random_state.randint( + low=0, high=target_stats[target_class], + size=self.n_seeds_S)] # Create the set C - One majority samples and all minority - C_x = np.append(X[y == class_minority], maj_sample, axis=0) - C_y = np.append(y[y == class_minority], - np.array([target_class] * self.n_seeds_S)) + C_indices = np.append(np.flatnonzero(y == class_minority), + idx_maj_sample) + C_x = safe_indexing(X, C_indices) + C_y = safe_indexing(y, C_indices) # Create the set S - all majority samples - S_x = X[y == target_class] - S_y = y[y == target_class] + S_indices = np.flatnonzero(y == target_class) + S_x = safe_indexing(X, S_indices) + S_y = safe_indexing(y, S_indices) # fit knn on C self.estimator_.fit(C_x, C_y) @@ -215,21 +217,21 @@ def _sample(self, X, y): continue # Classify on S - pred_y = self.estimator_.predict(x_sam.reshape(1, -1)) + if not issparse(x_sam): + x_sam = x_sam.reshape(1, -1) + pred_y = self.estimator_.predict(x_sam) # If the prediction do not agree with the true label # append it in C_x if y_sam != pred_y: # Keep the index for later - idx_maj_sample = np.append(idx_maj_sample, idx_sam) + idx_maj_sample = np.append(idx_maj_sample, + idx_maj[idx_sam]) # Update C - C_x = np.append(X[y == class_minority], - X[y == target_class][idx_maj_sample], - axis=0) - C_y = np.append(y[y == class_minority], - np.array([target_class] * - idx_maj_sample.size)) + C_indices = np.append(C_indices, idx_maj[idx_sam]) + C_x = safe_indexing(X, C_indices) + C_y = safe_indexing(y, C_indices) # fit a knn on C self.estimator_.fit(C_x, C_y) @@ -242,32 +244,14 @@ def _sample(self, X, y): np.append(idx_maj_sample, np.flatnonzero(pred_S_y == S_y))) - # Find the misclassified S_y - sel_x = S_x[idx_maj_sample, :] - sel_y = S_y[idx_maj_sample] - - # The indexes found are relative to the current class, we need - # to find the absolute value Build the array with the absolute - # position - abs_pos = np.flatnonzero(y == target_class) - idx_maj_sample = abs_pos[idx_maj_sample] - - # If we need to offer support for the indices selected - if self.return_indices: - idx_under = np.concatenate((idx_under, idx_maj_sample), - axis=0) - X_resampled = np.concatenate((X_resampled, sel_x), axis=0) - y_resampled = np.concatenate((y_resampled, sel_y), axis=0) + idx_under = np.concatenate((idx_under, idx_maj_sample), + axis=0) else: - X_resampled = np.concatenate( - (X_resampled, X[y == target_class]), axis=0) - y_resampled = np.concatenate( - (y_resampled, y[y == target_class]), axis=0) - if self.return_indices: - idx_under = np.concatenate( - (idx_under, np.flatnonzero(y == target_class)), 
axis=0) + idx_under = np.concatenate( + (idx_under, np.flatnonzero(y == target_class)), axis=0) if self.return_indices: - return X_resampled, y_resampled, idx_under + return (safe_indexing(X, idx_under), safe_indexing(y, idx_under), + idx_under) else: - return X_resampled, y_resampled + return safe_indexing(X, idx_under), safe_indexing(y, idx_under) diff --git a/imblearn/under_sampling/prototype_selection/edited_nearest_neighbours.py b/imblearn/under_sampling/prototype_selection/edited_nearest_neighbours.py index 179a5d321..7630b7cd5 100644 --- a/imblearn/under_sampling/prototype_selection/edited_nearest_neighbours.py +++ b/imblearn/under_sampling/prototype_selection/edited_nearest_neighbours.py @@ -14,6 +14,8 @@ import numpy as np from scipy.stats import mode +from sklearn.utils import safe_indexing + from ..base import BaseCleaningSampler from ...utils import check_neighbors_object from ...utils.deprecation import deprecate_parameter @@ -159,18 +161,19 @@ def _sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. - y_resampled : ndarray, shape (n_samples_new) + y_resampled : ndarray, shape (n_samples_new,) The corresponding label of `X_resampled` idx_under : ndarray, shape (n_samples, ) @@ -180,17 +183,15 @@ def _sample(self, X, y): """ self._validate_estimator() - X_resampled = np.empty((0, X.shape[1]), dtype=X.dtype) - y_resampled = np.empty((0, ), dtype=y.dtype) - if self.return_indices: - idx_under = np.empty((0, ), dtype=int) + idx_under = np.empty((0, ), dtype=int) self.nn_.fit(X) for target_class in np.unique(y): if target_class in self.ratio_.keys(): - X_class = X[y == target_class] - y_class = y[y == target_class] + target_class_indices = np.flatnonzero(y == target_class) + X_class = safe_indexing(X, target_class_indices) + y_class = safe_indexing(y, target_class_indices) nnhood_idx = self.nn_.kneighbors( X_class, return_distance=False)[:, 1:] nnhood_label = y[nnhood_idx] @@ -204,21 +205,15 @@ def _sample(self, X, y): else: index_target_class = slice(None) - X_resampled = np.concatenate( - (X_resampled, X[y == target_class][index_target_class]), - axis=0) - y_resampled = np.concatenate( - (y_resampled, y[y == target_class][index_target_class]), - axis=0) - if self.return_indices: - idx_under = np.concatenate( - (idx_under, np.flatnonzero(y == target_class)[ - index_target_class]), axis=0) + idx_under = np.concatenate( + (idx_under, np.flatnonzero(y == target_class)[ + index_target_class]), axis=0) if self.return_indices: - return X_resampled, y_resampled, idx_under + return (safe_indexing(X, idx_under), safe_indexing(y, idx_under), + idx_under) else: - return X_resampled, y_resampled + return safe_indexing(X, idx_under), safe_indexing(y, idx_under) class RepeatedEditedNearestNeighbours(BaseCleaningSampler): @@ -368,18 +363,19 @@ def _sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. 
- y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. - y_resampled : ndarray, shape (n_samples_new) + y_resampled : ndarray, shape (n_samples_new,) The corresponding label of `X_resampled` idx_under : ndarray, shape (n_samples, ) @@ -591,18 +587,19 @@ def _sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. - y_resampled : ndarray, shape (n_samples_new) + y_resampled : ndarray, shape (n_samples_new,) The corresponding label of `X_resampled` idx_under : ndarray, shape (n_samples, ) diff --git a/imblearn/under_sampling/prototype_selection/instance_hardness_threshold.py b/imblearn/under_sampling/prototype_selection/instance_hardness_threshold.py index bfd6e3622..637323164 100644 --- a/imblearn/under_sampling/prototype_selection/instance_hardness_threshold.py +++ b/imblearn/under_sampling/prototype_selection/instance_hardness_threshold.py @@ -16,6 +16,7 @@ from sklearn.base import ClassifierMixin from sklearn.ensemble import RandomForestClassifier from sklearn.externals.six import string_types +from sklearn.utils import safe_indexing from ..base import BaseCleaningSampler @@ -203,18 +204,19 @@ def _sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. 
- y_resampled : ndarray, shape (n_samples_new) + y_resampled : ndarray, shape (n_samples_new,) The corresponding label of `X_resampled` idx_under : ndarray, shape (n_samples, ) @@ -229,8 +231,10 @@ def _sample(self, X, y): probabilities = np.zeros(y.shape[0], dtype=float) for train_index, test_index in skf: - X_train, X_test = X[train_index], X[test_index] - y_train, y_test = y[train_index], y[test_index] + X_train = safe_indexing(X, train_index) + X_test = safe_indexing(X, test_index) + y_train = safe_indexing(y, train_index) + y_test = safe_indexing(y, test_index) self.estimator_.fit(X_train, y_train) @@ -241,10 +245,7 @@ def _sample(self, X, y): for l, c in enumerate(y_test) ] - X_resampled = np.empty((0, X.shape[1]), dtype=X.dtype) - y_resampled = np.empty((0, ), dtype=y.dtype) - if self.return_indices: - idx_under = np.empty((0, ), dtype=int) + idx_under = np.empty((0, ), dtype=int) for target_class in np.unique(y): if target_class in self.ratio_.keys(): @@ -257,18 +258,12 @@ def _sample(self, X, y): else: index_target_class = slice(None) - X_resampled = np.concatenate( - (X_resampled, X[y == target_class][index_target_class]), - axis=0) - y_resampled = np.concatenate( - (y_resampled, y[y == target_class][index_target_class]), - axis=0) - if self.return_indices: - idx_under = np.concatenate( - (idx_under, np.flatnonzero(y == target_class)[ - index_target_class]), axis=0) + idx_under = np.concatenate( + (idx_under, np.flatnonzero(y == target_class)[ + index_target_class]), axis=0) if self.return_indices: - return X_resampled, y_resampled, idx_under + return (safe_indexing(X, idx_under), safe_indexing(y, idx_under), + idx_under) else: - return X_resampled, y_resampled + return safe_indexing(X, idx_under), safe_indexing(y, idx_under) diff --git a/imblearn/under_sampling/prototype_selection/nearmiss.py b/imblearn/under_sampling/prototype_selection/nearmiss.py index a9bf51476..4a5475317 100644 --- a/imblearn/under_sampling/prototype_selection/nearmiss.py +++ b/imblearn/under_sampling/prototype_selection/nearmiss.py @@ -11,6 +11,8 @@ import numpy as np +from sklearn.utils import safe_indexing + from ..base import BaseUnderSampler from ...utils import check_neighbors_object from ...utils.deprecation import deprecate_parameter @@ -154,10 +156,10 @@ def _selection_dist_based(self, Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Original samples. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Associated label to X. dist_vec : ndarray, shape (n_samples, ) @@ -174,13 +176,7 @@ def _selection_dist_based(self, Returns ------- - X_sel : ndarray, shape (num_samples, n_features) - Selected samples. - - y_sel : ndarray, shape (num_samples, ) - The associated label. - - idx_sel : ndarray, shape (num_samples, ) + idx_sel : ndarray, shape (num_samples,) The list of the indices of the selected samples. """ @@ -188,7 +184,9 @@ def _selection_dist_based(self, # Compute the distance considering the farthest neighbour dist_avg_vec = np.sum(dist_vec[:, -self.nn_.n_neighbors:], axis=1) - if dist_vec.shape[0] != X[y == key].shape[0]: + target_class_indices = np.flatnonzero(y == key) + if (dist_vec.shape[0] != safe_indexing(X, + target_class_indices).shape[0]): raise RuntimeError('The samples to be selected do not correspond' ' to the distance matrix given. 
Ensure that' ' both `X[y == key]` and `dist_vec` are' @@ -243,18 +241,19 @@ def _sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. - y_resampled : ndarray, shape (n_samples_new) + y_resampled : ndarray, shape (n_samples_new,) The corresponding label of `X_resampled` idx_under : ndarray, shape (n_samples, ) @@ -264,21 +263,20 @@ def _sample(self, X, y): """ self._validate_estimator() - X_resampled = np.empty((0, X.shape[1]), dtype=X.dtype) - y_resampled = np.empty((0, ), dtype=y.dtype) - if self.return_indices: - idx_under = np.empty((0, ), dtype=int) + idx_under = np.empty((0, ), dtype=int) target_stats = Counter(y) class_minority = min(target_stats, key=target_stats.get) + minority_class_indices = np.flatnonzero(y == class_minority) - self.nn_.fit(X[y == class_minority]) + self.nn_.fit(safe_indexing(X, minority_class_indices)) for target_class in np.unique(y): if target_class in self.ratio_.keys(): n_samples = self.ratio_[target_class] - X_class = X[y == target_class] - y_class = y[y == target_class] + target_class_indices = np.flatnonzero(y == target_class) + X_class = safe_indexing(X, target_class_indices) + y_class = safe_indexing(y, target_class_indices) if self.version == 1: dist_vec, idx_vec = self.nn_.kneighbors( @@ -295,10 +293,10 @@ def _sample(self, X, y): elif self.version == 3: self.nn_ver3_.fit(X_class) dist_vec, idx_vec = self.nn_ver3_.kneighbors( - X[y == class_minority]) + safe_indexing(X, minority_class_indices)) idx_vec_farthest = np.unique(idx_vec.reshape(-1)) - X_class_selected = X_class[idx_vec_farthest, :] - y_class_selected = y_class[idx_vec_farthest] + X_class_selected = safe_indexing(X_class, idx_vec_farthest) + y_class_selected = safe_indexing(y_class, idx_vec_farthest) dist_vec, idx_vec = self.nn_.kneighbors( X_class_selected, n_neighbors=self.nn_.n_neighbors) @@ -311,18 +309,12 @@ def _sample(self, X, y): else: index_target_class = slice(None) - X_resampled = np.concatenate( - (X_resampled, X[y == target_class][index_target_class]), - axis=0) - y_resampled = np.concatenate( - (y_resampled, y[y == target_class][index_target_class]), - axis=0) - if self.return_indices: - idx_under = np.concatenate( - (idx_under, np.flatnonzero(y == target_class)[ - index_target_class]), axis=0) + idx_under = np.concatenate( + (idx_under, np.flatnonzero(y == target_class)[ + index_target_class]), axis=0) if self.return_indices: - return X_resampled, y_resampled, idx_under + return (safe_indexing(X, idx_under), safe_indexing(y, idx_under), + idx_under) else: - return X_resampled, y_resampled + return safe_indexing(X, idx_under), safe_indexing(y, idx_under) diff --git a/imblearn/under_sampling/prototype_selection/neighbourhood_cleaning_rule.py b/imblearn/under_sampling/prototype_selection/neighbourhood_cleaning_rule.py index 45d19e34b..e9f16e6a8 100644 --- a/imblearn/under_sampling/prototype_selection/neighbourhood_cleaning_rule.py +++ b/imblearn/under_sampling/prototype_selection/neighbourhood_cleaning_rule.py @@ -11,9 +11,11 @@ import numpy as np from scipy.stats import mode +from sklearn.utils import safe_indexing + from 
..base import BaseCleaningSampler from .edited_nearest_neighbours import EditedNearestNeighbours -from ...utils import check_neighbors_object +from ...utils import check_neighbors_object, check_ratio SEL_KIND = ('all', 'mode') @@ -146,18 +148,19 @@ def _sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. - y_resampled : ndarray, shape (n_samples_new) + y_resampled : ndarray, shape (n_samples_new,) The corresponding label of `X_resampled` idx_under : ndarray, shape (n_samples, ) @@ -166,7 +169,6 @@ def _sample(self, X, y): """ self._validate_estimator() - enn = EditedNearestNeighbours(ratio=self.ratio, return_indices=True, random_state=self.random_state, size_ngh=self.size_ngh, @@ -187,8 +189,9 @@ def _sample(self, X, y): (n_samples > X.shape[0] * self.threshold_cleaning))] self.nn_.fit(X) - X_class = X[y == class_minority] - y_class = y[y == class_minority] + class_minority_indices = np.flatnonzero(y == class_minority) + X_class = safe_indexing(X, class_minority_indices) + y_class = safe_indexing(y, class_minority_indices) nnhood_idx = self.nn_.kneighbors( X_class, return_distance=False)[:, 1:] nnhood_label = y[nnhood_idx] @@ -211,7 +214,9 @@ def _sample(self, X, y): index_target_class = np.flatnonzero(selected_samples) if self.return_indices: - return (X[index_target_class], y[index_target_class], + return (safe_indexing(X, index_target_class), + safe_indexing(y, index_target_class), index_target_class) else: - return X[index_target_class], y[index_target_class] + return (safe_indexing(X, index_target_class), + safe_indexing(y, index_target_class)) diff --git a/imblearn/under_sampling/prototype_selection/one_sided_selection.py b/imblearn/under_sampling/prototype_selection/one_sided_selection.py index 1545300a4..ebba708ab 100644 --- a/imblearn/under_sampling/prototype_selection/one_sided_selection.py +++ b/imblearn/under_sampling/prototype_selection/one_sided_selection.py @@ -10,7 +10,7 @@ import numpy as np from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors -from sklearn.utils import check_random_state +from sklearn.utils import check_random_state, safe_indexing from ..base import BaseCleaningSampler from .tomek_links import TomekLinks @@ -174,10 +174,7 @@ def _sample(self, X, y): target_stats = Counter(y) class_minority = min(target_stats, key=target_stats.get) - X_resampled = np.empty((0, X.shape[1]), dtype=X.dtype) - y_resampled = np.empty((0, ), dtype=y.dtype) - if self.return_indices: - idx_under = np.empty((0, ), dtype=int) + idx_under = np.empty((0, ), dtype=int) for target_class in np.unique(y): if target_class in self.ratio_.keys(): @@ -186,56 +183,42 @@ def _sample(self, X, y): idx_maj_sample = idx_maj[random_state.randint( low=0, high=target_stats[target_class], size=self.n_seeds_S)] - maj_sample = X[idx_maj_sample] + + minority_class_indices = np.flatnonzero(y == class_minority) + C_indices = np.append(minority_class_indices, idx_maj_sample) # create the set composed of all minority samples and one # sample from the current class. 
- C_x = np.append(X[y == class_minority], maj_sample, axis=0) - C_y = np.append(y[y == class_minority], [target_class] * - self.n_seeds_S) + C_x = safe_indexing(X, C_indices) + C_y = safe_indexing(y, C_indices) # create the set S with removing the seed from S # since that it will be added anyway idx_maj_extracted = np.delete(idx_maj, idx_maj_sample, axis=0) - S_x = X[idx_maj_extracted] - S_y = y[idx_maj_extracted] + S_x = safe_indexing(X, idx_maj_extracted) + S_y = safe_indexing(y, idx_maj_extracted) self.estimator_.fit(C_x, C_y) pred_S_y = self.estimator_.predict(S_x) - sel_x = S_x[np.flatnonzero(pred_S_y != S_y), :] - sel_y = S_y[np.flatnonzero(pred_S_y != S_y)] - if self.return_indices: - idx_tmp = idx_maj_extracted[ - np.flatnonzero(pred_S_y != S_y)] - idx_under = np.concatenate( - (idx_under, idx_maj_sample, idx_tmp), axis=0) - X_resampled = np.concatenate( - (X_resampled, maj_sample, sel_x), axis=0) - y_resampled = np.concatenate( - (y_resampled, [target_class] * self.n_seeds_S, sel_y), - axis=0) + S_misclassified_indices = np.flatnonzero(pred_S_y != S_y) + idx_tmp = idx_maj_extracted[S_misclassified_indices] + idx_under = np.concatenate( + (idx_under, idx_maj_sample, idx_tmp), axis=0) else: - X_resampled = np.concatenate( - (X_resampled, X[y == target_class]), axis=0) - y_resampled = np.concatenate( - (y_resampled, y[y == target_class]), axis=0) - if self.return_indices: - idx_under = np.concatenate( - (idx_under, np.flatnonzero(y == target_class)), axis=0) - - # find the nearest neighbour of every point - nn = NearestNeighbors(n_neighbors=2, n_jobs=self.n_jobs) - nn.fit(X_resampled) - nns = nn.kneighbors(X_resampled, return_distance=False)[:, 1] - - links = TomekLinks.is_tomek(y_resampled, nns, - [c for c in np.unique(y) - if (c != class_minority and - c in self.ratio_.keys())]) + idx_under = np.concatenate( + (idx_under, np.flatnonzero(y == target_class)), axis=0) + + X_resampled = safe_indexing(X, idx_under) + y_resampled = safe_indexing(y, idx_under) + + # apply Tomek cleaning + tl = TomekLinks(ratio=self.ratio_, return_indices=True, + random_state=self.random_state) + X_cleaned, y_cleaned, idx_cleaned = tl.fit_sample(X_resampled, + y_resampled) + + idx_under = safe_indexing(idx_under, idx_cleaned) if self.return_indices: - return (X_resampled[np.logical_not(links)], - y_resampled[np.logical_not(links)], - idx_under[np.logical_not(links)]) + return (X_cleaned, y_cleaned, idx_under) else: - return (X_resampled[np.logical_not(links)], - y_resampled[np.logical_not(links)]) + return X_cleaned, y_cleaned diff --git a/imblearn/under_sampling/prototype_selection/random_under_sampler.py b/imblearn/under_sampling/prototype_selection/random_under_sampler.py index 6a3de5bef..e7a209fdd 100644 --- a/imblearn/under_sampling/prototype_selection/random_under_sampler.py +++ b/imblearn/under_sampling/prototype_selection/random_under_sampler.py @@ -7,7 +7,7 @@ from __future__ import division import numpy as np -from sklearn.utils import check_random_state +from sklearn.utils import check_random_state, safe_indexing from ..base import BaseUnderSampler @@ -94,18 +94,19 @@ def _sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) - Matrix containing the data to be sampled. + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. 
Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. - y_resampled : ndarray, shape (n_samples_new) + y_resampled : ndarray, shape (n_samples_new,) The corresponding label of `X_resampled` idx_under : ndarray, shape (n_samples, ) @@ -116,10 +117,7 @@ def _sample(self, X, y): """ random_state = check_random_state(self.random_state) - X_resampled = np.empty((0, X.shape[1]), dtype=X.dtype) - y_resampled = np.empty((0, ), dtype=y.dtype) - if self.return_indices: - idx_under = np.empty((0, ), dtype=int) + idx_under = np.empty((0, ), dtype=int) for target_class in np.unique(y): if target_class in self.ratio_.keys(): @@ -131,18 +129,12 @@ def _sample(self, X, y): else: index_target_class = slice(None) - X_resampled = np.concatenate( - (X_resampled, X[y == target_class][index_target_class]), - axis=0) - y_resampled = np.concatenate( - (y_resampled, y[y == target_class][index_target_class]), - axis=0) - if self.return_indices: - idx_under = np.concatenate( - (idx_under, np.flatnonzero(y == target_class)[ - index_target_class]), axis=0) + idx_under = np.concatenate( + (idx_under, np.flatnonzero(y == target_class)[ + index_target_class]), axis=0) if self.return_indices: - return X_resampled, y_resampled, idx_under + return (safe_indexing(X, idx_under), safe_indexing(y, idx_under), + idx_under) else: - return X_resampled, y_resampled + return safe_indexing(X, idx_under), safe_indexing(y, idx_under) diff --git a/imblearn/under_sampling/prototype_selection/tomek_links.py b/imblearn/under_sampling/prototype_selection/tomek_links.py index dba47ccb9..91b99f03b 100644 --- a/imblearn/under_sampling/prototype_selection/tomek_links.py +++ b/imblearn/under_sampling/prototype_selection/tomek_links.py @@ -9,6 +9,7 @@ import numpy as np from sklearn.neighbors import NearestNeighbors +from sklearn.utils import safe_indexing from ..base import BaseCleaningSampler @@ -143,18 +144,19 @@ def _sample(self, X, y): Parameters ---------- - X : ndarray, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Matrix containing the data which have to be sampled. - y : ndarray, shape (n_samples, ) + y : array-like, shape (n_samples,) Corresponding label for each sample in X. Returns ------- - X_resampled : ndarray, shape (n_samples_new, n_features) + X_resampled : {ndarray, sparse matrix}, shape \ +(n_samples_new, n_features) The array containing the resampled data. 
- y_resampled : ndarray, shape (n_samples_new) + y_resampled : ndarray, shape (n_samples_new,) The corresponding label of `X_resampled` idx_under : ndarray, shape (n_samples, ) @@ -169,9 +171,12 @@ def _sample(self, X, y): nns = nn.kneighbors(X, return_distance=False)[:, 1] links = self.is_tomek(y, nns, self.ratio_) + idx_under = np.flatnonzero(np.logical_not(links)) if self.return_indices: - return (X[np.logical_not(links)], y[np.logical_not(links)], - np.flatnonzero(np.logical_not(links))) + return (safe_indexing(X, idx_under), + safe_indexing(y, idx_under), + idx_under) else: - return X[np.logical_not(links)], y[np.logical_not(links)] + return (safe_indexing(X, idx_under), + safe_indexing(y, idx_under)) diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index 45a5c0b4e..2df602a08 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -11,21 +11,27 @@ from collections import Counter +import pytest + import numpy as np +from scipy import sparse from pytest import raises from sklearn.datasets import make_classification +from sklearn.cluster import KMeans from sklearn.utils.estimator_checks import _yield_all_checks \ as sklearn_yield_all_checks, check_estimator \ as sklearn_check_estimator, check_parameters_default_constructible from sklearn.exceptions import NotFittedError - +from sklearn.utils.testing import assert_allclose from sklearn.utils.testing import set_random_state from imblearn.base import SamplerMixin from imblearn.over_sampling.base import BaseOverSampler from imblearn.under_sampling.base import BaseCleaningSampler, BaseUnderSampler from imblearn.ensemble.base import BaseEnsembleSampler +from imblearn.over_sampling import SMOTE +from imblearn.under_sampling import NearMiss, ClusterCentroids from imblearn.utils.testing import warns @@ -38,6 +44,8 @@ def _yield_sampler_checks(name, Estimator): yield check_samplers_fit yield check_samplers_fit_sample yield check_samplers_ratio_fit_sample + yield check_samplers_sparse + yield check_samplers_pandas def _yield_all_checks(name, Estimator): @@ -244,3 +252,63 @@ def check_samplers_ratio_fit_sample(name, Sampler): X_res, y_res = sampler.fit_sample(X, y) y_ensemble = y_res[0] assert Counter(y_ensemble)[1] == expected_stat + + +def check_samplers_sparse(name, Sampler): + # check that sparse matrices can be passed through the sampler, + # leading to the same results as with dense matrices + X, y = make_classification(n_samples=1000, n_classes=3, + n_informative=4, weights=[0.2, 0.3, 0.5], + random_state=0) + X_sparse = sparse.csr_matrix(X) + if isinstance(Sampler(), SMOTE): + samplers = [Sampler(random_state=0, kind=kind) + for kind in ('regular', 'borderline1', + 'borderline2', 'svm')] + elif isinstance(Sampler(), NearMiss): + samplers = [Sampler(random_state=0, version=version) + for version in (1, 2, 3)] + elif isinstance(Sampler(), ClusterCentroids): + # set KMeans to 'full' since it supports both sparse and dense input + samplers = [Sampler(random_state=0, + estimator=KMeans(random_state=1, + algorithm='full'))] + else: + samplers = [Sampler(random_state=0)] + for sampler in samplers: + X_res_sparse, y_res_sparse = sampler.fit_sample(X_sparse, y) + X_res, y_res = sampler.fit_sample(X, y) + if not isinstance(sampler, BaseEnsembleSampler): + assert sparse.issparse(X_res_sparse) + assert_allclose(X_res_sparse.A, X_res) + assert_allclose(y_res_sparse, y_res) + else: + for x_sp, x, y_sp, y in zip(X_res_sparse, X_res, + y_res_sparse, y_res): + assert sparse.issparse(x_sp) + assert_allclose(x_sp.A, 
x) + assert_allclose(y_sp, y) + + +def check_samplers_pandas(name, Sampler): + pd = pytest.importorskip("pandas") + # Check that the samplers handle pandas DataFrame and Series inputs + X, y = make_classification(n_samples=1000, n_classes=3, + n_informative=4, weights=[0.2, 0.3, 0.5], + random_state=0) + X_pd, y_pd = pd.DataFrame(X), pd.Series(y) + if isinstance(Sampler(), SMOTE): + samplers = [Sampler(random_state=0, kind=kind) + for kind in ('regular', 'borderline1', + 'borderline2', 'svm')] + elif isinstance(Sampler(), NearMiss): + samplers = [Sampler(random_state=0, version=version) + for version in (1, 2, 3)] + else: + samplers = [Sampler(random_state=0)] + for sampler in samplers: + X_res_pd, y_res_pd = sampler.fit_sample(X_pd, y_pd) + X_res, y_res = sampler.fit_sample(X, y) + assert_allclose(X_res_pd, X_res) + assert_allclose(y_res_pd, y_res)
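
The refactor above is the same in every under-sampler touched by this diff: instead of growing ``X_resampled``/``y_resampled`` by concatenating slices of ``X`` class by class, the sampler accumulates integer positions in ``idx_under`` and selects rows once, at the very end, through ``sklearn.utils.safe_indexing``. That helper is what makes dense arrays, sparse matrices, and pandas objects interchangeable. A minimal sketch of its behaviour (assuming scikit-learn 0.19, where ``safe_indexing`` lives in ``sklearn.utils``)::

    import numpy as np
    from scipy import sparse
    from sklearn.utils import safe_indexing

    X = np.arange(12).reshape(4, 3)
    idx = np.array([2, 0])

    # One helper covers all three container types, which is why the
    # samplers above can index X a single time at the end of _sample.
    print(safe_indexing(X, idx))                       # ndarray rows 2 and 0
    print(safe_indexing(sparse.csr_matrix(X), idx).A)  # CSR in, CSR out
    try:
        import pandas as pd
        print(safe_indexing(pd.DataFrame(X), idx))     # positional selection
    except ImportError:
        pass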
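The per-class loop that replaces the old concatenation is likewise shared by ``RandomUnderSampler``, ``EditedNearestNeighbours``, ``NearMiss``, and ``InstanceHardnessThreshold``: ``np.flatnonzero`` to get the class positions, a per-class selection, one ``np.concatenate`` into ``idx_under``. Below is a toy standalone version of that pattern; the function name and its ``ratio`` argument are illustrative only, not part of the imbalanced-learn API::

    import numpy as np
    from sklearn.utils import safe_indexing

    def undersample_by_index(X, y, ratio, random_state=0):
        # ratio maps a class label to the number of samples to keep
        # (at most the class size); unlisted classes pass through whole.
        rng = np.random.RandomState(random_state)
        idx_under = np.empty((0, ), dtype=int)
        for target_class in np.unique(y):
            target_class_indices = np.flatnonzero(y == target_class)
            if target_class in ratio:
                index_target_class = rng.choice(
                    target_class_indices.size, size=ratio[target_class],
                    replace=False)
            else:
                index_target_class = slice(None)
            idx_under = np.concatenate(
                (idx_under, target_class_indices[index_target_class]),
                axis=0)
        # X and y are indexed exactly once, so sparse matrices and
        # pandas objects work as well as plain ndarrays.
        return safe_indexing(X, idx_under), safe_indexing(y, idx_under)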
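Finally, here is what the two new estimator checks guarantee for users, sketched with ``RandomUnderSampler`` (any sampler covered by this diff should behave the same once the change is applied; the printed class counts depend on the generated data, so they are not shown as doctest output)::

    from collections import Counter

    import pandas as pd
    from scipy import sparse
    from sklearn.datasets import make_classification
    from imblearn.under_sampling import RandomUnderSampler

    X, y = make_classification(n_samples=1000, n_classes=3,
                               n_informative=4, weights=[0.2, 0.3, 0.5],
                               random_state=0)
    rus = RandomUnderSampler(random_state=0)

    # Sparse input now comes back sparse instead of being rejected.
    X_res, y_res = rus.fit_sample(sparse.csr_matrix(X), y)
    print(sparse.issparse(X_res), sorted(Counter(y_res).items()))

    # pandas input is accepted too; note that fit_sample returns numpy
    # arrays, it does not preserve the DataFrame/Series containers.
    X_res, y_res = rus.fit_sample(pd.DataFrame(X), pd.Series(y))
    print(sorted(Counter(y_res).items()))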