diff --git a/circle.yml b/circle.yml index a8dde3168..4f815501e 100644 --- a/circle.yml +++ b/circle.yml @@ -23,7 +23,7 @@ dependencies: - sudo apt-get install build-essential python-dev python-setuptools # install numpy first as it is a compile time dependency for other packages - pip install --upgrade numpy - - pip install --upgrade scipy matplotlib setuptools nose coverage sphinx pillow sphinx-gallery sphinx_rtd_theme + - pip install --upgrade scipy matplotlib setuptools nose coverage pillow sphinx-gallery sphinx_rtd_theme sphinx==1.5.6 # Installing required packages for `make -C doc check command` to work. - sudo -E apt-get -yq update - sudo -E apt-get -yq --no-install-suggests --no-install-recommends --force-yes install dvipng texlive-latex-base texlive-latex-extra diff --git a/doc/_static/css/imbalanced-learn.css b/doc/_static/css/imbalanced-learn.css index b0b6f781d..7242f2dad 100644 --- a/doc/_static/css/imbalanced-learn.css +++ b/doc/_static/css/imbalanced-learn.css @@ -2,4 +2,19 @@ .highlight a { text-decoration: underline; -} \ No newline at end of file +} + +.deprecated p { + padding: 10px 7px 10px 10px; + color: #b94a48; + background-color: #F3E5E5; + border: 1px solid #eed3d7; +} + +.deprecated p span.versionmodified { + font-weight: bold; +} + +.wy-nav-content { + max-width: 1200px !important; +} diff --git a/doc/api.rst b/doc/api.rst index ce2495c3e..4cd509ed2 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -77,8 +77,8 @@ Combination of over- and under-sampling methods =============================================== .. automodule:: imblearn.combine - :no-members: - :no-inherited-members: + :no-members: + :no-inherited-members: .. currentmodule:: imblearn @@ -174,3 +174,6 @@ Utilities :toctree: generated/ utils.estimator_checks.check_estimator + utils.check_neighbors_object + utils.check_ratio + utils.hash_X_y diff --git a/doc/conf.py b/doc/conf.py index bb8f113c4..a6da820b9 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -31,6 +31,15 @@ pass # -- General configuration ------------------------------------------------ + +# If extensions (or modules to document with autodoc) are in another +# directory, add these directories to sys.path here. If the directory +# is relative to the documentation root, use os.path.abspath to make it +# absolute, like shown here. +sys.path.insert(0, os.path.abspath('sphinxext')) + +from github_link import make_linkcode_resolve + # If your documentation needs a minimal Sphinx version, state it here. # needs_sphinx = '1.0' @@ -39,9 +48,10 @@ # ones. extensions = [ 'sphinx.ext.autodoc', 'sphinx.ext.doctest', 'sphinx.ext.intersphinx', - 'sphinx.ext.todo', 'numpydoc', 'sphinx.ext.pngmath', 'sphinx.ext.ifconfig', + 'sphinx.ext.todo', 'sphinx.ext.pngmath', 'sphinx.ext.ifconfig', 'sphinx.ext.viewcode', 'sphinx_gallery.gen_gallery', - 'sphinx.ext.autosummary' + 'sphinx.ext.autosummary', 'numpydoc', + 'sphinx_issues', 'sphinx.ext.linkcode' ] autosummary_generate = True @@ -294,6 +304,13 @@ def generate_example_rst(app, what, name, obj, options, lines): open(examples_path, 'w').close() +# Config for sphinx_issues + +issues_uri = 'https://github.com/scikit-learn-contrib/imbalanced-learn/issues/{issue}' +issues_github_path = 'scikit-learn-contrib/imbalanced-learn' +issues_user_uri = 'https://github.com/{user}' + + def setup(app): app.connect('autodoc-process-docstring', generate_example_rst) @@ -312,3 +329,9 @@ def setup(app): # Example configuration for intersphinx: refer to the Python standard library. 
intersphinx_mapping = {'http://docs.python.org/': None} + +# The following is used by sphinx.ext.linkcode to provide links to github +linkcode_resolve = make_linkcode_resolve('imblearn', + u'https://github.com/scikit-learn-contrib/' + 'imbalanced-learn/blob/{revision}/' + '{package}/{path}#L{lineno}') diff --git a/doc/sphinxext/LICENSE.txt b/doc/sphinxext/LICENSE.txt new file mode 100644 index 000000000..e00efc31e --- /dev/null +++ b/doc/sphinxext/LICENSE.txt @@ -0,0 +1,97 @@ +------------------------------------------------------------------------------- + The files + - numpydoc.py + - autosummary.py + - autosummary_generate.py + - docscrape.py + - docscrape_sphinx.py + - phantom_import.py + have the following license: + +Copyright (C) 2008 Stefan van der Walt , Pauli Virtanen + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + +THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, +INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING +IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +------------------------------------------------------------------------------- + The files + - compiler_unparse.py + - comment_eater.py + - traitsdoc.py + have the following license: + +This software is OSI Certified Open Source Software. +OSI Certified is a certification mark of the Open Source Initiative. + +Copyright (c) 2006, Enthought, Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of Enthought, Inc. nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +------------------------------------------------------------------------------- + The files + - only_directives.py + - plot_directive.py + originate from Matplotlib (http://matplotlib.sf.net/) which has + the following license: + +Copyright (c) 2002-2008 John D. Hunter; All Rights Reserved. + +1. This LICENSE AGREEMENT is between John D. Hunter (“JDH”), and the Individual or Organization (“Licensee”) accessing and otherwise using matplotlib software in source or binary form and its associated documentation. + +2. Subject to the terms and conditions of this License Agreement, JDH hereby grants Licensee a nonexclusive, royalty-free, world-wide license to reproduce, analyze, test, perform and/or display publicly, prepare derivative works, distribute, and otherwise use matplotlib 0.98.3 alone or in any derivative version, provided, however, that JDH’s License Agreement and JDH’s notice of copyright, i.e., “Copyright (c) 2002-2008 John D. Hunter; All Rights Reserved” are retained in matplotlib 0.98.3 alone or in any derivative version prepared by Licensee. + +3. In the event Licensee prepares a derivative work that is based on or incorporates matplotlib 0.98.3 or any part thereof, and wants to make the derivative work available to others as provided herein, then Licensee hereby agrees to include in any such work a brief summary of the changes made to matplotlib 0.98.3. + +4. JDH is making matplotlib 0.98.3 available to Licensee on an “AS IS” basis. JDH MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, JDH MAKES NO AND DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF MATPLOTLIB 0.98.3 WILL NOT INFRINGE ANY THIRD PARTY RIGHTS. + +5. JDH SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF MATPLOTLIB 0.98.3 FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING MATPLOTLIB 0.98.3, OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. + +6. This License Agreement will automatically terminate upon a material breach of its terms and conditions. + +7. Nothing in this License Agreement shall be deemed to create any relationship of agency, partnership, or joint venture between JDH and Licensee. This License Agreement does not grant permission to use JDH trademarks or trade name in a trademark sense to endorse or promote products or services of Licensee, or any third party. + +8. By copying, installing or otherwise using matplotlib 0.98.3, Licensee agrees to be bound by the terms and conditions of this License Agreement. 
+ diff --git a/doc/sphinxext/MANIFEST.in b/doc/sphinxext/MANIFEST.in new file mode 100644 index 000000000..f88ed785c --- /dev/null +++ b/doc/sphinxext/MANIFEST.in @@ -0,0 +1,2 @@ +recursive-include tests *.py +include *.txt diff --git a/doc/sphinxext/README.txt b/doc/sphinxext/README.txt new file mode 100644 index 000000000..455a709fb --- /dev/null +++ b/doc/sphinxext/README.txt @@ -0,0 +1,52 @@ +===================================== +numpydoc -- Numpy's Sphinx extensions +===================================== + +Numpy's documentation uses several custom extensions to Sphinx. These +are shipped in this ``numpydoc`` package, in case you want to make use +of them in third-party projects. + +The following extensions are available: + + - ``numpydoc``: support for the Numpy docstring format in Sphinx, and add + the code description directives ``np-function``, ``np-cfunction``, etc. + that support the Numpy docstring syntax. + + - ``numpydoc.traitsdoc``: For gathering documentation about Traits attributes. + + - ``numpydoc.plot_directives``: Adaptation of Matplotlib's ``plot::`` + directive. Note that this implementation may still undergo severe + changes or eventually be deprecated. + + - ``numpydoc.only_directives``: (DEPRECATED) + + - ``numpydoc.autosummary``: (DEPRECATED) An ``autosummary::`` directive. + Available in Sphinx 0.6.2 and (to-be) 1.0 as ``sphinx.ext.autosummary``, + and it the Sphinx 1.0 version is recommended over that included in + Numpydoc. + + +numpydoc +======== + +Numpydoc inserts a hook into Sphinx's autodoc that converts docstrings +following the Numpy/Scipy format to a form palatable to Sphinx. + +Options +------- + +The following options can be set in conf.py: + +- numpydoc_use_plots: bool + + Whether to produce ``plot::`` directives for Examples sections that + contain ``import matplotlib``. + +- numpydoc_show_class_members: bool + + Whether to show all members of a class in the Methods and Attributes + sections automatically. + +- numpydoc_edit_link: bool (DEPRECATED -- edit your HTML template instead) + + Whether to insert an edit link after docstrings. diff --git a/doc/sphinxext/github_link.py b/doc/sphinxext/github_link.py new file mode 100644 index 000000000..38d048687 --- /dev/null +++ b/doc/sphinxext/github_link.py @@ -0,0 +1,84 @@ +from operator import attrgetter +import inspect +import subprocess +import os +import sys +from functools import partial + +REVISION_CMD = 'git rev-parse --short HEAD' + + +def _get_git_revision(): + try: + revision = subprocess.check_output(REVISION_CMD.split()).strip() + except (subprocess.CalledProcessError, OSError): + print('Failed to execute git to get revision') + return None + return revision.decode('utf-8') + + +def _linkcode_resolve(domain, info, package, url_fmt, revision): + """Determine a link to online source for a class/method/function + + This is called by sphinx.ext.linkcode + + An example with a long-untouched module that everyone has + >>> _linkcode_resolve('py', {'module': 'tty', + ... 'fullname': 'setraw'}, + ... package='tty', + ... url_fmt='http://hg.python.org/cpython/file/' + ... '{revision}/Lib/{package}/{path}#L{lineno}', + ... 
revision='xxxx') + 'http://hg.python.org/cpython/file/xxxx/Lib/tty/tty.py#L18' + """ + + if revision is None: + return + if domain not in ('py', 'pyx'): + return + if not info.get('module') or not info.get('fullname'): + return + + class_name = info['fullname'].split('.')[0] + if type(class_name) != str: + # Python 2 only + class_name = class_name.encode('utf-8') + module = __import__(info['module'], fromlist=[class_name]) + obj = attrgetter(info['fullname'])(module) + + try: + fn = inspect.getsourcefile(obj) + except Exception: + fn = None + if not fn: + try: + fn = inspect.getsourcefile(sys.modules[obj.__module__]) + except Exception: + fn = None + if not fn: + return + + fn = os.path.relpath(fn, + start=os.path.dirname(__import__(package).__file__)) + try: + lineno = inspect.getsourcelines(obj)[1] + except Exception: + lineno = '' + return url_fmt.format(revision=revision, package=package, + path=fn, lineno=lineno) + + +def make_linkcode_resolve(package, url_fmt): + """Returns a linkcode_resolve function for the given URL format + + revision is a git commit reference (hash or name) + + package is the name of the root module of the package + + url_fmt is along the lines of ('https://github.com/USER/PROJECT/' + 'blob/{revision}/{package}/' + '{path}#L{lineno}') + """ + revision = _get_git_revision() + return partial(_linkcode_resolve, revision=revision, package=package, + url_fmt=url_fmt) diff --git a/doc/sphinxext/sphinx_issues.py b/doc/sphinxext/sphinx_issues.py new file mode 100644 index 000000000..f4b8c9346 --- /dev/null +++ b/doc/sphinxext/sphinx_issues.py @@ -0,0 +1,113 @@ +# -*- coding: utf-8 -*- +"""A Sphinx extension for linking to your project's issue tracker. + +Copyright 2014 Steven Loria + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +from docutils import nodes, utils +from sphinx.util.nodes import split_explicit_title + +__version__ = '0.2.0' +__author__ = 'Steven Loria' +__license__ = 'MIT' + + +def user_role(name, rawtext, text, lineno, + inliner, options=None, content=None): + """Sphinx role for linking to a user profile. Defaults to linking to + Github profiles, but the profile URIS can be configured via the + ``issues_user_uri`` config value. 
+ +    Example: :: + +        :user:`sloria` +    """ +    options = options or {} +    content = content or [] +    has_explicit_title, title, target = split_explicit_title(text) + +    target = utils.unescape(target).strip() +    title = utils.unescape(title).strip() +    config = inliner.document.settings.env.app.config +    if config.issues_user_uri: +        ref = config.issues_user_uri.format(user=target) +    else: +        ref = 'https://github.com/{0}'.format(target) +    if has_explicit_title: +        text = title +    else: +        text = '@{0}'.format(target) + +    link = nodes.reference(text=text, refuri=ref, **options) +    return [link], [] + + +def _make_issue_node(issue_no, config, options=None): +    options = options or {} +    if issue_no not in ('-', '0'): +        if config.issues_uri: +            ref = config.issues_uri.format(issue=issue_no) +        elif config.issues_github_path: +            ref = 'https://github.com/{0}/issues/{1}'.format( +                config.issues_github_path, issue_no +            ) +        issue_text = '#{0}'.format(issue_no) +        link = nodes.reference(text=issue_text, refuri=ref, **options) +    else: +        link = None +    return link + + +def issue_role(name, rawtext, text, lineno, +               inliner, options=None, content=None): +    """Sphinx role for linking to an issue. Must have +    `issues_uri` or `issues_github_path` configured in ``conf.py``. + +    Examples: :: + +        :issue:`123` +        :issue:`42,45` +    """ +    options = options or {} +    content = content or [] +    issue_nos = [each.strip() for each in utils.unescape(text).split(',')] +    config = inliner.document.settings.env.app.config +    ret = [] +    for i, issue_no in enumerate(issue_nos): +        node = _make_issue_node(issue_no, config, options=options) +        ret.append(node) +        if i != len(issue_nos) - 1: +            sep = nodes.raw(text=', ', format='html') +            ret.append(sep) +    return ret, [] + + +def setup(app): +    # Format template for issues URI +    # e.g. 'https://github.com/sloria/marshmallow/issues/{issue}' +    app.add_config_value('issues_uri', default=None, rebuild='html') +    # Shortcut for Github, e.g. 'sloria/marshmallow' +    app.add_config_value('issues_github_path', default=None, rebuild='html') +    # Format template for user profile URI +    # e.g. 'https://github.com/{user}' +    app.add_config_value('issues_user_uri', default=None, rebuild='html') +    app.add_role('issue', issue_role) +    app.add_role('user', user_role) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index eb467c528..b5f7cc2f2 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -14,6 +14,8 @@ Bug fixes - Fixed a bug in :class:`under_sampling.NearMiss` version 3. The indices returned were wrong. By `Guillaume Lemaitre`_. +- Fixed a bug in :class:`ensemble.BalanceCascade` and :class:`combine.SMOTEENN` + and :class:`combine.SMOTETomek`. By `Guillaume Lemaitre`_. New features ~~~~~~~~~~~~ @@ -32,6 +34,7 @@ Enhancement `Guillaume Lemaitre`_ - Remove seaborn dependence and improve the examples. By `Guillaume Lemaitre`_. +- Adapt all classes to multi-class resampling. By `Guillaume Lemaitre`_. API changes summary ~~~~~~~~~~~~~~~~~~~ @@ -45,7 +48,14 @@ API changes summary - move the under-sampling methods in `prototype_selection` and `prototype_generation` submodule to make a clearer distinction. By `Guillaume Lemaitre`_. +- Change `ratio` such that it can adapt to multi-class problems. By + `Guillaume Lemaitre`_. + +Deprecation +~~~~~~~~~~~ +- Deprecate the use of float as ratio in favor of dictionary, string, or + callable. By `Guillaume Lemaitre`_. ..
_changes_0_2: diff --git a/imblearn/base.py b/imblearn/base.py index b9aedd561..a22c8e0e0 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -7,21 +7,18 @@ from __future__ import division import logging -import warnings -from numbers import Real from abc import ABCMeta, abstractmethod -from collections import Counter -import numpy as np from sklearn.base import BaseEstimator from sklearn.externals import six -from sklearn.utils import check_X_y -from sklearn.utils.multiclass import type_of_target +from sklearn.utils import check_X_y, check_random_state from sklearn.utils.validation import check_is_fitted +from .utils import check_ratio, check_target_type, hash_X_y + class SamplerMixin(six.with_metaclass(ABCMeta, BaseEstimator)): - """Mixin class for samplers with abstact method. + """Mixin class for samplers with abstract method. Warning: This class should not be used directly. Use the derive classes instead. @@ -29,76 +26,12 @@ class SamplerMixin(six.with_metaclass(ABCMeta, BaseEstimator)): _estimator_type = 'sampler' - def fit(self, X, y): - """Find the classes statistics before to perform sampling. - - Parameters - ---------- - X : ndarray, shape (n_samples, n_features) - Matrix containing the data which have to be sampled. - - y : ndarray, shape (n_samples, ) - Corresponding label for each sample in X. - - Returns - ------- - self : object, - Return self. - - """ - - # Check the consistency of X and y - X, y = check_X_y(X, y) - - self.min_c_ = None - self.maj_c_ = None - self.stats_c_ = {} - self.X_shape_ = None - - if hasattr(self, 'ratio'): - self._validate_ratio() - - if hasattr(self, 'size_ngh'): - self._validate_size_ngh_deprecation() - elif hasattr(self, 'k') and not hasattr(self, 'm'): - self._validate_k_deprecation() - elif hasattr(self, 'k') and hasattr(self, 'm'): - self._validate_k_m_deprecation() - - self.logger.info('Compute classes statistics ...') - - # Raise an error if there is only one class - if np.unique(y).size <= 1: - raise ValueError("Sampler can't balance when only one class is" - " present.") - - # Store the size of X to check at sampling time if we have the - # same data - self.X_shape_ = X.shape - - # Create a dictionary containing the class statistics - self.stats_c_ = Counter(y) - - # Find the minority and majority classes - self.min_c_ = min(self.stats_c_, key=self.stats_c_.get) - self.maj_c_ = max(self.stats_c_, key=self.stats_c_.get) - - self.logger.info('%s classes detected: %s', - np.unique(y).size, self.stats_c_) - - # Check if the ratio provided at initialisation make sense - if isinstance(self.ratio, Real): - if self.ratio < (self.stats_c_[self.min_c_] / - self.stats_c_[self.maj_c_]): - raise RuntimeError('The ratio requested at initialisation' - ' should be greater or equal than the' - ' balancing ratio of the current data.' - ' Got {} < {}.'.format( - self.ratio, - self.stats_c_[self.min_c_] / - self.stats_c_[self.maj_c_])) - - return self + def _check_X_y(self, X, y): + """Private function to check that the X and y in fitting are the same + than in sampling.""" + X_hash, y_hash = hash_X_y(X, y) + if self.X_hash_ != X_hash or self.y_hash_ != y_hash: + raise RuntimeError("X and y need to be same array earlier fitted.") def sample(self, X, y): """Resample the dataset. 
@@ -124,25 +57,8 @@ def sample(self, X, y): # Check the consistency of X and y X, y = check_X_y(X, y) - # Check that the data have been fitted - check_is_fitted(self, 'stats_c_') - - # Check if the size of the data is identical than at fitting - if X.shape != self.X_shape_: - raise RuntimeError('The data that you attempt to resample do not' - ' seem to be the one earlier fitted. Use the' - ' fitted data. Shape of data is {}, got {}' - ' instead.'.format(X.shape, self.X_shape_)) - - if hasattr(self, 'ratio'): - self._validate_ratio() - - if hasattr(self, 'size_ngh'): - self._validate_size_ngh_deprecation() - elif hasattr(self, 'k') and not hasattr(self, 'm'): - self._validate_k_deprecation() - elif hasattr(self, 'k') and hasattr(self, 'm'): - self._validate_k_m_deprecation() + check_is_fitted(self, 'ratio_') + self._check_X_y(X, y) return self._sample(X, y) @@ -169,56 +85,6 @@ def fit_sample(self, X, y): return self.fit(X, y).sample(X, y) - def _validate_ratio(self): - # The ratio correspond to the number of samples in the minority class - # over the number of samples in the majority class. Thus, the ratio - # cannot be greater than 1.0 - if isinstance(self.ratio, Real): - if self.ratio > 1: - raise ValueError('Ratio cannot be greater than one.' - ' Got {}.'.format(self.ratio)) - elif self.ratio <= 0: - raise ValueError('Ratio cannot be negative.' - ' Got {}.'.format(self.ratio)) - - elif isinstance(self.ratio, six.string_types): - if self.ratio != 'auto': - raise ValueError("Unknown string for the parameter ratio." - " Got {} instead of 'auto'".format( - self.ratio)) - else: - raise ValueError('Unknown parameter type for ratio.' - ' Got {} instead of float or str'.format( - type(self.ratio))) - - def _validate_size_ngh_deprecation(self): - "Private function to warn about the deprecation about size_ngh." - - # Announce deprecation if necessary - if self.size_ngh is not None: - warnings.warn('`size_ngh` will be replaced in version 0.4. Use' - ' `n_neighbors` instead.', DeprecationWarning) - self.n_neighbors = self.size_ngh - - def _validate_k_deprecation(self): - """Private function to warn about deprecation of k in ADASYN""" - if self.k is not None: - warnings.warn('`k` will be replaced in version 0.4. Use' - ' `n_neighbors` instead.', DeprecationWarning) - self.n_neighbors = self.k - - def _validate_k_m_deprecation(self): - """Private function to warn about deprecation of k in ADASYN""" - if self.k is not None: - warnings.warn('`k` will be replaced in version 0.4. Use' - ' `k_neighbors` instead.', DeprecationWarning) - self.k_neighbors = self.k - - if self.m is not None: - warnings.warn('`m` will be replaced in version 0.4. Use' - ' `m_neighbors` instead.', DeprecationWarning) - self.m_neighbors = self.m - @abstractmethod def _sample(self, X, y): """Resample the dataset. @@ -254,105 +120,17 @@ def __setstate__(self, dict): self.logger = logger -class BaseBinarySampler(six.with_metaclass(ABCMeta, SamplerMixin)): - """Base class for all binary class sampler. - - Warning: This class should not be used directly. Use derived classes - instead. - - """ - - def __init__(self, ratio='auto', random_state=None): - """Initialize this object and its instance variables. - - Parameters - ---------- - ratio : str or float, optional (default='auto') - If 'auto', the ratio will be defined automatically to balanced - the dataset. Otherwise, the ratio will corresponds to the number - of samples in the minority class over the the number of samples - in the majority class. 
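For illustration, the fit/sample contract that results from the new hash check can be sketched as follows (a sketch assuming the 0.3 API introduced by this patch; RandomUnderSampler stands in for any sampler built on the new base class, and the error message is the one defined in ``_check_X_y`` above)::

    >>> import numpy as np
    >>> from imblearn.under_sampling import RandomUnderSampler
    >>> X = np.array([[1., 0.], [0., 1.], [1., 1.], [0., 0.], [0.5, 0.5]])
    >>> y = np.array([0, 0, 0, 1, 1])
    >>> rus = RandomUnderSampler(random_state=0).fit(X, y)  # stores ratio_ and the X/y hashes
    >>> X_res, y_res = rus.sample(X, y)  # same arrays as in fit: accepted
    >>> rus.sample(X[:-1], y[:-1])  # different data: rejected by _check_X_y
    Traceback (most recent call last):
        ...
    RuntimeError: X and y need to be same array earlier fitted.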
- -    random_state : int, RandomState or None, optional (default=None) - -        - If int, random_state is the seed used by the random number -          generator; -        - If RandomState instance, random_state is the random number -          generator; -        - If None, the random number generator is the RandomState instance -          used by np.random. - -        Returns -        ------- -        None - -        """ -        self.ratio = ratio -        self.random_state = random_state -        self.logger = logging.getLogger(__name__) - -    def fit(self, X, y): -        """Find the classes statistics before to perform sampling. - -        Parameters -        ---------- -        X : ndarray, shape (n_samples, n_features) -            Matrix containing the data which have to be sampled. - -        y : ndarray, shape (n_samples, ) -            Corresponding label for each sample in X. - -        Returns -        ------- -        self : object, -            Return self. - -        """ - -        super(BaseBinarySampler, self).fit(X, y) - -        # Check that the target type is binary -        if not type_of_target(y) == 'binary': -            warnings.simplefilter('always', UserWarning) -            warnings.warn('The target type should be binary.') - -        return self - +class BaseSampler(SamplerMixin): +    """Base class for sampling algorithms. -class BaseMulticlassSampler(six.with_metaclass(ABCMeta, SamplerMixin)): -    """Base class for all multiclass sampler. - -    Warning: This class should not be used directly. Use derived classes +    Warning: This class should not be used directly. Use the derived classes instead. - -    """ - -    def __init__(self, ratio='auto', random_state=None): -        """Initialize this object and its instance variables. - -        Parameters -        ---------- -        ratio : str or float, optional (default='auto') -            If 'auto', the ratio will be defined automatically to balanced -            the dataset. Otherwise, the ratio will corresponds to the number -            of samples in the minority class over the the number of samples -            in the majority class. - -        random_state : int, RandomState or None, optional (default=None) - -            - If int, random_state is the seed used by the random number -              generator; -            - If RandomState instance, random_state is the random number -              generator; -            - If None, the random number generator is the RandomState instance -              used by np.random. - -        Returns -        ------- -        None - -        """ +    def __init__(self, ratio='auto', random_state=None, sampling_type=None): self.ratio = ratio self.random_state = random_state +        self.sampling_type = sampling_type self.logger = logging.getLogger(__name__) def fit(self, X, y): @@ -372,13 +150,10 @@ def fit(self, X, y): Return self.
""" - - super(BaseMulticlassSampler, self).fit(X, y) - - # Check that the target type is either binary or multiclass - if not (type_of_target(y) == 'binary' or - type_of_target(y) == 'multiclass'): - warnings.simplefilter('always', UserWarning) - warnings.warn('The target type should be binary or multiclass.') + X, y = check_X_y(X, y) + y = check_target_type(y) + self.X_hash_, self.y_hash_ = hash_X_y(X, y) + # self.sampling_type is already checked in check_ratio + self.ratio_ = check_ratio(self.ratio, y, self._sampling_type) return self diff --git a/imblearn/combine/smote_enn.py b/imblearn/combine/smote_enn.py index c1237a1a5..92e30ac74 100644 --- a/imblearn/combine/smote_enn.py +++ b/imblearn/combine/smote_enn.py @@ -4,120 +4,135 @@ # Christos Aridas # License: MIT -from __future__ import division, print_function +from __future__ import division +import logging import warnings -from ..base import BaseBinarySampler +from sklearn.utils import check_X_y + +from ..base import SamplerMixin from ..over_sampling import SMOTE from ..under_sampling import EditedNearestNeighbours +from ..utils import check_target_type, hash_X_y -class SMOTEENN(BaseBinarySampler): +class SMOTEENN(SamplerMixin): """Class to perform over-sampling using SMOTE and cleaning using ENN. Combine over- and under-sampling using SMOTE and Edited Nearest Neighbours. Parameters ---------- - ratio : str or float, optional (default=None) - If 'auto', the ratio will be defined automatically to balance - the dataset. Otherwise, the ratio is defined as the - number of samples in the minority class over the the number of - samples in the majority class. + ratio : str, dict, or callable, optional (default='auto') + Ratio to use for resampling the data set. + + - If ``str``, has to be one of: (i) ``'minority'``: resample the + minority class; (ii) ``'majority'``: resample the majority class, + (iii) ``'not minority'``: resample all classes apart of the minority + class, (iv) ``'all'``: resample all classes, and (v) ``'auto'``: + correspond to ``'all'`` with for over-sampling methods and ``'not + minority'`` for under-sampling methods. The classes targeted will be + over-sampled or under-sampled to achieve an equal number of sample + with the majority or minority class. + - If ``dict``, the keys correspond to the targeted classes. The values + correspond to the desired number of samples. + - If callable, function taking ``y`` and returns a ``dict``. The keys + correspond to the targeted classes. The values correspond to the + desired number of samples. random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by np.random. + If int, ``random_state`` is the seed used by the random number + generator; If ``RandomState`` instance, random_state is the random + number generator; If ``None``, the random number generator is the + ``RandomState`` instance used by ``np.random``. smote : object, optional (default=SMOTE()) - The SMOTE object to use. If not given, a SMOTE object with default - parameters will be given. + The :class:`imblearn.over_sampling.SMOTE` object to use. If not given, + a :class:`imblearn.over_sampling.SMOTE` object with default parameters + will be given. enn : object, optional (default=EditedNearestNeighbours()) - The ENN object to use. 
If not given, an EditedNearestNeighbours object -        with default parameters will be given. +        The :class:`imblearn.under_sampling.EditedNearestNeighbours` object to +        use. If not given, an +        :class:`imblearn.under_sampling.EditedNearestNeighbours` object with +        default parameters will be given. k : int, optional (default=None) Number of nearest neighbours to used to construct synthetic samples. -        NOTE: `k` is deprecated from 0.2 and will be replaced in 0.4 -        Give directly a SMOTE object. +        .. deprecated:: 0.2 +           ``k`` is deprecated from 0.2 and will be replaced in 0.4. +           Give directly a :class:`imblearn.over_sampling.SMOTE` object. m : int, optional (default=None) Number of nearest neighbours to use to determine if a minority sample is in danger. -        NOTE: `m` is deprecated from 0.2 and will be replaced in 0.4 -        Give directly a SMOTE object. +        .. deprecated:: 0.2 +           ``m`` is deprecated from 0.2 and will be replaced in 0.4. +           Give directly a :class:`imblearn.over_sampling.SMOTE` object. out_step : float, optional (default=None) Step size when extrapolating. -        NOTE: `out_step` is deprecated from 0.2 and will be replaced in 0.4 -        Give directly a SMOTE object. +        .. deprecated:: 0.2 +           ``out_step`` is deprecated from 0.2 and will be replaced in 0.4. +           Give directly a :class:`imblearn.over_sampling.SMOTE` object. kind_smote : str, optional (default=None) The type of SMOTE algorithm to use one of the following -        options: 'regular', 'borderline1', 'borderline2', 'svm'. +        options: ``'regular'``, ``'borderline1'``, ``'borderline2'``, +        ``'svm'``. -        NOTE: `kind_smote` is deprecated from 0.2 and will be replaced in 0.4 -        Give directly a SMOTE object. +        .. deprecated:: 0.2 +           ``kind_smote`` is deprecated from 0.2 and will be replaced in 0.4. +           Give directly a :class:`imblearn.over_sampling.SMOTE` object. size_ngh : int, optional (default=None) Size of the neighbourhood to consider to compute the average distance to the minority point samples. -        NOTE: size_ngh is deprecated from 0.2 and will be replaced in 0.4 -        Use ``n_neighbors`` instead. +        .. deprecated:: 0.2 +           ``size_ngh`` is deprecated from 0.2 and will be replaced in 0.4. +           Use ``n_neighbors`` instead. n_neighbors : int, optional (default=None) Size of the neighbourhood to consider to compute the average distance to the minority point samples. -        NOTE: `n_neighbors` is deprecated from 0.2 and will be replaced in 0.4 -        Give directly a EditedNearestNeighbours object. +        .. deprecated:: 0.2 +           ``n_neighbors`` is deprecated from 0.2 and will be replaced in 0.4. +           Give directly a +           :class:`imblearn.under_sampling.EditedNearestNeighbours` object. kind_sel : str, optional (default=None) Strategy to use in order to exclude samples. -        - If 'all', all neighbours will have to agree with the samples of -          interest to not be excluded. -        - If 'mode', the majority vote of the neighbours will be used in -          order to exclude a sample. +        - If ``'all'``, all neighbours will have to agree with the samples of +          interest to not be excluded. +        - If ``'mode'``, the majority vote of the neighbours will be used in +          order to exclude a sample. -        NOTE: `kind_sel` is deprecated from 0.2 and will be replaced in 0.4 -        Give directly a EditedNearestNeighbours object. +        .. deprecated:: 0.2 +           ``kind_sel`` is deprecated from 0.2 and will be replaced in 0.4. Give +           directly a :class:`imblearn.under_sampling.EditedNearestNeighbours` +           object. n_jobs : int, optional (default=None) The number of threads to open if possible.
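The ``ratio`` semantics documented above are resolved by the new ``imblearn.utils.check_ratio`` helper that this patch also adds to the API reference. Below is a sketch of the expected resolution with made-up class counts; the sampling-type strings and the returned dictionaries are assumptions based on the docstring above::

    >>> import numpy as np
    >>> from imblearn.utils import check_ratio
    >>> y = np.array([0] * 10 + [1] * 4 + [2] * 6)
    >>> # over-sampling: 'auto' behaves like 'all'; values are samples to add
    >>> check_ratio('auto', y, 'over-sampling') == {0: 0, 1: 6, 2: 4}
    True
    >>> # under-sampling: 'auto' behaves like 'not minority'; values are the
    >>> # numbers of samples to keep, and the minority class is left untouched
    >>> check_ratio('auto', y, 'under-sampling') == {0: 4, 2: 4}
    True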
- NOTE: `n_jobs` is deprecated from 0.2 and will be replaced in 0.4 - Give directly a SMOTE and EditedNearestNeighbours object. - - Attributes - ---------- - min_c_ : str or int - The identifier of the minority class. - - max_c_ : str or int - The identifier of the majority class. - - stats_c_ : dict of str/int : int - A dictionary in which the number of occurences of each class is - reported. - - X_shape_ : tuple of int - Shape of the data `X` during fitting. + .. deprecated:: 0.2 + `n_jobs` is deprecated from 0.2 and will be replaced in 0.4 Give + directly a :class:`imblearn.over_sampling.SMOTE` and + :class:`imblearn.under_sampling.EditedNearestNeighbours` object. Notes ----- The method is presented in [1]_. - This class does not support mutli-class. + Supports mutli-class resampling. Examples -------- @@ -156,8 +171,9 @@ def __init__(self, n_neighbors=None, kind_enn=None, n_jobs=None): - - super(SMOTEENN, self).__init__(ratio=ratio, random_state=random_state) + super(SMOTEENN, self).__init__() + self.ratio = ratio + self.random_state = random_state self.smote = smote self.enn = enn self.k = k @@ -168,6 +184,7 @@ def __init__(self, self.n_neighbors = n_neighbors self.kind_enn = kind_enn self.n_jobs = n_jobs + self.logger = logging.getLogger(__name__) def _validate_estimator(self): "Private function to validate SMOTE and ENN objects" @@ -230,6 +247,7 @@ def _validate_estimator(self): if self.n_jobs is None: self.n_jobs = 1 self.enn_ = EditedNearestNeighbours( + ratio='all', random_state=self.random_state, size_ngh=self.size_ngh, n_neighbors=self.n_neighbors, @@ -244,7 +262,8 @@ def _validate_estimator(self): ' Got {} instead.'.format(type(self.enn))) # Otherwise create a default EditedNearestNeighbours else: - self.enn_ = EditedNearestNeighbours(random_state=self.random_state) + self.enn_ = EditedNearestNeighbours(ratio='all', + random_state=self.random_state) def fit(self, X, y): """Find the classes statistics before to perform sampling. @@ -263,13 +282,10 @@ def fit(self, X, y): Return self. """ - - super(SMOTEENN, self).fit(X, y) - - self._validate_estimator() - - # Fit using SMOTE - self.smote_.fit(X, y) + X, y = check_X_y(X, y) + y = check_target_type(y) + self.ratio_ = self.ratio + self.X_hash_, self.y_hash_ = hash_X_y(X, y) return self @@ -293,9 +309,7 @@ def _sample(self, X, y): The corresponding label of `X_resampled` """ + self._validate_estimator() - # Transform using SMOTE - X, y = self.smote_.sample(X, y) - - # Fit and transform using ENN - return self.enn_.fit_sample(X, y) + X_res, y_res = self.smote_.fit_sample(X, y) + return self.enn_.fit_sample(X_res, y_res) diff --git a/imblearn/combine/smote_tomek.py b/imblearn/combine/smote_tomek.py index 3b68b0d7b..fc1e68e04 100644 --- a/imblearn/combine/smote_tomek.py +++ b/imblearn/combine/smote_tomek.py @@ -5,16 +5,20 @@ # Christos Aridas # License: MIT -from __future__ import division, print_function +from __future__ import division +import logging import warnings -from ..base import BaseBinarySampler +from sklearn.utils import check_X_y + +from ..base import SamplerMixin from ..over_sampling import SMOTE from ..under_sampling import TomekLinks +from ..utils import check_target_type, hash_X_y -class SMOTETomek(BaseBinarySampler): +class SMOTETomek(SamplerMixin): """Class to perform over-sampling using SMOTE and cleaning using Tomek links. 
@@ -22,79 +26,83 @@ class SMOTETomek(BaseBinarySampler): Parameters ---------- -    ratio : str or float, optional (default='auto') -        If 'auto', the ratio will be defined automatically to balance -        the dataset. Otherwise, the ratio is defined as the -        number of samples in the minority class over the the number of -        samples in the majority class. +    ratio : str, dict, or callable, optional (default='auto') +        Ratio to use for resampling the data set. + +        - If ``str``, has to be one of: (i) ``'minority'``: resample the +          minority class; (ii) ``'majority'``: resample the majority class, +          (iii) ``'not minority'``: resample all classes apart from the minority +          class, (iv) ``'all'``: resample all classes, and (v) ``'auto'``: +          corresponds to ``'all'`` for over-sampling methods and ``'not +          minority'`` for under-sampling methods. The classes targeted will be +          over-sampled or under-sampled to achieve an equal number of samples +          with the majority or minority class. +        - If ``dict``, the keys correspond to the targeted classes. The values +          correspond to the desired number of samples. +        - If callable, a function taking ``y`` and returning a ``dict``. The keys +          correspond to the targeted classes. The values correspond to the +          desired number of samples. random_state : int, RandomState instance or None, optional (default=None) -        If int, random_state is the seed used by the random number generator; -        If RandomState instance, random_state is the random number generator; -        If None, the random number generator is the RandomState instance used -        by np.random. +        If int, ``random_state`` is the seed used by the random number +        generator; If ``RandomState`` instance, random_state is the random +        number generator; If ``None``, the random number generator is the +        ``RandomState`` instance used by ``np.random``. smote : object, optional (default=SMOTE()) -        The SMOTE object to use. If not given, a SMOTE object with default -        parameters will be given. +        The :class:`imblearn.over_sampling.SMOTE` object to use. If not given, +        a :class:`imblearn.over_sampling.SMOTE` object with default parameters +        will be given. tomek : object, optional (default=Tomek()) -        The Tomek object to use. If not given, a Tomek object with default -        parameters will be given. +        The :class:`imblearn.under_sampling.TomekLinks` object to use. If not given, +        a :class:`imblearn.under_sampling.TomekLinks` object with default parameters +        will be given. k : int, optional (default=None) Number of nearest neighbours to used to construct synthetic samples. -        NOTE: `k` is deprecated from 0.2 and will be replaced in 0.4 -        Give directly a SMOTE object. +        .. deprecated:: 0.2 +           ``k`` is deprecated from 0.2 and will be replaced in 0.4. +           Give directly a :class:`imblearn.over_sampling.SMOTE` object. m : int, optional (default=None) Number of nearest neighbours to use to determine if a minority sample is in danger. -        NOTE: `m` is deprecated from 0.2 and will be replaced in 0.4 -        Give directly a SMOTE object. +        .. deprecated:: 0.2 +           ``m`` is deprecated from 0.2 and will be replaced in 0.4. +           Give directly a :class:`imblearn.over_sampling.SMOTE` object. out_step : float, optional (default=None) Step size when extrapolating. -        NOTE: `out_step` is deprecated from 0.2 and will be replaced in 0.4 -        Give directly a SMOTE object. +        .. deprecated:: 0.2 +           ``out_step`` is deprecated from 0.2 and will be replaced in 0.4. +           Give directly a :class:`imblearn.over_sampling.SMOTE` object.
kind_smote : str, optional (default=None) The type of SMOTE algorithm to use one of the following -        options: 'regular', 'borderline1', 'borderline2', 'svm'. +        options: ``'regular'``, ``'borderline1'``, ``'borderline2'``, +        ``'svm'``. -        NOTE: `kind_smote` is deprecated from 0.2 and will be replaced in 0.4 -        Give directly a SMOTE object. +        .. deprecated:: 0.2 +           ``kind_smote`` is deprecated from 0.2 and will be replaced in 0.4. +           Give directly a :class:`imblearn.over_sampling.SMOTE` object. n_jobs : int, optional (default=None) The number of threads to open if possible. -        NOTE: `n_jobs` is deprecated from 0.2 and will be replaced in 0.4 -        Give directly a SMOTE object. - -    Attributes -    ---------- -    min_c_ : str or int -        The identifier of the minority class. - -    max_c_ : str or int -        The identifier of the majority class. - -    stats_c_ : dict of str/int : int -        A dictionary in which the number of occurences of each class is -        reported. - -    X_shape_ : tuple of int -        Shape of the data `X` during fitting. +        .. deprecated:: 0.2 +           ``n_jobs`` is deprecated from 0.2 and will be replaced in 0.4. +           Give directly a :class:`imblearn.over_sampling.SMOTE` object. Notes ----- The method is presented in [1]_. -    This class does not support mutli-class. +    Supports multi-class resampling. Examples -------- @@ -102,7 +110,7 @@ class SMOTETomek(BaseBinarySampler): >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> from imblearn.combine import \ -        SMOTETomek # doctest: +NORMALIZE_WHITESPACE +SMOTETomek # doctest: +NORMALIZE_WHITESPACE >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) @@ -130,8 +138,9 @@ def __init__(self, out_step=None, kind_smote=None, n_jobs=None): -        super(SMOTETomek, self).__init__( -            ratio=ratio, random_state=random_state) +        super(SMOTETomek, self).__init__() +        self.ratio = ratio +        self.random_state = random_state self.smote = smote self.tomek = tomek self.k = k @@ -139,6 +148,7 @@ def __init__(self, self.out_step = out_step self.kind_smote = kind_smote self.n_jobs = n_jobs +        self.logger = logging.getLogger(__name__) def _validate_estimator(self): "Private function to validate SMOTE and ENN objects" @@ -191,8 +201,9 @@ def _validate_estimator(self): warnings.warn('Parameters initialization will be replaced in' ' version 0.4. Use a ENN object instead.', DeprecationWarning) -        self.tomek_ = TomekLinks( -            random_state=self.random_state, n_jobs=self.n_jobs) +        self.tomek_ = TomekLinks(ratio='all', +                                 random_state=self.random_state, +                                 n_jobs=self.n_jobs) # If an object was given, affect elif self.tomek is not None: if isinstance(self.tomek, TomekLinks): @@ -202,7 +213,8 @@ def _validate_estimator(self): 'Got {} instead.'.format(type(self.tomek))) # Otherwise create a default TomekLinks else: -        self.tomek_ = TomekLinks(random_state=self.random_state) +        self.tomek_ = TomekLinks(ratio='all', +                                 random_state=self.random_state) def fit(self, X, y): """Find the classes statistics before to perform sampling. @@ -221,13 +233,10 @@ def fit(self, X, y): Return self.
""" - - super(SMOTETomek, self).fit(X, y) - - self._validate_estimator() - - # Fit using SMOTE - self.smote_.fit(X, y) + X, y = check_X_y(X, y) + y = check_target_type(y) + self.ratio_ = self.ratio + self.X_hash_, self.y_hash_ = hash_X_y(X, y) return self @@ -251,9 +260,7 @@ def _sample(self, X, y): The corresponding label of `X_resampled` """ + self._validate_estimator() - # Transform using SMOTE - X, y = self.smote_.sample(X, y) - - # Fit and transform using ENN - return self.tomek_.fit_sample(X, y) + X_res, y_res = self.smote_.fit_sample(X, y) + return self.tomek_.fit_sample(X_res, y_res) diff --git a/imblearn/combine/tests/test_smote_enn.py b/imblearn/combine/tests/test_smote_enn.py index e79e1c6c6..909c13776 100644 --- a/imblearn/combine/tests/test_smote_enn.py +++ b/imblearn/combine/tests/test_smote_enn.py @@ -1,4 +1,8 @@ """Test the module SMOTE ENN.""" +# Authors: Guillaume Lemaitre +# Christos Aridas +# License: MIT + from __future__ import print_function import numpy as np @@ -9,7 +13,6 @@ from imblearn.under_sampling import EditedNearestNeighbours from imblearn.over_sampling import SMOTE -# Generate a global dataset to use RND_SEED = 0 X = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [1.25192108, -0.22367336], [0.53366841, -0.30312976], @@ -26,103 +29,80 @@ def test_sample_regular(): - # Create the object smote = SMOTEENN(random_state=RND_SEED) - # Fit the data - smote.fit(X, Y) - X_resampled, y_resampled = smote.fit_sample(X, Y) - X_gt = np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336], - [0.53366841, -0.30312976], [1.52091956, -0.49283504], - [0.88407872, 0.35454207], [1.31301027, -0.92648734], - [-0.41635887, -0.38299653], [1.70580611, -0.11219234], - [0.29307743, -0.14670439], [0.84976473, -0.15570176], - [0.61319159, -0.11571668], [0.66052536, -0.28246517], - [-0.28162401, -2.10400981], [0.83680821, 1.72827342], + X_gt = np.array([[1.52091956, -0.49283504], + [0.84976473, -0.15570176], + [0.61319159, -0.11571667], + [0.66052536, -0.28246518], + [-0.28162401, -2.10400981], + [0.83680821, 1.72827342], [0.08711622, 0.93259929]]) - y_gt = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1]) + y_gt = np.array([0, 0, 0, 0, 1, 1, 1]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) def test_sample_regular_half(): - # Create the object ratio = 0.8 smote = SMOTEENN(ratio=ratio, random_state=RND_SEED) - # Fit the data - smote.fit(X, Y) - X_resampled, y_resampled = smote.fit_sample(X, Y) - X_gt = np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336], - [0.53366841, -0.30312976], [1.52091956, -0.49283504], - [0.88407872, 0.35454207], [1.31301027, -0.92648734], - [-0.41635887, -0.38299653], [1.70580611, -0.11219234], - [0.36784496, -0.1953161], [-0.28162401, -2.10400981], - [0.83680821, 1.72827342], [0.08711622, 0.93259929]]) - y_gt = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1]) + X_gt = np.array([[1.52091956, -0.49283504], + [-0.28162401, -2.10400981], + [0.83680821, 1.72827342], + [0.08711622, 0.93259929]]) + y_gt = np.array([0, 1, 1, 1]) assert_allclose(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) def test_validate_estimator_init(): - # Create a SMOTE and Tomek object smote = SMOTE(random_state=RND_SEED) - enn = EditedNearestNeighbours(random_state=RND_SEED) - + enn = EditedNearestNeighbours(random_state=RND_SEED, ratio='all') smt = SMOTEENN(smote=smote, enn=enn, random_state=RND_SEED) - X_resampled, y_resampled = smt.fit_sample(X, Y) - - X_gt = np.array([[0.11622591, -0.0317206], [1.25192108, 
-0.22367336], - [0.53366841, -0.30312976], [1.52091956, -0.49283504], - [0.88407872, 0.35454207], [1.31301027, -0.92648734], - [-0.41635887, -0.38299653], [1.70580611, -0.11219234], - [0.29307743, -0.14670439], [0.84976473, -0.15570176], - [0.61319159, -0.11571668], [0.66052536, -0.28246517], - [-0.28162401, -2.10400981], [0.83680821, 1.72827342], + X_gt = np.array([[1.52091956, -0.49283504], + [0.84976473, -0.15570176], + [0.61319159, -0.11571667], + [0.66052536, -0.28246518], + [-0.28162401, -2.10400981], + [0.83680821, 1.72827342], [0.08711622, 0.93259929]]) - y_gt = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1]) + y_gt = np.array([0, 0, 0, 0, 1, 1, 1]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) def test_validate_estimator_default(): smt = SMOTEENN(random_state=RND_SEED) - X_resampled, y_resampled = smt.fit_sample(X, Y) - - X_gt = np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336], - [0.53366841, -0.30312976], [1.52091956, -0.49283504], - [0.88407872, 0.35454207], [1.31301027, -0.92648734], - [-0.41635887, -0.38299653], [1.70580611, -0.11219234], - [0.29307743, -0.14670439], [0.84976473, -0.15570176], - [0.61319159, -0.11571668], [0.66052536, -0.28246517], - [-0.28162401, -2.10400981], [0.83680821, 1.72827342], + X_gt = np.array([[1.52091956, -0.49283504], + [0.84976473, -0.15570176], + [0.61319159, -0.11571667], + [0.66052536, -0.28246518], + [-0.28162401, -2.10400981], + [0.83680821, 1.72827342], [0.08711622, 0.93259929]]) - y_gt = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1]) - + y_gt = np.array([0, 0, 0, 0, 1, 1, 1]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) def test_validate_estimator_deprecation(): - X_gt = np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336], - [0.53366841, -0.30312976], [1.52091956, -0.49283504], - [0.88407872, 0.35454207], [1.31301027, -0.92648734], - [-0.41635887, -0.38299653], [1.70580611, -0.11219234], - [0.29307743, -0.14670439], [0.84976473, -0.15570176], - [0.61319159, -0.11571668], [0.66052536, -0.28246517], - [-0.28162401, -2.10400981], [0.83680821, 1.72827342], - [0.08711622, 0.93259929]]) - y_gt = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1]) - smt = SMOTEENN(random_state=RND_SEED, n_jobs=-1) X_resampled, y_resampled = smt.fit_sample(X, Y) + X_gt = np.array([[1.52091956, -0.49283504], + [0.84976473, -0.15570176], + [0.61319159, -0.11571667], + [0.66052536, -0.28246518], + [-0.28162401, -2.10400981], + [0.83680821, 1.72827342], + [0.08711622, 0.93259929]]) + y_gt = np.array([0, 0, 0, 0, 1, 1, 1]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) - smt = SMOTEENN(random_state=RND_SEED, k=5) X_resampled, y_resampled = smt.fit_sample(X, Y) assert_allclose(X_resampled, X_gt, rtol=R_TOL) @@ -130,13 +110,11 @@ def test_validate_estimator_deprecation(): def test_error_wrong_object(): - # Create a SMOTE and Tomek object smote = 'rnd' enn = 'rnd' - smt = SMOTEENN(smote=smote, random_state=RND_SEED) assert_raises_regex(ValueError, "smote needs to be a SMOTE", - smt.fit, X, Y) + smt.fit_sample, X, Y) smt = SMOTEENN(enn=enn, random_state=RND_SEED) assert_raises_regex(ValueError, "enn needs to be an ", - smt.fit, X, Y) + smt.fit_sample, X, Y) diff --git a/imblearn/combine/tests/test_smote_tomek.py b/imblearn/combine/tests/test_smote_tomek.py index 17d044cf3..ba0f3a47d 100644 --- a/imblearn/combine/tests/test_smote_tomek.py +++ b/imblearn/combine/tests/test_smote_tomek.py @@ -1,4 +1,8 @@ """Test the 
module SMOTE ENN.""" +# Authors: Guillaume Lemaitre +# Christos Aridas +# License: MIT + from __future__ import print_function import numpy as np @@ -9,7 +13,6 @@ from imblearn.over_sampling import SMOTE from imblearn.under_sampling import TomekLinks -# Generate a global dataset to use RND_SEED = 0 X = np.array([[0.20622591, 0.0582794], [0.68481731, 0.51935141], [1.34192108, -0.13367336], [0.62366841, -0.21312976], @@ -26,117 +29,123 @@ def test_sample_regular(): - # Create the object smote = SMOTETomek(random_state=RND_SEED) - # Fit the data - smote.fit(X, Y) - X_resampled, y_resampled = smote.fit_sample(X, Y) - - X_gt = np.array([[0.20622591, 0.0582794], [0.68481731, 0.51935141], - [1.34192108, -0.13367336], [0.62366841, -0.21312976], - [1.61091956, -0.40283504], [-0.37162401, -2.19400981], - [0.74680821, 1.63827342], [0.61472253, -0.82309052], - [0.19893132, -0.47761769], [0.97407872, 0.44454207], - [1.40301027, -0.83648734], [-1.20515198, -1.02689695], - [-0.23374509, 0.18370049], [-0.32635887, -0.29299653], - [-0.00288378, 0.84259929], [1.79580611, -0.02219234], - [0.38307743, -0.05670439], [0.93976473, -0.06570176], - [0.70319159, -0.02571668], [0.75052536, -0.19246517]]) - y_gt = np.array( - [0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0]) + X_gt = np.array([[0.68481731, 0.51935141], + [1.34192108, -0.13367336], + [0.62366841, -0.21312976], + [1.61091956, -0.40283504], + [-0.37162401, -2.19400981], + [0.74680821, 1.63827342], + [0.61472253, -0.82309052], + [0.19893132, -0.47761769], + [1.40301027, -0.83648734], + [-1.20515198, -1.02689695], + [-0.23374509, 0.18370049], + [-0.00288378, 0.84259929], + [1.79580611, -0.02219234], + [0.38307743, -0.05670439], + [0.70319159, -0.02571667], + [0.75052536, -0.19246518]]) + y_gt = np.array([1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) def test_sample_regular_half(): - # Create the object ratio = 0.8 smote = SMOTETomek(ratio=ratio, random_state=RND_SEED) - # Fit the data - smote.fit(X, Y) - X_resampled, y_resampled = smote.fit_sample(X, Y) - - X_gt = np.array([[0.20622591, 0.0582794], [0.68481731, 0.51935141], - [1.34192108, -0.13367336], [0.62366841, -0.21312976], - [1.61091956, -0.40283504], [-0.37162401, -2.19400981], - [0.74680821, 1.63827342], [0.61472253, -0.82309052], - [0.19893132, -0.47761769], [0.97407872, 0.44454207], - [1.40301027, -0.83648734], [-1.20515198, -1.02689695], - [-0.23374509, 0.18370049], [-0.32635887, -0.29299653], - [-0.00288378, 0.84259929], [1.79580611, -0.02219234], + X_gt = np.array([[0.68481731, 0.51935141], + [0.62366841, -0.21312976], + [1.61091956, -0.40283504], + [-0.37162401, -2.19400981], + [0.74680821, 1.63827342], + [0.61472253, -0.82309052], + [0.19893132, -0.47761769], + [1.40301027, -0.83648734], + [-1.20515198, -1.02689695], + [-0.23374509, 0.18370049], + [-0.00288378, 0.84259929], + [1.79580611, -0.02219234], [0.45784496, -0.1053161]]) - y_gt = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0]) + y_gt = np.array([1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) def test_validate_estimator_init(): - # Create a SMOTE and Tomek object smote = SMOTE(random_state=RND_SEED) - tomek = TomekLinks(random_state=RND_SEED) - + tomek = TomekLinks(random_state=RND_SEED, ratio='all') smt = SMOTETomek(smote=smote, tomek=tomek, random_state=RND_SEED) - X_resampled, y_resampled = smt.fit_sample(X, Y) - - X_gt = 
np.array([[0.20622591, 0.0582794], [0.68481731, 0.51935141], - [1.34192108, -0.13367336], [0.62366841, -0.21312976], - [1.61091956, -0.40283504], [-0.37162401, -2.19400981], - [0.74680821, 1.63827342], [0.61472253, -0.82309052], - [0.19893132, -0.47761769], [0.97407872, 0.44454207], - [1.40301027, -0.83648734], [-1.20515198, -1.02689695], - [-0.23374509, 0.18370049], [-0.32635887, -0.29299653], - [-0.00288378, 0.84259929], [1.79580611, -0.02219234], - [0.38307743, -0.05670439], [0.93976473, -0.06570176], - [0.70319159, -0.02571668], [0.75052536, -0.19246517]]) - y_gt = np.array( - [0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0]) + X_gt = np.array([[0.68481731, 0.51935141], + [1.34192108, -0.13367336], + [0.62366841, -0.21312976], + [1.61091956, -0.40283504], + [-0.37162401, -2.19400981], + [0.74680821, 1.63827342], + [0.61472253, -0.82309052], + [0.19893132, -0.47761769], + [1.40301027, -0.83648734], + [-1.20515198, -1.02689695], + [-0.23374509, 0.18370049], + [-0.00288378, 0.84259929], + [1.79580611, -0.02219234], + [0.38307743, -0.05670439], + [0.70319159, -0.02571667], + [0.75052536, -0.19246518]]) + y_gt = np.array([1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) def test_validate_estimator_default(): smt = SMOTETomek(random_state=RND_SEED) - X_resampled, y_resampled = smt.fit_sample(X, Y) - - X_gt = np.array([[0.20622591, 0.0582794], [0.68481731, 0.51935141], - [1.34192108, -0.13367336], [0.62366841, -0.21312976], - [1.61091956, -0.40283504], [-0.37162401, -2.19400981], - [0.74680821, 1.63827342], [0.61472253, -0.82309052], - [0.19893132, -0.47761769], [0.97407872, 0.44454207], - [1.40301027, -0.83648734], [-1.20515198, -1.02689695], - [-0.23374509, 0.18370049], [-0.32635887, -0.29299653], - [-0.00288378, 0.84259929], [1.79580611, -0.02219234], - [0.38307743, -0.05670439], [0.93976473, -0.06570176], - [0.70319159, -0.02571668], [0.75052536, -0.19246517]]) - y_gt = np.array( - [0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0]) + X_gt = np.array([[0.68481731, 0.51935141], + [1.34192108, -0.13367336], + [0.62366841, -0.21312976], + [1.61091956, -0.40283504], + [-0.37162401, -2.19400981], + [0.74680821, 1.63827342], + [0.61472253, -0.82309052], + [0.19893132, -0.47761769], + [1.40301027, -0.83648734], + [-1.20515198, -1.02689695], + [-0.23374509, 0.18370049], + [-0.00288378, 0.84259929], + [1.79580611, -0.02219234], + [0.38307743, -0.05670439], + [0.70319159, -0.02571667], + [0.75052536, -0.19246518]]) + y_gt = np.array([1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) def test_validate_estimator_deprecation(): - X_gt = np.array([[0.20622591, 0.0582794], [0.68481731, 0.51935141], - [1.34192108, -0.13367336], [0.62366841, -0.21312976], - [1.61091956, -0.40283504], [-0.37162401, -2.19400981], - [0.74680821, 1.63827342], [0.61472253, -0.82309052], - [0.19893132, -0.47761769], [0.97407872, 0.44454207], - [1.40301027, -0.83648734], [-1.20515198, -1.02689695], - [-0.23374509, 0.18370049], [-0.32635887, -0.29299653], - [-0.00288378, 0.84259929], [1.79580611, -0.02219234], - [0.38307743, -0.05670439], [0.93976473, -0.06570176], - [0.70319159, -0.02571668], [0.75052536, -0.19246517]]) - y_gt = np.array( - [0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0]) - smt = SMOTETomek(random_state=RND_SEED, n_jobs=-1) X_resampled, y_resampled = smt.fit_sample(X, Y) + X_gt = np.array([[0.68481731, 
0.51935141], + [1.34192108, -0.13367336], + [0.62366841, -0.21312976], + [1.61091956, -0.40283504], + [-0.37162401, -2.19400981], + [0.74680821, 1.63827342], + [0.61472253, -0.82309052], + [0.19893132, -0.47761769], + [1.40301027, -0.83648734], + [-1.20515198, -1.02689695], + [-0.23374509, 0.18370049], + [-0.00288378, 0.84259929], + [1.79580611, -0.02219234], + [0.38307743, -0.05670439], + [0.70319159, -0.02571667], + [0.75052536, -0.19246518]]) + y_gt = np.array([1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0]) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) - smt = SMOTETomek(random_state=RND_SEED, k=5) X_resampled, y_resampled = smt.fit_sample(X, Y) assert_allclose(X_resampled, X_gt, rtol=R_TOL) @@ -144,13 +153,11 @@ def test_validate_estimator_deprecation(): def test_error_wrong_object(): - # Create a SMOTE and Tomek object smote = 'rnd' tomek = 'rnd' - smt = SMOTETomek(smote=smote, random_state=RND_SEED) assert_raises_regex(ValueError, "smote needs to be a SMOTE", - smt.fit, X, Y) + smt.fit_sample, X, Y) smt = SMOTETomek(tomek=tomek, random_state=RND_SEED) assert_raises_regex(ValueError, "tomek needs to be a TomekLinks", - smt.fit, X, Y) + smt.fit_sample, X, Y) diff --git a/imblearn/datasets/tests/test_make_imbalance.py b/imblearn/datasets/tests/test_make_imbalance.py index ad3e7b474..78569326e 100644 --- a/imblearn/datasets/tests/test_make_imbalance.py +++ b/imblearn/datasets/tests/test_make_imbalance.py @@ -1,4 +1,8 @@ """Test the module easy ensemble.""" +# Authors: Guillaume Lemaitre +# Christos Aridas +# License: MIT + from __future__ import print_function diff --git a/imblearn/datasets/tests/test_zenodo.py b/imblearn/datasets/tests/test_zenodo.py index 2fb938d19..2ebd928d1 100644 --- a/imblearn/datasets/tests/test_zenodo.py +++ b/imblearn/datasets/tests/test_zenodo.py @@ -2,6 +2,10 @@ Skipped if datasets is not already downloaded to data_home. """ +# Authors: Guillaume Lemaitre +# Christos Aridas +# License: MIT + from imblearn.datasets import fetch_datasets from sklearn.utils.testing import (assert_equal, assert_allclose, assert_raises_regex, SkipTest) diff --git a/imblearn/ensemble/balance_cascade.py b/imblearn/ensemble/balance_cascade.py index 74bfd3109..17bf08123 100644 --- a/imblearn/ensemble/balance_cascade.py +++ b/imblearn/ensemble/balance_cascade.py @@ -4,22 +4,23 @@ # Christos Aridas # License: MIT -from __future__ import print_function - import warnings +from collections import Counter + import numpy as np from sklearn.base import ClassifierMixin from sklearn.neighbors import KNeighborsClassifier from sklearn.utils import check_random_state -from sklearn.utils.validation import has_fit_parameter from sklearn.externals.six import string_types +from sklearn.model_selection import cross_val_predict -from ..base import BaseBinarySampler +from .base import BaseEnsembleSampler +from ..utils import check_ratio -class BalanceCascade(BaseBinarySampler): +class BalanceCascade(BaseEnsembleSampler): """Create an ensemble of balanced sets by iteratively under-sampling the imbalanced dataset using an estimator. @@ -28,21 +29,32 @@ class BalanceCascade(BaseBinarySampler): Parameters ---------- - ratio : str or float, optional (default='auto') - If 'auto', the ratio will be defined automatically to balance - the dataset. Otherwise, the ratio is defined as the number - of samples in the minority class over the the number of samples - in the majority class. 
+ ratio : str, dict, or callable, optional (default='auto') + Ratio to use for resampling the data set. + + - If ``str``, has to be one of: (i) ``'minority'``: resample the + minority class; (ii) ``'majority'``: resample the majority class, + (iii) ``'not minority'``: resample all classes apart from the minority + class, (iv) ``'all'``: resample all classes, and (v) ``'auto'``: + corresponds to ``'all'`` for over-sampling methods and ``'not + minority'`` for under-sampling methods. The classes targeted will be + over-sampled or under-sampled to achieve an equal number of samples + with the majority or minority class. + - If ``dict``, the keys correspond to the targeted classes. The values + correspond to the desired number of samples. + - If callable, a function taking ``y`` and returning a ``dict``. The keys + correspond to the targeted classes. The values correspond to the + desired number of samples. return_indices : bool, optional (default=True) Whether or not to return the indices of the samples randomly selected from the majority class. random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by np.random. + If int, ``random_state`` is the seed used by the random number + generator; If ``RandomState`` instance, random_state is the random + number generator; If ``None``, the random number generator is the + ``RandomState`` instance used by ``np.random``. n_max_subset : int or None, optional (default=None) Maximum number of subsets to generate. By default, all data from @@ -51,16 +63,17 @@ class BalanceCascade(BaseBinarySampler): classifier : str, optional (default=None) The classifier that will be selected to confront the prediction - with the real labels. The choices are the following: 'knn', - 'decision-tree', 'random-forest', 'adaboost', 'gradient-boosting' - and 'linear-svm'. + with the real labels. The choices are the following: ``'knn'``, + ``'decision-tree'``, ``'random-forest'``, ``'adaboost'``, + ``'gradient-boosting'``, and ``'linear-svm'``. - NOTE: `classifier` is deprecated from 0.2 and will be replaced in 0.4. - Use `estimator` instead. + .. deprecated:: 0.2 + ``classifier`` is deprecated from 0.2 and will be replaced in 0.4. + Use ``estimator`` instead. estimator : object, optional (default=KNeighborsClassifier()) - An estimator inherited from `sklearn.base.ClassifierMixin` and having - an attribute `predict_proba`. + An estimator inheriting from :class:`sklearn.base.ClassifierMixin` and + having an attribute :func:`predict_proba`. bootstrap : bool, optional (default=True) Whether to bootstrap the data before each iteration. @@ -68,30 +81,16 @@ **kwargs : keywords The parameters associated with the classifier provided. - NOTE: `**kwargs` has been deprecated from 0.2 and will be replaced in - 0.4. Use `estimator` object instead to pass parameters associated - to an estimator. - - Attributes - ---------- - min_c_ : str or int - The identifier of the minority class. - - max_c_ : str or int - The identifier of the majority class. - - stats_c_ : dict of str/int : int - A dictionary in which the number of occurences of each class is - reported. - - X_shape_ : tuple of int - Shape of the data `X` during fitting. + .. deprecated:: 0.2 + ``**kwargs`` has been deprecated from 0.2 and will be replaced in + 0.4.
Use ``estimator`` object instead to pass parameters associated + to an estimator. Notes ----- The method is described in [1]_. - This class does not support multi-class. + Supports multi-class resampling. Examples -------- >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> from imblearn.ensemble import \ - BalanceCascade # doctest: +NORMALIZE_WHITESPACE +BalanceCascade # doctest: +NORMALIZE_WHITESPACE >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) @@ -107,8 +106,9 @@ Original dataset shape Counter({1: 900, 0: 100}) >>> bc = BalanceCascade(random_state=42) >>> X_res, y_res = bc.fit_sample(X, y) - >>> print('Resampled dataset shape {}'.format(Counter(y_res[0]))) - Resampled dataset shape Counter({0: 100, 1: 100}) + >>> print('Resampled dataset shape {}'.format(Counter(y_res[0]))) \ + # doctest: +ELLIPSIS + Resampled dataset shape Counter({...}) References ---------- @@ -126,17 +126,36 @@ def __init__(self, n_max_subset=None, classifier=None, estimator=None, - bootstrap=True, **kwargs): - super(BalanceCascade, self).__init__( - ratio=ratio, random_state=random_state) + super(BalanceCascade, self).__init__(ratio=ratio, + random_state=random_state) self.return_indices = return_indices self.classifier = classifier self.estimator = estimator self.n_max_subset = n_max_subset - self.bootstrap = bootstrap self.kwargs = kwargs + def fit(self, X, y): + """Find the classes statistics before performing sampling. + + Parameters + ---------- + X : ndarray, shape (n_samples, n_features) + Matrix containing the data which have to be sampled. + + y : ndarray, shape (n_samples, ) + Corresponding label for each sample in X. + + Returns + ------- + self : object, + Return self. + + """ + super(BalanceCascade, self).fit(X, y) + self.ratio_ = check_ratio(self.ratio, y, 'under-sampling') + return self + def _validate_estimator(self): """Private function to create the classifier""" @@ -189,30 +208,6 @@ def _validate_estimator(self): self.logger.debug(self.estimator_) - def fit(self, X, y): - """Find the classes statistics before to perform sampling. - - Parameters - ---------- - X : ndarray, shape (n_samples, n_features) - Matrix containing the data which have to be sampled. - - y : ndarray, shape (n_samples, ) - Corresponding label for each sample in X. - - Returns - ------- - self : object, - Return self. - - """ - - super(BalanceCascade, self).fit(X, y) - - self._validate_estimator() - - return self - def _sample(self, X, y): """Resample the dataset. @@ -237,183 +232,97 @@ def _sample(self, X, y): containing the which samples have been selected.
""" + self._validate_estimator() random_state = check_random_state(self.random_state) - support_sample_weight = has_fit_parameter(self.estimator_, - "sample_weight") + # array to know which samples are available to be taken + samples_mask = np.ones(y.shape, dtype=bool) + + # where the different set will be stored X_resampled = [] y_resampled = [] - if self.return_indices: - idx_under = [] - - # Start with the minority class - X_min = X[y == self.min_c_] - y_min = y[y == self.min_c_] - - # Keep the indices of the minority class somewhere if we need to - # return them later - if self.return_indices: - idx_min = np.flatnonzero(y == self.min_c_) - idx_maj = np.flatnonzero(y == self.maj_c_) + idx_under = [] - # Condition to initiliase before the search - b_subset_search = True n_subsets = 0 - # Get the initial number of samples to select in the majority class - if self.ratio == 'auto': - num_samples = self.stats_c_[self.min_c_] - else: - num_samples = int(self.stats_c_[self.min_c_] / self.ratio) - # Create the array characterising the array containing the majority - # class - N_x = X[y != self.min_c_] - N_y = y[y != self.min_c_] - b_sel_N = np.array([True] * N_y.size) - idx_mis_class = np.array([]) - - # Loop to create the different subsets + b_subset_search = True while b_subset_search: - self.logger.debug('Search boolean: %s', b_subset_search) - # Generate an appropriate number of index to extract - # from the majority class depending of the false classification - # rate of the previous iteration - idx_sel_from_maj = random_state.choice( - np.flatnonzero(b_sel_N), size=num_samples, replace=False) - idx_sel_from_maj = np.concatenate( - (idx_mis_class, idx_sel_from_maj), axis=0).astype(int) - - # Mark these indexes as not being considered for next sampling - b_sel_N[idx_sel_from_maj] = False - - # For now, we will train and classify on the same data - # Let see if we should find another solution. 
Anyway, - random stuff are still random stuff - x_data = np.concatenate((X_min, N_x[idx_sel_from_maj, :]), axis=0) - y_data = np.concatenate((y_min, N_y[idx_sel_from_maj]), axis=0) - - # Push these data into a new subset - X_resampled.append(x_data) - y_resampled.append(y_data) - if self.return_indices: - idx_under.append( - np.concatenate( - (idx_min, idx_maj[idx_sel_from_maj]), axis=0)) - - # Get the indices of interest - if self.bootstrap: - indices = random_state.randint(0, y_data.size, y_data.size) - else: - indices = np.arange(y_data.size) - - # Draw samples, using sample weights, and then fit - if support_sample_weight: - self.logger.debug('Sample-weight is supported') - curr_sample_weight = np.ones((y_data.size, ), dtype=np.float64) - - if self.bootstrap: - self.logger.debug('Go for a bootstrap') - sample_counts = np.bincount(indices, minlength=y_data.size) - curr_sample_weight *= sample_counts + target_stats = Counter(y[samples_mask]) + # build the data set to be classified + X_subset = np.empty((0, X.shape[1]), dtype=X.dtype) + y_subset = np.empty((0, ), dtype=y.dtype) + # store the indices of the data to under-sample + index_under_sample = np.empty((0, ), dtype=y.dtype) + # data from the classes which are not resampled, kept through each round + X_constant = np.empty((0, X.shape[1]), dtype=X.dtype) + y_constant = np.empty((0, ), dtype=y.dtype) + index_constant = np.empty((0, ), dtype=y.dtype) + for target_class in target_stats.keys(): + if target_class in self.ratio_.keys(): + n_samples = self.ratio_[target_class] + # extract the data of interest for this round from the + # current class + index_class = np.flatnonzero(y == target_class) + index_class_interest = index_class[samples_mask[ + y == target_class]] + X_class = X[index_class_interest] + y_class = y[index_class_interest] + # randomly select the desired samples + index_target_class = random_state.choice( + range(y_class.size), size=n_samples, replace=False) + X_subset = np.concatenate((X_subset, + X_class[index_target_class]), + axis=0) + y_subset = np.concatenate((y_subset, + y_class[index_target_class]), + axis=0) + # keep the indices of the selected data + index_under_sample = np.concatenate( + (index_under_sample, + index_class_interest[index_target_class]), + axis=0) else: - self.logger.debug('No bootstrap') - mask = np.zeros(y_data.size, dtype=np.bool) - mask[indices] = True - not_indices_mask = ~mask - curr_sample_weight[not_indices_mask] = 0 - - self.estimator_.fit(x_data, - y_data, - sample_weight=curr_sample_weight) - - # Draw samples, using a mask, and then fit - else: - self.logger.debug('Sample-weight is not supported') - self.estimator_.fit(x_data[indices], y_data[indices]) - - # Predict using only the majority class - pred_label = self.estimator_.predict(N_x[idx_sel_from_maj, :]) - - # Basically let's find which sample have to be retained for the - # next round - - # Find the misclassified index to keep them for the next round - idx_mis_class = idx_sel_from_maj[np.flatnonzero(pred_label != N_y[ idx_sel_from_maj])] - self.logger.debug('Elements misclassified: %s', idx_mis_class) - - # Count how many random element will be selected - if self.ratio == 'auto': - num_samples = self.stats_c_[self.min_c_] - else: - num_samples = int(self.stats_c_[self.min_c_] / self.ratio) - num_samples -= idx_mis_class.size - - self.logger.debug('Creation of the subset #%s', n_subsets) - - # We found a new subset, increase the counter + X_constant = np.concatenate((X_constant, + X[y == target_class]), + axis=0) + y_constant = np.concatenate((y_constant, + y[y ==
target_class]), + axis=0) + index_constant = np.concatenate( + (index_constant, + np.flatnonzero(y == target_class)), + axis=0) + + # store the subset created n_subsets += 1 - - # Check if we have to make an early stopping + X_resampled.append(np.concatenate((X_subset, X_constant), + axis=0)) + y_resampled.append(np.concatenate((y_subset, y_constant), + axis=0)) + idx_under.append(np.concatenate((index_under_sample, + index_constant), + axis=0)) + + # fit and predict using cross-validation + pred = cross_val_predict(self.estimator_, + np.concatenate((X_subset, X_constant), + axis=0), + np.concatenate((y_subset, y_constant), + axis=0)) + # extract the predictions for the targeted classes only + pred_target = pred[:y_subset.size] + index_classified = index_under_sample[pred_target == y_subset] + samples_mask[index_classified] = False + + # check the stopping criterion if self.n_max_subset is not None: - self.logger.debug('Current number of subset %s', n_subsets) - if n_subsets == (self.n_max_subset - 1): + if n_subsets == self.n_max_subset: + b_subset_search = False + # check that there are enough samples for another round + target_stats = Counter(y[samples_mask]) + for target_class in self.ratio_.keys(): + if target_stats[target_class] < self.ratio_[target_class]: b_subset_search = False - # Select the remaining data - idx_sel_from_maj = np.flatnonzero(b_sel_N) - idx_sel_from_maj = np.concatenate( - (idx_mis_class, idx_sel_from_maj), axis=0).astype(int) - # Select the final batch - x_data = np.concatenate( - (X_min, N_x[idx_sel_from_maj, :]), axis=0) - y_data = np.concatenate( - (y_min, N_y[idx_sel_from_maj]), axis=0) - # Push these data into a new subset - X_resampled.append(x_data) - y_resampled.append(y_data) - if self.return_indices: - idx_under.append( - np.concatenate( - (idx_min, idx_maj[idx_sel_from_maj]), axis=0)) - - self.logger.debug('Creation of the subset #%s', n_subsets) - - # We found a new subset, increase the counter - n_subsets += 1 - - self.logger.debug('The number of subset reached is' - ' maximum.') - break - # Specific case with n_max_subset = 1 - elif n_subsets > (self.n_max_subset - 1): - break - - # Also check that we will have enough sample to extract at the - # next round - if num_samples > np.count_nonzero(b_sel_N): - b_subset_search = False - # Select the remaining data - idx_sel_from_maj = np.flatnonzero(b_sel_N) - idx_sel_from_maj = np.concatenate( - (idx_mis_class, idx_sel_from_maj), axis=0).astype(int) - # Select the final batch - x_data = np.concatenate( - (X_min, N_x[idx_sel_from_maj, :]), axis=0) - y_data = np.concatenate((y_min, N_y[idx_sel_from_maj]), axis=0) - # Push these data into a new subset - X_resampled.append(x_data) - y_resampled.append(y_data) - if self.return_indices: - idx_under.append( - np.concatenate( - (idx_min, idx_maj[idx_sel_from_maj]), axis=0)) - self.logger.debug('Creation of the subset #%s', n_subsets) - - # We found a new subset, increase the counter - n_subsets += 1 - - self.logger.debug('Not enough samples to continue creating' - ' subsets.') if self.return_indices: return (np.array(X_resampled), np.array(y_resampled), diff --git a/imblearn/ensemble/base.py b/imblearn/ensemble/base.py new file mode 100644 index 000000000..87fbd8250 --- /dev/null +++ b/imblearn/ensemble/base.py @@ -0,0 +1,17 @@ +""" +Base class for the ensemble method. +""" +# Authors: Guillaume Lemaitre +# License: MIT + +from ..base import BaseSampler + + +class BaseEnsembleSampler(BaseSampler): + """Base class for ensemble algorithms.
+ + Warning: This class should not be used directly. Use the derived classes + instead. + """ + + _sampling_type = 'ensemble' diff --git a/imblearn/ensemble/easy_ensemble.py b/imblearn/ensemble/easy_ensemble.py index 5742a2361..382e15574 100644 --- a/imblearn/ensemble/easy_ensemble.py +++ b/imblearn/ensemble/easy_ensemble.py @@ -4,18 +4,17 @@ # Christos Aridas # License: MIT -from __future__ import print_function - import numpy as np + from sklearn.utils import check_random_state -from ..base import BaseMulticlassSampler +from .base import BaseEnsembleSampler from ..under_sampling import RandomUnderSampler MAX_INT = np.iinfo(np.int32).max -class EasyEnsemble(BaseMulticlassSampler): +class EasyEnsemble(BaseEnsembleSampler): """Create an ensemble sets by iteratively applying random under-sampling. This method iteratively select a random subset and make an ensemble of the @@ -23,21 +22,32 @@ class EasyEnsemble(BaseMulticlassSampler): Parameters ---------- - ratio : str or float, optional (default='auto') - If 'auto', the ratio will be defined automatically to balance - the dataset. Otherwise, the ratio is defined as the number - of samples in the minority class over the the number of samples - in the majority class. + ratio : str, dict, or callable, optional (default='auto') + Ratio to use for resampling the data set. + + - If ``str``, has to be one of: (i) ``'minority'``: resample the + minority class; (ii) ``'majority'``: resample the majority class, + (iii) ``'not minority'``: resample all classes apart from the minority + class, (iv) ``'all'``: resample all classes, and (v) ``'auto'``: + corresponds to ``'all'`` for over-sampling methods and ``'not + minority'`` for under-sampling methods. The classes targeted will be + over-sampled or under-sampled to achieve an equal number of samples + with the majority or minority class. + - If ``dict``, the keys correspond to the targeted classes. The values + correspond to the desired number of samples. + - If callable, a function taking ``y`` and returning a ``dict``. The keys + correspond to the targeted classes. The values correspond to the + desired number of samples. return_indices : bool, optional (default=False) Whether or not to return the indices of the samples randomly selected from the majority class. random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by np.random. + If int, ``random_state`` is the seed used by the random number + generator; If ``RandomState`` instance, random_state is the random + number generator; If ``None``, the random number generator is the + ``RandomState`` instance used by ``np.random``. replacement : bool, optional (default=False) Whether or not to sample randomly with replacement or not. @@ -45,26 +55,11 @@ n_subsets : int, optional (default=10) Number of subsets to generate. - Attributes - ---------- - min_c_ : str or int - The identifier of the minority class. - - max_c_ : str or int - The identifier of the majority class. - - stats_c_ : dict of str/int : int - A dictionary in which the number of occurences of each class is - reported. - - X_shape_ : tuple of int - Shape of the data `X` during fitting. - Notes ----- The method is described in [1]_. - This method supports multiclass target type. + Supports multi-class resampling.
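For reference, the new ``ratio`` forms documented above can be exercised directly; the sketch below is illustrative only (the counts in the ``dict`` are made up for this example, not taken from the patch) and assumes the 0.3-style ``fit_sample`` API used throughout this diff:

    from collections import Counter
    from sklearn.datasets import make_classification
    from imblearn.ensemble import EasyEnsemble

    X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                               n_informative=3, n_redundant=1, flip_y=0,
                               n_features=20, n_clusters_per_class=1,
                               n_samples=1000, random_state=10)
    # dict form: keys are the targeted classes, values the desired number
    # of samples kept in each subset
    ee = EasyEnsemble(ratio={0: 100, 1: 100}, n_subsets=3, random_state=42)
    X_res, y_res = ee.fit_sample(X, y)
    print(Counter(y_res[0]))  # each of the 3 subsets is balanced at 100/100

A callable is handled the same way: it receives ``y`` and must return such a dict.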
Examples -------- @@ -72,7 +67,7 @@ class EasyEnsemble(BaseMulticlassSampler): >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> from imblearn.ensemble import \ - EasyEnsemble # doctest: +NORMALIZE_WHITESPACE +EasyEnsemble # doctest: +NORMALIZE_WHITESPACE >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) @@ -98,8 +93,8 @@ def __init__(self, random_state=None, replacement=False, n_subsets=10): - super(EasyEnsemble, self).__init__( - ratio=ratio, random_state=random_state) + super(EasyEnsemble, self).__init__(ratio=ratio, + random_state=random_state) self.return_indices = return_indices self.replacement = replacement self.n_subsets = n_subsets @@ -129,7 +124,6 @@ def _sample(self, X, y): """ - # Check the random state random_state = check_random_state(self.random_state) X_resampled = [] @@ -137,23 +131,12 @@ def _sample(self, X, y): if self.return_indices: idx_under = [] - self.samplers_ = [] - for _ in range(self.n_subsets): rus = RandomUnderSampler( - ratio=self.ratio, - return_indices=self.return_indices, + ratio=self.ratio_, return_indices=True, random_state=random_state.randint(MAX_INT), replacement=self.replacement) - self.samplers_.append(rus) - - for rus in self.samplers_: - - if self.return_indices: - sel_x, sel_y, sel_idx = rus.fit_sample(X, y) - else: - sel_x, sel_y = rus.fit_sample(X, y) - + sel_x, sel_y, sel_idx = rus.fit_sample(X, y) X_resampled.append(sel_x) y_resampled.append(sel_y) if self.return_indices: diff --git a/imblearn/ensemble/tests/test_balance_cascade.py b/imblearn/ensemble/tests/test_balance_cascade.py index 11fac9443..89448d50f 100644 --- a/imblearn/ensemble/tests/test_balance_cascade.py +++ b/imblearn/ensemble/tests/test_balance_cascade.py @@ -1,14 +1,17 @@ """Test the module balance cascade.""" +# Authors: Guillaume Lemaitre +# Christos Aridas +# License: MIT + from __future__ import print_function import numpy as np -from numpy.testing import (assert_array_equal, assert_equal, - assert_raises, assert_raises_regex) +from numpy.testing import (assert_array_equal, assert_raises, + assert_raises_regex) from sklearn.ensemble import RandomForestClassifier from imblearn.ensemble import BalanceCascade -# Generate a global dataset to use RND_SEED = 0 X = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [1.25192108, -0.22367336], [0.53366841, -0.30312976], @@ -23,608 +26,341 @@ Y = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0]) -def test_bc_init(): - # Define a ratio - ratio = 1. 
- bc = BalanceCascade(ratio=ratio, random_state=RND_SEED) - - assert_equal(bc.ratio, ratio) - assert_equal(bc.bootstrap, True) - assert_equal(bc.n_max_subset, None) - assert_equal(bc.random_state, RND_SEED) - - def test_fit_sample_auto(): - # Define the ratio parameter ratio = 'auto' - - # Create the sampling object - bc = BalanceCascade( - ratio=ratio, - random_state=RND_SEED, - return_indices=True, - bootstrap=False) - - # Get the different subset + bc = BalanceCascade(ratio=ratio, random_state=RND_SEED, + return_indices=True) X_resampled, y_resampled, idx_under = bc.fit_sample(X, Y) - - X_gt = np.array( - [ - np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336], - [0.53366841, -0.30312976], [1.52091956, -0.49283504], - [0.88407872, 0.35454207], [1.31301027, -0.92648734], - [-0.41635887, -0.38299653], [1.70580611, -0.11219234], - [1.15514042, 0.0129463], [0.08711622, 0.93259929], - [0.70472253, -0.73309052], [-0.14374509, 0.27370049], - [0.83680821, 1.72827342], [-0.18410027, -0.45194484], - [-0.28162401, -2.10400981], [-1.11515198, -0.93689695]]), - np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336], - [0.53366841, -0.30312976], [1.52091956, -0.49283504], - [0.88407872, 0.35454207], [1.31301027, -0.92648734], - [-0.41635887, -0.38299653], [1.70580611, -0.11219234], - [1.15514042, 0.0129463], [0.70472253, -0.73309052], - [-0.18410027, -0.45194484], [0.77481731, 0.60935141], - [0.3084254, 0.33299982], [0.28893132, -0.38761769], - [0.9281014, 0.53085498]]) - ], - dtype=object) - y_gt = np.array( - [ - np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]), - np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]) - ], - dtype=object) + X_gt = np.array([[[1.15514042, 0.0129463], + [0.08711622, 0.93259929], + [0.70472253, -0.73309052], + [-0.14374509, 0.27370049], + [0.83680821, 1.72827342], + [-0.18410027, -0.45194484], + [-0.28162401, -2.10400981], + [-1.11515198, -0.93689695], + [0.11622591, -0.0317206], + [1.25192108, -0.22367336], + [0.53366841, -0.30312976], + [1.52091956, -0.49283504], + [0.88407872, 0.35454207], + [1.31301027, -0.92648734], + [-0.41635887, -0.38299653], + [1.70580611, -0.11219234]], + [[0.28893132, -0.38761769], + [0.83680821, 1.72827342], + [0.3084254, 0.33299982], + [0.70472253, -0.73309052], + [-0.14374509, 0.27370049], + [0.77481731, 0.60935141], + [-0.18410027, -0.45194484], + [1.15514042, 0.0129463], + [0.11622591, -0.0317206], + [1.25192108, -0.22367336], + [0.53366841, -0.30312976], + [1.52091956, -0.49283504], + [0.88407872, 0.35454207], + [1.31301027, -0.92648734], + [-0.41635887, -0.38299653], + [1.70580611, -0.11219234]]]) + y_gt = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]) idx_gt = np.array( - [ - np.array([0, 2, 3, 4, 11, 12, 17, 19, 10, 18, 8, 16, 6, 14, 5, - 13]), - np.array([0, 2, 3, 4, 11, 12, 17, 19, 10, 8, 14, 1, 7, 9, 15]) - ], - dtype=object) - # Check each array - for idx in range(X_gt.size): - assert_array_equal(X_resampled[idx], X_gt[idx]) - assert_array_equal(y_resampled[idx], y_gt[idx]) - assert_array_equal(idx_under[idx], idx_gt[idx]) + [[10, 18, 8, 16, 6, 14, 5, 13, 0, 2, 3, 4, 11, 12, 17, 19], + [9, 6, 7, 8, 16, 1, 14, 10, 0, 2, 3, 4, 11, 12, 17, 19]]) + assert_array_equal(X_resampled, X_gt) + assert_array_equal(y_resampled, y_gt) + assert_array_equal(idx_under, idx_gt) def test_fit_sample_half(): - # Define the ratio parameter ratio = 0.8 - - # Create the sampling object - bc = BalanceCascade(ratio=ratio, random_state=RND_SEED, bootstrap=False) 
- - # Get the different subset + bc = BalanceCascade(ratio=ratio, random_state=RND_SEED) X_resampled, y_resampled = bc.fit_sample(X, Y) - - X_gt = np.array( - [ - np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336], - [0.53366841, -0.30312976], [1.52091956, -0.49283504], - [0.88407872, 0.35454207], [1.31301027, -0.92648734], - [-0.41635887, -0.38299653], [1.70580611, -0.11219234], - [1.15514042, 0.0129463], [0.08711622, 0.93259929], - [0.70472253, -0.73309052], [-0.14374509, 0.27370049], - [0.83680821, 1.72827342], [-0.18410027, -0.45194484], - [-0.28162401, -2.10400981], [-1.11515198, -0.93689695], - [0.9281014, 0.53085498], [0.3084254, 0.33299982]]), - np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336], - [0.53366841, -0.30312976], [1.52091956, -0.49283504], - [0.88407872, 0.35454207], [1.31301027, -0.92648734], - [-0.41635887, -0.38299653], [1.70580611, -0.11219234], - [1.15514042, 0.0129463], [0.70472253, -0.73309052], - [-0.18410027, -0.45194484], [0.77481731, 0.60935141], - [0.28893132, -0.38761769]]) - ], - dtype=object) - - y_gt = np.array( - [ - np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), - np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]) - ], - dtype=object) - # Check each array - for idx in range(X_gt.size): - assert_array_equal(X_resampled[idx], X_gt[idx]) - assert_array_equal(y_resampled[idx], y_gt[idx]) + X_gt = np.array([[[1.15514042, 0.0129463], + [0.08711622, 0.93259929], + [0.70472253, -0.73309052], + [-0.14374509, 0.27370049], + [0.83680821, 1.72827342], + [-0.18410027, -0.45194484], + [-0.28162401, -2.10400981], + [-1.11515198, -0.93689695], + [0.9281014, 0.53085498], + [0.3084254, 0.33299982], + [0.11622591, -0.0317206], + [1.25192108, -0.22367336], + [0.53366841, -0.30312976], + [1.52091956, -0.49283504], + [0.88407872, 0.35454207], + [1.31301027, -0.92648734], + [-0.41635887, -0.38299653], + [1.70580611, -0.11219234]]]) + y_gt = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]) + assert_array_equal(X_resampled, X_gt) + assert_array_equal(y_resampled, y_gt) def test_fit_sample_auto_decision_tree(): - # Define the ratio parameter ratio = 'auto' classifier = 'decision-tree' - - # Create the sampling object - bc = BalanceCascade( - ratio=ratio, - random_state=RND_SEED, - return_indices=True, - classifier=classifier) - - # Get the different subset - X_resampled, y_resampled, idx_under = bc.fit_sample(X, Y) - - X_gt = np.array( - [ - np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336], - [0.53366841, -0.30312976], [1.52091956, -0.49283504], - [0.88407872, 0.35454207], [1.31301027, -0.92648734], - [-0.41635887, -0.38299653], [1.70580611, -0.11219234], - [1.15514042, 0.0129463], [0.08711622, 0.93259929], - [0.70472253, -0.73309052], [-0.14374509, 0.27370049], - [0.83680821, 1.72827342], [-0.18410027, -0.45194484], - [-0.28162401, -2.10400981], [-1.11515198, -0.93689695]]), - np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336], - [0.53366841, -0.30312976], [1.52091956, -0.49283504], - [0.88407872, 0.35454207], [1.31301027, -0.92648734], - [-0.41635887, -0.38299653], [1.70580611, -0.11219234], - [-1.11515198, -0.93689695], [0.77481731, 0.60935141], - [0.3084254, 0.33299982], [0.28893132, -0.38761769], - [0.9281014, 0.53085498]]) - ], - dtype=object) - y_gt = np.array( - [ - np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]), - np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]) - ], - dtype=object) - idx_gt = np.array( - [ - np.array([0, 2, 3, 4, 11, 12, 17, 19, 10, 18, 8, 16, 6, 14, 5, - 
13]), - np.array([0, 2, 3, 4, 11, 12, 17, 19, 13, 1, 7, 9, 15]) - ], - dtype=object) - # Check each array - for idx in range(X_gt.size): - assert_array_equal(X_resampled[idx], X_gt[idx]) - assert_array_equal(y_resampled[idx], y_gt[idx]) - assert_array_equal(idx_under[idx], idx_gt[idx]) + bc = BalanceCascade(ratio=ratio, random_state=RND_SEED, + return_indices=False, classifier=classifier) + X_resampled, y_resampled = bc.fit_sample(X, Y) + X_gt = np.array([[[1.15514042, 0.0129463], + [0.08711622, 0.93259929], + [0.70472253, -0.73309052], + [-0.14374509, 0.27370049], + [0.83680821, 1.72827342], + [-0.18410027, -0.45194484], + [-0.28162401, -2.10400981], + [-1.11515198, -0.93689695], + [0.11622591, -0.0317206], + [1.25192108, -0.22367336], + [0.53366841, -0.30312976], + [1.52091956, -0.49283504], + [0.88407872, 0.35454207], + [1.31301027, -0.92648734], + [-0.41635887, -0.38299653], + [1.70580611, -0.11219234]]]) + y_gt = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]) + assert_array_equal(X_resampled, X_gt) + assert_array_equal(y_resampled, y_gt) def test_fit_sample_auto_random_forest(): - # Define the ratio parameter ratio = 'auto' classifier = 'random-forest' - - # Create the sampling object - bc = BalanceCascade( - ratio=ratio, - random_state=RND_SEED, - return_indices=True, - classifier=classifier) - - # Get the different subset - X_resampled, y_resampled, idx_under = bc.fit_sample(X, Y) - - X_gt = np.array( - [ - np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336], - [0.53366841, -0.30312976], [1.52091956, -0.49283504], - [0.88407872, 0.35454207], [1.31301027, -0.92648734], - [-0.41635887, -0.38299653], [1.70580611, -0.11219234], - [1.15514042, 0.0129463], [0.08711622, 0.93259929], - [0.70472253, -0.73309052], [-0.14374509, 0.27370049], - [0.83680821, 1.72827342], [-0.18410027, -0.45194484], - [-0.28162401, -2.10400981], [-1.11515198, -0.93689695]]), - np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336], - [0.53366841, -0.30312976], [1.52091956, -0.49283504], - [0.88407872, 0.35454207], [1.31301027, -0.92648734], - [-0.41635887, -0.38299653], [1.70580611, -0.11219234], - [1.15514042, 0.0129463], [-0.14374509, 0.27370049], - [-1.11515198, -0.93689695], [0.77481731, 0.60935141], - [0.3084254, 0.33299982], [0.28893132, -0.38761769], - [0.9281014, 0.53085498]]) - ], - dtype=object) - y_gt = np.array( - [ - np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]), - np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]) - ], - dtype=object) - idx_gt = np.array( - [ - np.array([0, 2, 3, 4, 11, 12, 17, 19, 10, 18, 8, 16, 6, 14, 5, - 13]), - np.array([0, 2, 3, 4, 11, 12, 17, 19, 10, 16, 13, 1, 7, 9, 15]) - ], - dtype=object) - # Check each array - for idx in range(X_gt.size): - assert_array_equal(X_resampled[idx], X_gt[idx]) - assert_array_equal(y_resampled[idx], y_gt[idx]) - assert_array_equal(idx_under[idx], idx_gt[idx]) + bc = BalanceCascade(ratio=ratio, random_state=RND_SEED, + return_indices=False, classifier=classifier) + X_resampled, y_resampled = bc.fit_sample(X, Y) + X_gt = np.array([[[1.15514042, 0.0129463], + [0.08711622, 0.93259929], + [0.70472253, -0.73309052], + [-0.14374509, 0.27370049], + [0.83680821, 1.72827342], + [-0.18410027, -0.45194484], + [-0.28162401, -2.10400981], + [-1.11515198, -0.93689695], + [0.11622591, -0.0317206], + [1.25192108, -0.22367336], + [0.53366841, -0.30312976], + [1.52091956, -0.49283504], + [0.88407872, 0.35454207], + [1.31301027, -0.92648734], + [-0.41635887, -0.38299653], + [1.70580611, -0.11219234]]]) + y_gt = 
np.array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]) + assert_array_equal(X_resampled, X_gt) + assert_array_equal(y_resampled, y_gt) def test_fit_sample_auto_adaboost(): - # Define the ratio parameter ratio = 'auto' classifier = 'adaboost' - - # Create the sampling object - bc = BalanceCascade( - ratio=ratio, - random_state=RND_SEED, - return_indices=True, - classifier=classifier) - - # Get the different subset - X_resampled, y_resampled, idx_under = bc.fit_sample(X, Y) - - X_gt = np.array( - [ - np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336], - [0.53366841, -0.30312976], [1.52091956, -0.49283504], - [0.88407872, 0.35454207], [1.31301027, -0.92648734], - [-0.41635887, -0.38299653], [1.70580611, -0.11219234], - [1.15514042, 0.0129463], [0.08711622, 0.93259929], - [0.70472253, -0.73309052], [-0.14374509, 0.27370049], - [0.83680821, 1.72827342], [-0.18410027, -0.45194484], - [-0.28162401, -2.10400981], [-1.11515198, -0.93689695]]), - np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336], - [0.53366841, -0.30312976], [1.52091956, -0.49283504], - [0.88407872, 0.35454207], [1.31301027, -0.92648734], - [-0.41635887, -0.38299653], [1.70580611, -0.11219234], - [-0.14374509, 0.27370049], [-1.11515198, -0.93689695], - [0.77481731, 0.60935141], [0.3084254, 0.33299982], - [0.28893132, -0.38761769], [0.9281014, 0.53085498]]) - ], - dtype=object) - y_gt = np.array( - [ - np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]), - np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]) - ], - dtype=object) - idx_gt = np.array( - [ - np.array([0, 2, 3, 4, 11, 12, 17, 19, 10, 18, 8, 16, 6, 14, 5, - 13]), - np.array([0, 2, 3, 4, 11, 12, 17, 19, 16, 13, 1, 7, 9, 15]) - ], - dtype=object) - # Check each array - for idx in range(X_gt.size): - assert_array_equal(X_resampled[idx], X_gt[idx]) - assert_array_equal(y_resampled[idx], y_gt[idx]) - assert_array_equal(idx_under[idx], idx_gt[idx]) + bc = BalanceCascade(ratio=ratio, random_state=RND_SEED, + return_indices=False, classifier=classifier) + X_resampled, y_resampled = bc.fit_sample(X, Y) + X_gt = np.array([[[1.15514042, 0.0129463], + [0.08711622, 0.93259929], + [0.70472253, -0.73309052], + [-0.14374509, 0.27370049], + [0.83680821, 1.72827342], + [-0.18410027, -0.45194484], + [-0.28162401, -2.10400981], + [-1.11515198, -0.93689695], + [0.11622591, -0.0317206], + [1.25192108, -0.22367336], + [0.53366841, -0.30312976], + [1.52091956, -0.49283504], + [0.88407872, 0.35454207], + [1.31301027, -0.92648734], + [-0.41635887, -0.38299653], + [1.70580611, -0.11219234]]]) + y_gt = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]) + assert_array_equal(X_resampled, X_gt) + assert_array_equal(y_resampled, y_gt) def test_fit_sample_auto_gradient_boosting(): - # Define the ratio parameter ratio = 'auto' classifier = 'gradient-boosting' + bc = BalanceCascade(ratio=ratio, random_state=RND_SEED, + return_indices=False, classifier=classifier) + X_resampled, y_resampled = bc.fit_sample(X, Y) + X_gt = np.array([[[1.15514042, 0.0129463], + [0.08711622, 0.93259929], + [0.70472253, -0.73309052], + [-0.14374509, 0.27370049], + [0.83680821, 1.72827342], + [-0.18410027, -0.45194484], + [-0.28162401, -2.10400981], + [-1.11515198, -0.93689695], + [0.11622591, -0.0317206], + [1.25192108, -0.22367336], + [0.53366841, -0.30312976], + [1.52091956, -0.49283504], + [0.88407872, 0.35454207], + [1.31301027, -0.92648734], + [-0.41635887, -0.38299653], + [1.70580611, -0.11219234]]]) + y_gt = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]) + 
assert_array_equal(X_resampled, X_gt) + assert_array_equal(y_resampled, y_gt) - # Create the sampling object - bc = BalanceCascade( - ratio=ratio, - random_state=RND_SEED, - return_indices=True, - classifier=classifier) - - # Get the different subset - X_resampled, y_resampled, idx_under = bc.fit_sample(X, Y) - - X_gt = np.array( - [ - np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336], - [0.53366841, -0.30312976], [1.52091956, -0.49283504], - [0.88407872, 0.35454207], [1.31301027, -0.92648734], - [-0.41635887, -0.38299653], [1.70580611, -0.11219234], - [1.15514042, 0.0129463], [0.08711622, 0.93259929], - [0.70472253, -0.73309052], [-0.14374509, 0.27370049], - [0.83680821, 1.72827342], [-0.18410027, -0.45194484], - [-0.28162401, -2.10400981], [-1.11515198, -0.93689695]]), - np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336], - [0.53366841, -0.30312976], [1.52091956, -0.49283504], - [0.88407872, 0.35454207], [1.31301027, -0.92648734], - [-0.41635887, -0.38299653], [1.70580611, -0.11219234], - [-0.14374509, 0.27370049], [-1.11515198, -0.93689695], - [0.77481731, 0.60935141], [0.3084254, 0.33299982], - [0.28893132, -0.38761769], [0.9281014, 0.53085498]]) - ], - dtype=object) - y_gt = np.array( - [ - np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]), - np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]) - ], - dtype=object) - idx_gt = np.array( - [ - np.array([0, 2, 3, 4, 11, 12, 17, 19, 10, 18, 8, 16, 6, 14, 5, - 13]), - np.array([0, 2, 3, 4, 11, 12, 17, 19, 16, 13, 1, 7, 9, 15]) - ], - dtype=object) - # Check each array - for idx in range(X_gt.size): - assert_array_equal(X_resampled[idx], X_gt[idx]) - assert_array_equal(y_resampled[idx], y_gt[idx]) - assert_array_equal(idx_under[idx], idx_gt[idx]) +def test_fit_sample_auto_knn(): + ratio = 'auto' + classifier = 'knn' + bc = BalanceCascade(ratio=ratio, random_state=RND_SEED, + return_indices=False, classifier=classifier) + X_resampled, y_resampled = bc.fit_sample(X, Y) + X_gt = np.array([[[1.15514042, 0.0129463], + [0.08711622, 0.93259929], + [0.70472253, -0.73309052], + [-0.14374509, 0.27370049], + [0.83680821, 1.72827342], + [-0.18410027, -0.45194484], + [-0.28162401, -2.10400981], + [-1.11515198, -0.93689695], + [0.11622591, -0.0317206], + [1.25192108, -0.22367336], + [0.53366841, -0.30312976], + [1.52091956, -0.49283504], + [0.88407872, 0.35454207], + [1.31301027, -0.92648734], + [-0.41635887, -0.38299653], + [1.70580611, -0.11219234]], + [[0.28893132, -0.38761769], + [0.83680821, 1.72827342], + [0.3084254, 0.33299982], + [0.70472253, -0.73309052], + [-0.14374509, 0.27370049], + [0.77481731, 0.60935141], + [-0.18410027, -0.45194484], + [1.15514042, 0.0129463], + [0.11622591, -0.0317206], + [1.25192108, -0.22367336], + [0.53366841, -0.30312976], + [1.52091956, -0.49283504], + [0.88407872, 0.35454207], + [1.31301027, -0.92648734], + [-0.41635887, -0.38299653], + [1.70580611, -0.11219234]]]) + y_gt = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]) + assert_array_equal(X_resampled, X_gt) + assert_array_equal(y_resampled, y_gt) def test_fit_sample_auto_linear_svm(): - # Define the ratio parameter ratio = 'auto' classifier = 'linear-svm' - - # Create the sampling object - bc = BalanceCascade( - ratio=ratio, - random_state=RND_SEED, - return_indices=True, - classifier=classifier) - - # Get the different subset - X_resampled, y_resampled, idx_under = bc.fit_sample(X, Y) - - X_gt = np.array( - [ - np.array([[0.11622591, -0.0317206], [1.25192108, 
-0.22367336], - [0.53366841, -0.30312976], [1.52091956, -0.49283504], - [0.88407872, 0.35454207], [1.31301027, -0.92648734], - [-0.41635887, -0.38299653], [1.70580611, -0.11219234], - [1.15514042, 0.0129463], [0.08711622, 0.93259929], - [0.70472253, -0.73309052], [-0.14374509, 0.27370049], - [0.83680821, 1.72827342], [-0.18410027, -0.45194484], - [-0.28162401, -2.10400981], [-1.11515198, -0.93689695]]), - np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336], - [0.53366841, -0.30312976], [1.52091956, -0.49283504], - [0.88407872, 0.35454207], [1.31301027, -0.92648734], - [-0.41635887, -0.38299653], [1.70580611, -0.11219234], - [1.15514042, 0.0129463], [0.70472253, -0.73309052], - [0.77481731, 0.60935141], [0.3084254, 0.33299982], - [0.28893132, -0.38761769], [0.9281014, 0.53085498]]) - ], - dtype=object) - y_gt = np.array( - [ - np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]), - np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]) - ], - dtype=object) - idx_gt = np.array( - [ - np.array([0, 2, 3, 4, 11, 12, 17, 19, 10, 18, 8, 16, 6, 14, 5, - 13]), - np.array([0, 2, 3, 4, 11, 12, 17, 19, 10, 8, 1, 7, 9, 15]) - ], - dtype=object) - - # Check each array - for idx in range(X_gt.size): - assert_array_equal(X_resampled[idx], X_gt[idx]) - assert_array_equal(y_resampled[idx], y_gt[idx]) - assert_array_equal(idx_under[idx], idx_gt[idx]) + bc = BalanceCascade(ratio=ratio, random_state=RND_SEED, + return_indices=False, classifier=classifier) + X_resampled, y_resampled = bc.fit_sample(X, Y) + X_gt = np.array([[[1.15514042, 0.0129463], + [0.08711622, 0.93259929], + [0.70472253, -0.73309052], + [-0.14374509, 0.27370049], + [0.83680821, 1.72827342], + [-0.18410027, -0.45194484], + [-0.28162401, -2.10400981], + [-1.11515198, -0.93689695], + [0.11622591, -0.0317206], + [1.25192108, -0.22367336], + [0.53366841, -0.30312976], + [1.52091956, -0.49283504], + [0.88407872, 0.35454207], + [1.31301027, -0.92648734], + [-0.41635887, -0.38299653], + [1.70580611, -0.11219234]], + [[1.15514042, 0.0129463], + [0.9281014, 0.53085498], + [0.3084254, 0.33299982], + [0.28893132, -0.38761769], + [-0.28162401, -2.10400981], + [0.83680821, 1.72827342], + [0.70472253, -0.73309052], + [0.77481731, 0.60935141], + [0.11622591, -0.0317206], + [1.25192108, -0.22367336], + [0.53366841, -0.30312976], + [1.52091956, -0.49283504], + [0.88407872, 0.35454207], + [1.31301027, -0.92648734], + [-0.41635887, -0.38299653], + [1.70580611, -0.11219234]]]) + y_gt = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]) + assert_array_equal(X_resampled, X_gt) + assert_array_equal(y_resampled, y_gt) def test_init_wrong_classifier(): - # Define the ratio parameter classifier = 'rnd' - bc = BalanceCascade(classifier=classifier) assert_raises(NotImplementedError, bc.fit_sample, X, Y) def test_fit_sample_auto_early_stop(): - # Define the ratio parameter ratio = 'auto' - n_subset = 1 - - # Create the sampling object - bc = BalanceCascade( - ratio=ratio, - random_state=RND_SEED, - return_indices=True, - n_max_subset=n_subset) - - # Get the different subset - X_resampled, y_resampled, idx_under = bc.fit_sample(X, Y) - - X_gt = np.array([[[0.11622591, -0.0317206], [1.25192108, -0.22367336], - [0.53366841, -0.30312976], [1.52091956, -0.49283504], - [0.88407872, 0.35454207], [1.31301027, -0.92648734], - [-0.41635887, -0.38299653], [1.70580611, -0.11219234], - [1.15514042, 0.0129463], [0.08711622, 0.93259929], - [0.70472253, -0.73309052], [-0.14374509, 0.27370049], - [0.83680821, 
1.72827342], [-0.18410027, -0.45194484], - [-0.28162401, -2.10400981], [-1.11515198, -0.93689695]]]) - - y_gt = np.array([[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]]) - idx_gt = np.array( - [[0, 2, 3, 4, 11, 12, 17, 19, 10, 18, 8, 16, 6, 14, 5, 13]]) - # Check each array + classifier = 'linear-svm' + bc = BalanceCascade(ratio=ratio, random_state=RND_SEED, + return_indices=False, classifier=classifier, + n_max_subset=1) + X_resampled, y_resampled = bc.fit_sample(X, Y) + X_gt = np.array([[[1.15514042, 0.0129463], + [0.08711622, 0.93259929], + [0.70472253, -0.73309052], + [-0.14374509, 0.27370049], + [0.83680821, 1.72827342], + [-0.18410027, -0.45194484], + [-0.28162401, -2.10400981], + [-1.11515198, -0.93689695], + [0.11622591, -0.0317206], + [1.25192108, -0.22367336], + [0.53366841, -0.30312976], + [1.52091956, -0.49283504], + [0.88407872, 0.35454207], + [1.31301027, -0.92648734], + [-0.41635887, -0.38299653], + [1.70580611, -0.11219234]]]) + y_gt = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) - assert_array_equal(idx_under, idx_gt) - - -def test_fit_sample_auto_early_stop_2(): - # Define the ratio parameter - ratio = 'auto' - n_subset = 2 - - # Create the sampling object - bc = BalanceCascade( - ratio=ratio, - random_state=RND_SEED, - return_indices=True, - n_max_subset=n_subset, - bootstrap=False) - - # Get the different subset - X_resampled, y_resampled, idx_under = bc.fit_sample(X, Y) - - X_gt = np.array( - [ - np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336], - [0.53366841, -0.30312976], [1.52091956, -0.49283504], - [0.88407872, 0.35454207], [1.31301027, -0.92648734], - [-0.41635887, -0.38299653], [1.70580611, -0.11219234], - [1.15514042, 0.0129463], [0.08711622, 0.93259929], - [0.70472253, -0.73309052], [-0.14374509, 0.27370049], - [0.83680821, 1.72827342], [-0.18410027, -0.45194484], - [-0.28162401, -2.10400981], [-1.11515198, -0.93689695]]), - np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336], - [0.53366841, -0.30312976], [1.52091956, -0.49283504], - [0.88407872, 0.35454207], [1.31301027, -0.92648734], - [-0.41635887, -0.38299653], [1.70580611, -0.11219234], - [1.15514042, 0.0129463], [0.70472253, -0.73309052], - [-0.18410027, -0.45194484], [0.77481731, 0.60935141], - [0.3084254, 0.33299982], [0.28893132, -0.38761769], - [0.9281014, 0.53085498]]) - ], - dtype=object) - y_gt = np.array( - [ - np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]), - np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]) - ], - dtype=object) - idx_gt = np.array( - [ - np.array([0, 2, 3, 4, 11, 12, 17, 19, 10, 18, 8, 16, 6, 14, 5, - 13]), - np.array([0, 2, 3, 4, 11, 12, 17, 19, 10, 8, 14, 1, 7, 9, 15]) - ], - dtype=object) - # Check each array - for idx in range(X_gt.size): - assert_array_equal(X_resampled[idx], X_gt[idx]) - assert_array_equal(y_resampled[idx], y_gt[idx]) - assert_array_equal(idx_under[idx], idx_gt[idx]) def test_give_classifier_obj(): - # Define the ratio parameter ratio = 'auto' classifier = RandomForestClassifier(random_state=RND_SEED) - - # Create the sampling object - bc = BalanceCascade( - ratio=ratio, - random_state=RND_SEED, - return_indices=True, - estimator=classifier) - - # Get the different subset - X_resampled, y_resampled, idx_under = bc.fit_sample(X, Y) - - X_gt = np.array( - [ - np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336], - [0.53366841, -0.30312976], [1.52091956, -0.49283504], - [0.88407872, 0.35454207], [1.31301027, 
-0.92648734], - [-0.41635887, -0.38299653], [1.70580611, -0.11219234], - [1.15514042, 0.0129463], [0.08711622, 0.93259929], - [0.70472253, -0.73309052], [-0.14374509, 0.27370049], - [0.83680821, 1.72827342], [-0.18410027, -0.45194484], - [-0.28162401, -2.10400981], [-1.11515198, -0.93689695]]), - np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336], - [0.53366841, -0.30312976], [1.52091956, -0.49283504], - [0.88407872, 0.35454207], [1.31301027, -0.92648734], - [-0.41635887, -0.38299653], [1.70580611, -0.11219234], - [1.15514042, 0.0129463], [-0.14374509, 0.27370049], - [-1.11515198, -0.93689695], [0.77481731, 0.60935141], - [0.3084254, 0.33299982], [0.28893132, -0.38761769], - [0.9281014, 0.53085498]]) - ], - dtype=object) - y_gt = np.array( - [ - np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]), - np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]) - ], - dtype=object) - idx_gt = np.array( - [ - np.array([0, 2, 3, 4, 11, 12, 17, 19, 10, 18, 8, 16, 6, 14, 5, - 13]), - np.array([0, 2, 3, 4, 11, 12, 17, 19, 10, 16, 13, 1, 7, 9, 15]) - ], - dtype=object) - - # Check each array - for idx in range(X_gt.size): - assert_array_equal(X_resampled[idx], X_gt[idx]) - assert_array_equal(y_resampled[idx], y_gt[idx]) - assert_array_equal(idx_under[idx], idx_gt[idx]) + bc = BalanceCascade(ratio=ratio, random_state=RND_SEED, + return_indices=False, estimator=classifier) + X_resampled, y_resampled = bc.fit_sample(X, Y) + X_gt = np.array([[[1.15514042, 0.0129463], + [0.08711622, 0.93259929], + [0.70472253, -0.73309052], + [-0.14374509, 0.27370049], + [0.83680821, 1.72827342], + [-0.18410027, -0.45194484], + [-0.28162401, -2.10400981], + [-1.11515198, -0.93689695], + [0.11622591, -0.0317206], + [1.25192108, -0.22367336], + [0.53366841, -0.30312976], + [1.52091956, -0.49283504], + [0.88407872, 0.35454207], + [1.31301027, -0.92648734], + [-0.41635887, -0.38299653], + [1.70580611, -0.11219234]]]) + y_gt = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]) + assert_array_equal(X_resampled, X_gt) + assert_array_equal(y_resampled, y_gt) def test_give_classifier_wrong_obj(): - # Define the ratio parameter ratio = 'auto' classifier = 2 - - # Create the sampling object - bc = BalanceCascade( - ratio=ratio, - random_state=RND_SEED, - return_indices=True, - estimator=classifier) - - # Get the different subset + bc = BalanceCascade(ratio=ratio, random_state=RND_SEED, + return_indices=True, estimator=classifier) assert_raises_regex(ValueError, "Invalid parameter `estimator`", bc.fit_sample, X, Y) - - -def test_rf_wth_bootstrap(): - # Define the ratio parameter - ratio = 'auto' - classifier = RandomForestClassifier(random_state=RND_SEED) - - # Create the sampling object - bc = BalanceCascade( - ratio=ratio, - random_state=RND_SEED, - return_indices=True, - estimator=classifier, - bootstrap=False) - - # Get the different subset - X_resampled, y_resampled, idx_under = bc.fit_sample(X, Y) - - X_gt = np.array( - [ - np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336], - [0.53366841, -0.30312976], [1.52091956, -0.49283504], - [0.88407872, 0.35454207], [1.31301027, -0.92648734], - [-0.41635887, -0.38299653], [1.70580611, -0.11219234], - [1.15514042, 0.0129463], [0.08711622, 0.93259929], - [0.70472253, -0.73309052], [-0.14374509, 0.27370049], - [0.83680821, 1.72827342], [-0.18410027, -0.45194484], - [-0.28162401, -2.10400981], [-1.11515198, -0.93689695]]), - np.array([[0.11622591, -0.0317206], [1.25192108, -0.22367336], - [0.53366841, -0.30312976], [1.52091956, -0.49283504], - 
[0.88407872, 0.35454207], [1.31301027, -0.92648734], - [-0.41635887, -0.38299653], [1.70580611, -0.11219234], - [1.15514042, 0.0129463], [0.77481731, 0.60935141], - [0.3084254, 0.33299982], [0.28893132, -0.38761769], - [0.9281014, 0.53085498]]) - ], - dtype=object) - y_gt = np.array( - [ - np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]), - np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]) - ], - dtype=object) - idx_gt = np.array( - [ - np.array([0, 2, 3, 4, 11, 12, 17, 19, 10, 18, 8, 16, 6, 14, 5, - 13]), - np.array([0, 2, 3, 4, 11, 12, 17, 19, 10, 1, 7, 9, 15]) - ], - dtype=object) - - # Check each array - for idx in range(X_gt.size): - assert_array_equal(X_resampled[idx], X_gt[idx]) - assert_array_equal(y_resampled[idx], y_gt[idx]) - assert_array_equal(idx_under[idx], idx_gt[idx]) diff --git a/imblearn/ensemble/tests/test_easy_ensemble.py b/imblearn/ensemble/tests/test_easy_ensemble.py index 34351e75a..2b05d57c2 100644 --- a/imblearn/ensemble/tests/test_easy_ensemble.py +++ b/imblearn/ensemble/tests/test_easy_ensemble.py @@ -1,4 +1,8 @@ """Test the module easy ensemble.""" +# Authors: Guillaume Lemaitre +# Christos Aridas +# License: MIT + from __future__ import print_function import numpy as np diff --git a/imblearn/metrics/tests/test_classification.py b/imblearn/metrics/tests/test_classification.py index 87d25b365..2ab75806a 100644 --- a/imblearn/metrics/tests/test_classification.py +++ b/imblearn/metrics/tests/test_classification.py @@ -1,4 +1,8 @@ """Testing the metric for classification with imbalanced dataset""" +# Authors: Guillaume Lemaitre +# Christos Aridas +# License: MIT + from __future__ import division, print_function diff --git a/imblearn/metrics/tests/test_score_objects.py b/imblearn/metrics/tests/test_score_objects.py index 08a268d6b..514915f08 100644 --- a/imblearn/metrics/tests/test_score_objects.py +++ b/imblearn/metrics/tests/test_score_objects.py @@ -1,3 +1,8 @@ +"""Test for score""" +# Authors: Guillaume Lemaitre +# Christos Aridas +# License: MIT + from numpy.testing import assert_allclose import sklearn diff --git a/imblearn/over_sampling/__init__.py b/imblearn/over_sampling/__init__.py index 9a4ee479d..4b94d047a 100644 --- a/imblearn/over_sampling/__init__.py +++ b/imblearn/over_sampling/__init__.py @@ -3,8 +3,10 @@ perform over-sampling. """ +from .adasyn import ADASYN from .random_over_sampler import RandomOverSampler from .smote import SMOTE -from .adasyn import ADASYN -__all__ = ['RandomOverSampler', 'SMOTE', 'ADASYN'] +__all__ = ['ADASYN', + 'RandomOverSampler', + 'SMOTE'] diff --git a/imblearn/over_sampling/adasyn.py b/imblearn/over_sampling/adasyn.py index 04995e08f..ea99efaac 100644 --- a/imblearn/over_sampling/adasyn.py +++ b/imblearn/over_sampling/adasyn.py @@ -4,18 +4,17 @@ # Christos Aridas # License: MIT -from __future__ import division, print_function - -from collections import Counter +from __future__ import division import numpy as np from sklearn.utils import check_random_state -from ..base import BaseBinarySampler +from .base import BaseOverSampler from ..utils import check_neighbors_object +from ..utils.deprecation import deprecate_parameter -class ADASYN(BaseBinarySampler): +class ADASYN(BaseOverSampler): """Perform over-sampling using ADASYN. Perform over-sampling using Adaptive Synthetic Sampling Approach for @@ -23,52 +22,48 @@ class ADASYN(BaseBinarySampler): Parameters ---------- - ratio : str or float, optional (default='auto') - If 'auto', the ratio will be defined automatically to balance - the dataset. 
Otherwise, the ratio is defined as the number - of samples in the minority class over the the number of samples - in the majority class. + ratio : str, dict, or callable, optional (default='auto') + Ratio to use for resampling the data set. + + - If ``str``, has to be one of: (i) ``'minority'``: resample the + minority class; (ii) ``'majority'``: resample the majority class, + (iii) ``'not minority'``: resample all classes apart from the minority + class, (iv) ``'all'``: resample all classes, and (v) ``'auto'``: + corresponds to ``'all'`` for over-sampling methods and ``'not + minority'`` for under-sampling methods. The classes targeted will be + over-sampled or under-sampled to achieve an equal number of samples + with the majority or minority class. + - If ``dict``, the keys correspond to the targeted classes. The values + correspond to the desired number of samples. + - If callable, function taking ``y`` and returning a ``dict``. The keys + correspond to the targeted classes. The values correspond to the + desired number of samples. random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by np.random. + If int, ``random_state`` is the seed used by the random number + generator; If ``RandomState`` instance, random_state is the random + number generator; If ``None``, the random number generator is the + ``RandomState`` instance used by ``np.random``. k : int, optional (default=None) Number of nearest neighbours to used to construct synthetic samples. - NOTE: `k` is deprecated from 0.2 and will be replaced in 0.4 - Use ``n_neighbors`` instead. + .. deprecated:: 0.2 + ``k`` is deprecated from 0.2 and will be replaced in 0.4 + Use ``n_neighbors`` instead. n_neighbors : int int or object, optional (default=5) - If int, number of nearest neighbours to used to construct - synthetic samples. - If object, an estimator that inherits from - `sklearn.neighbors.base.KNeighborsMixin` that will be used to find - the k_neighbors. + If ``int``, number of nearest neighbours to use to construct synthetic + samples. If object, an estimator that inherits from + :class:`sklearn.neighbors.base.KNeighborsMixin` that will be used to + find the k_neighbors. n_jobs : int, optional (default=1) Number of threads to run the algorithm when it is possible. - Attributes - ---------- - min_c_ : str or int - The identifier of the minority class. - - max_c_ : str or int - The identifier of the majority class. - - stats_c_ : dict of str/int : int - A dictionary in which the number of occurences of each class is - reported. - - X_shape_ : tuple of int - Shape of the data `X` during fitting. - Notes ----- - Does not support multi-class. + Supports multi-class resampling. The implementation is based on [1]_. @@ -78,7 +73,7 @@ class ADASYN(BaseBinarySampler): >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> from imblearn.over_sampling import \ - ADASYN # doctest: +NORMALIZE_WHITESPACE +ADASYN # doctest: +NORMALIZE_WHITESPACE >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ...
n_features=20, n_clusters_per_class=1, n_samples=1000, @@ -110,32 +105,15 @@ def __init__(self, self.n_neighbors = n_neighbors self.n_jobs = n_jobs - def fit(self, X, y): - """Find the classes statistics before to perform sampling. - - Parameters - ---------- - X : ndarray, shape (n_samples, n_features) - Matrix containing the data which have to be sampled. - - y : ndarray, shape (n_samples, ) - Corresponding label for each sample in X. - - Returns - ------- - self : object, - Return self. + def _validate_estimator(self): + """Create the necessary objects for ADASYN""" + # FIXME: Deprecated in 0.2. To be removed in 0.4. + deprecate_parameter(self, '0.2', 'k', 'n_neighbors') - """ - - super(ADASYN, self).fit(X, y) self.nn_ = check_neighbors_object('n_neighbors', self.n_neighbors, additional_neighbor=1) - # set the number of jobs self.nn_.set_params(**{'n_jobs': self.n_jobs}) - return self - def _sample(self, X, y): """Resample the dataset. @@ -156,65 +134,48 @@ def _sample(self, X, y): The corresponding label of `X_resampled` """ + self._validate_estimator() random_state = check_random_state(self.random_state) - # Keep the samples from the majority class X_resampled = X.copy() y_resampled = y.copy() - # Define the number of sample to create - # We handle only two classes problem for the moment. - if self.ratio == 'auto': - num_samples = ( - self.stats_c_[self.maj_c_] - self.stats_c_[self.min_c_]) - else: - num_samples = int((self.ratio * self.stats_c_[self.maj_c_]) - - self.stats_c_[self.min_c_]) - - # Start by separating minority class features and target values. - X_min = X[y == self.min_c_] - - # Print if verbose is true - self.logger.debug('Finding the %s nearest neighbours ...', - self.nn_.n_neighbors - 1) - - # Look for k-th nearest neighbours, excluding, of course, the - # point itself. - self.nn_.fit(X) - - # Get the distance to the NN - _, ind_nn = self.nn_.kneighbors(X_min) - - # Compute the ratio of majority samples next to minority samples - ratio_nn = (np.sum(y[ind_nn[:, 1:]] == self.maj_c_, axis=1) / - (self.nn_.n_neighbors - 1)) - # Check that we found at least some neighbours belonging to the - # majority class - if not np.sum(ratio_nn): - raise RuntimeError('Not any neigbours belong to the majority' - ' class. This case will induce a NaN case with' - ' a division by zero. ADASYN is not suited for' - ' this specific dataset. Use SMOTE.') - # Normalize the ratio - ratio_nn /= np.sum(ratio_nn) - - # Compute the number of sample to be generated - num_samples_nn = np.round(ratio_nn * num_samples).astype(int) - - # For each minority samples - for x_i, x_i_nn, num_sample_i in zip(X_min, ind_nn, num_samples_nn): - - # Pick-up the neighbors wanted - nn_zs = random_state.randint( - 1, high=self.nn_.n_neighbors, size=num_sample_i) - - # Create a new sample - for nn_z in nn_zs: - step = random_state.uniform() - x_gen = x_i + step * (X[x_i_nn[nn_z], :] - x_i) - X_resampled = np.vstack((X_resampled, x_gen)) - y_resampled = np.hstack((y_resampled, self.min_c_)) - - self.logger.info('Over-sampling performed: %s', Counter(y_resampled)) + for class_sample, n_samples in self.ratio_.items(): + if n_samples == 0: + continue + X_class = X[y == class_sample] + + self.nn_.fit(X) + _, nn_index = self.nn_.kneighbors(X_class) + # The ratio is computed using a one-vs-rest manner. Using majority + # in multi-class would lead to slightly different results at the + # cost of introducing a new parameter. 
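
For intuition, a minimal self-contained sketch of the one-vs-rest density estimate that the comment above describes; the toy data and the value of k are illustrative only, not taken from the patch:

    import numpy as np
    from sklearn.neighbors import NearestNeighbors

    X = np.array([[0., 0.], [0.1, 0.], [1., 1.], [1.1, 1.],
                  [1., 1.2], [0.9, 1.1]])
    y = np.array([0, 0, 1, 1, 1, 1])
    # k + 1 neighbours are requested because each minority sample is its
    # own first neighbour in the fitted data
    nn = NearestNeighbors(n_neighbors=3).fit(X)
    _, nn_index = nn.kneighbors(X[y == 0])
    # fraction of the k true neighbours not belonging to the class being
    # resampled; samples surrounded by other classes get more synthetic data
    ratio_nn = np.sum(y[nn_index[:, 1:]] != 0, axis=1) / 2
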
+ ratio_nn = (np.sum(y[nn_index[:, 1:]] != class_sample, axis=1) / + (self.nn_.n_neighbors - 1)) + if not np.sum(ratio_nn): + raise RuntimeError('No neighbours belong to the majority' + ' class. This case will induce a NaN case' + ' with a division by zero. ADASYN is not' + ' suited for this specific dataset.' + ' Use SMOTE instead.') + ratio_nn /= np.sum(ratio_nn) + n_samples_generate = np.rint(ratio_nn * n_samples).astype(int) + + x_class_gen = [] + for x_i, x_i_nn, num_sample_i in zip(X_class, nn_index, + n_samples_generate): + if num_sample_i == 0: + continue + nn_zs = random_state.randint( + 1, high=self.nn_.n_neighbors, size=num_sample_i) + steps = random_state.uniform(size=len(nn_zs)) + x_class_gen.append([x_i + step * (X[x_i_nn[nn_z], :] - x_i) + for step, nn_z in zip(steps, nn_zs)]) + + if len(x_class_gen) > 0: + X_resampled = np.vstack((X_resampled, + np.concatenate(x_class_gen))) + y_resampled = np.hstack((y_resampled, [class_sample] * + np.sum(n_samples_generate))) return X_resampled, y_resampled diff --git a/imblearn/over_sampling/base.py b/imblearn/over_sampling/base.py new file mode 100644 index 000000000..9c1f6d51b --- /dev/null +++ b/imblearn/over_sampling/base.py @@ -0,0 +1,18 @@ +""" +Base class for the over-sampling method. +""" +# Authors: Guillaume Lemaitre +# Christos Aridas +# License: MIT + +from ..base import BaseSampler + + +class BaseOverSampler(BaseSampler): + """Base class for over-sampling algorithms. + + Warning: This class should not be used directly. Use the derived classes + instead. + """ + + _sampling_type = 'over-sampling' diff --git a/imblearn/over_sampling/random_over_sampler.py b/imblearn/over_sampling/random_over_sampler.py index 0ba501397..1d6752813 100644 --- a/imblearn/over_sampling/random_over_sampler.py +++ b/imblearn/over_sampling/random_over_sampler.py @@ -3,18 +3,17 @@ # Authors: Guillaume Lemaitre # Christos Aridas # License: MIT - -from __future__ import division, print_function +from __future__ import division from collections import Counter import numpy as np from sklearn.utils import check_random_state -from ..base import BaseMulticlassSampler +from .base import BaseOverSampler -class RandomOverSampler(BaseMulticlassSampler): +class RandomOverSampler(BaseOverSampler): """Class to perform random over-sampling. Object to over-sample the minority class(es) by picking samples at random @@ -22,36 +21,32 @@ Parameters ---------- - ratio : str or float, optional (default='auto') - If 'auto', the ratio will be defined automatically to balance - the dataset. Otherwise, the ratio is defined as the number - of samples in the minority class over the the number of samples - in the majority class. + ratio : str, dict, or callable, optional (default='auto') + Ratio to use for resampling the data set. + + - If ``str``, has to be one of: (i) ``'minority'``: resample the + minority class; (ii) ``'majority'``: resample the majority class, + (iii) ``'not minority'``: resample all classes apart from the minority + class, (iv) ``'all'``: resample all classes, and (v) ``'auto'``: + corresponds to ``'all'`` for over-sampling methods and ``'not + minority'`` for under-sampling methods. The classes targeted will be + over-sampled or under-sampled to achieve an equal number of samples + with the majority or minority class. + - If ``dict``, the keys correspond to the targeted classes. The values + correspond to the desired number of samples. + - If callable, function taking ``y`` and returning a ``dict``. 
The keys + correspond to the targeted classes. The values correspond to the + desired number of samples. random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by np.random. - - Attributes - ---------- - min_c_ : str or int - The identifier of the minority class. - - max_c_ : str or int - The identifier of the majority class. - - stats_c_ : dict of str/int : int - A dictionary in which the number of occurences of each class is - reported. - - X_shape_ : tuple of int - Shape of the data `X` during fitting. + If int, ``random_state`` is the seed used by the random number + generator; If ``RandomState`` instance, random_state is the random + number generator; If ``None``, the random number generator is the + ``RandomState`` instance used by ``np.random``. Notes ----- - Supports multiple classes. + Supports multi-class resampling. Examples -------- >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> from imblearn.over_sampling import \ - RandomOverSampler # doctest: +NORMALIZE_WHITESPACE +RandomOverSampler # doctest: +NORMALIZE_WHITESPACE >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) @@ -73,7 +68,6 @@ class RandomOverSampler(BaseMulticlassSampler): """ def __init__(self, ratio='auto', random_state=None): - super(RandomOverSampler, self).__init__( ratio=ratio, random_state=random_state) @@ -97,38 +91,22 @@ def _sample(self, X, y): The corresponding label of `X_resampled` """ + random_state = check_random_state(self.random_state) + target_stats = Counter(y) - # Keep the samples from the majority class - X_resampled = X[y == self.maj_c_] - y_resampled = y[y == self.maj_c_] - - # Loop over the other classes over picking at random - for key in self.stats_c_.keys(): - - # If this is the majority class, skip it - if key == self.maj_c_: - continue - - # Define the number of sample to create - if self.ratio == 'auto': - num_samples = int(self.stats_c_[self.maj_c_] - self.stats_c_[ key]) - else: - num_samples = int((self.ratio * self.stats_c_[self.maj_c_]) - - self.stats_c_[key]) - - # Pick some elements at random - random_state = check_random_state(self.random_state) - indx = random_state.randint( low=0, high=self.stats_c_[key], size=num_samples) + X_resampled = X.copy() + y_resampled = y.copy() - # Concatenate to the majority class - X_resampled = np.concatenate( (X_resampled, X[y == key], X[y == key][indx]), axis=0) + for class_sample, num_samples in self.ratio_.items(): + index_samples = random_state.randint( + low=0, high=target_stats[class_sample], size=num_samples) - y_resampled = np.concatenate( (y_resampled, y[y == key], y[y == key][indx]), axis=0) + X_resampled = np.concatenate((X_resampled, + X[y == class_sample][index_samples]), + axis=0) - self.logger.info('Over-sampling performed: %s', Counter(y_resampled)) + y_resampled = np.concatenate((y_resampled, + y[y == class_sample][index_samples]), + axis=0) return X_resampled, y_resampled diff --git a/imblearn/over_sampling/smote.py b/imblearn/over_sampling/smote.py index 715a40f6c..84e360ffb 100644 --- a/imblearn/over_sampling/smote.py +++ 
b/imblearn/over_sampling/smote.py @@ -5,20 +5,22 @@ # Christos Aridas # License: MIT -from __future__ import division, print_function +from __future__ import division import numpy as np from sklearn.svm import SVC -from sklearn.utils import check_array, check_random_state +from sklearn.utils import check_random_state -from ..base import BaseBinarySampler -from ..utils import check_neighbors_object +from .base import BaseOverSampler from ..exceptions import raise_isinstance_error +from ..utils import check_neighbors_object +from ..utils.deprecation import deprecate_parameter + SMOTE_KIND = ('regular', 'borderline1', 'borderline2', 'svm') -class SMOTE(BaseBinarySampler): +class SMOTE(BaseOverSampler): """Class to perform over-sampling using SMOTE. This object is an implementation of SMOTE - Synthetic Minority @@ -27,81 +29,77 @@ class SMOTE(BaseBinarySampler): Parameters ---------- - ratio : str or float, optional (default='auto') - If 'auto', the ratio will be defined automatically to balance - the dataset. Otherwise, the ratio is defined as the number - of samples in the minority class over the the number of samples - in the majority class. + ratio : str, dict, or callable, optional (default='auto') + Ratio to use for resampling the data set. + + - If ``str``, has to be one of: (i) ``'minority'``: resample the + minority class; (ii) ``'majority'``: resample the majority class, + (iii) ``'not minority'``: resample all classes apart from the minority + class, (iv) ``'all'``: resample all classes, and (v) ``'auto'``: + corresponds to ``'all'`` for over-sampling methods and ``'not + minority'`` for under-sampling methods. The classes targeted will be + over-sampled or under-sampled to achieve an equal number of samples + with the majority or minority class. + - If ``dict``, the keys correspond to the targeted classes. The values + correspond to the desired number of samples. + - If callable, function taking ``y`` and returning a ``dict``. The keys + correspond to the targeted classes. The values correspond to the + desired number of samples. random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by np.random. + If int, ``random_state`` is the seed used by the random number + generator; If ``RandomState`` instance, random_state is the random + number generator; If ``None``, the random number generator is the + ``RandomState`` instance used by ``np.random``. k : int, optional (default=None) Number of nearest neighbours to used to construct synthetic samples. - NOTE: `k` is deprecated from 0.2 and will be replaced in 0.4 - Use ``k_neighbors`` instead. + .. deprecated:: 0.2 + ``k`` is deprecated from 0.2 and will be replaced in 0.4 + Use ``k_neighbors`` instead. k_neighbors : int or object, optional (default=5) - If int, number of nearest neighbours to used to construct - synthetic samples. - If object, an estimator that inherits from - `sklearn.neighbors.base.KNeighborsMixin` that will be used to find - the k_neighbors. + If ``int``, number of nearest neighbours to use to construct synthetic + samples. If object, an estimator that inherits from + :class:`sklearn.neighbors.base.KNeighborsMixin` that will be used to + find the k_neighbors. m : int, optional (default=None) Number of nearest neighbours to use to determine if a minority sample - is in danger.
Used with kind={'borderline1', 'borderline2', 'svm'}. + is in danger. Used with ``kind={'borderline1', 'borderline2', + 'svm'}``. - NOTE: `m` is deprecated from 0.2 and will be replaced in 0.4 - Use ``m_neighbors`` instead. + .. deprecated:: 0.2 + ``m`` is deprecated from 0.2 and will be replaced in 0.4 + Use ``m_neighbors`` instead. m_neighbors : int int or object, optional (default=10) If int, number of nearest neighbours to use to determine if a minority - sample is in danger. Used with kind={'borderline1', 'borderline2', - 'svm'}. - If object, an estimator that inherits from - `sklearn.neighbors.base.KNeighborsMixin` that will be used to find - the k_neighbors. + sample is in danger. Used with ``kind={'borderline1', 'borderline2', + 'svm'}``. If object, an estimator that inherits + from :class:`sklearn.neighbors.base.KNeighborsMixin` that will be used + to find the k_neighbors. out_step : float, optional (default=0.5) - Step size when extrapolating. Used with kind='svm'. + Step size when extrapolating. Used with ``kind='svm'``. kind : str, optional (default='regular') The type of SMOTE algorithm to use one of the following options: - 'regular', 'borderline1', 'borderline2', 'svm'. + ``'regular'``, ``'borderline1'``, ``'borderline2'``, ``'svm'``. svm_estimator : object, optional (default=SVC()) - If `kind='svm'`, a parametrized `sklearn.svm.SVC` classifier can - be passed. + If ``kind='svm'``, a parametrized :class:`sklearn.svm.SVC` + classifier can be passed. n_jobs : int, optional (default=1) The number of threads to open if possible. - Attributes - ---------- - min_c_ : str or int - The identifier of the minority class. - - max_c_ : str or int - The identifier of the majority class. - - stats_c_ : dict of str/int : int - A dictionary in which the number of occurences of each class is - reported. - - X_shape_ : tuple of int - Shape of the data `X` during fitting. - Notes ----- See the original papers: [1]_, [2]_, [3]_ for more details. - It does not support multiple classes automatically, but can be called - multiple times. + Supports multi-class resampling. Examples -------- >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> from imblearn.over_sampling import \ - SMOTE # doctest: +NORMALIZE_WHITESPACE +SMOTE # doctest: +NORMALIZE_WHITESPACE >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) @@ -157,7 +155,7 @@ def __init__(self, self.svm_estimator = svm_estimator self.n_jobs = n_jobs - def _in_danger_noise(self, samples, y, kind='danger'): + def _in_danger_noise(self, samples, target_class, y, kind='danger'): """Estimate if a set of sample are in danger or noise. Parameters @@ -165,6 +163,9 @@ def _in_danger_noise(self, samples, y, kind='danger'): samples : ndarray, shape (n_samples, n_features) The samples to check if either they are in danger or not. + target_class : int or str, + The target class being over-sampled. + y : ndarray, shape (n_samples, ) The true label in order to check the neighbour labels. @@ -180,21 +181,14 @@ def _in_danger_noise(self, samples, y, kind='danger'): A boolean array where True refer to samples in danger or noise. 
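
The danger/noise criterion implemented below can be restated compactly. A sketch of the thresholds (the helper name is hypothetical), where m counts the neighbours excluding the sample itself:

    import numpy as np

    def danger_noise_rule(n_maj, m):
        # 'danger': at least half, but not all, of the m neighbours belong
        # to another class; 'noise': all m neighbours do
        danger = np.bitwise_and(n_maj >= m / 2, n_maj < m)
        noise = n_maj == m
        return danger, noise

    danger, noise = danger_noise_rule(np.array([0, 2, 3, 4]), 4)
    # danger -> [False, True, True, False]; noise -> [False, False, False, True]
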
""" - - # Find the NN for each samples - # Exclude the sample itself x = self.nn_m_.kneighbors(samples, return_distance=False)[:, 1:] - - # Count how many NN belong to the minority class - # Find the class corresponding to the label in x - nn_label = (y[x] != self.min_c_).astype(int) - # Compute the number of majority samples in the NN + nn_label = (y[x] != target_class).astype(int) n_maj = np.sum(nn_label, axis=1) if kind == 'danger': # Samples are in danger for m/2 <= m' < m return np.bitwise_and( - n_maj >= float(self.nn_m_.n_neighbors - 1) / 2., + n_maj >= (self.nn_m_.n_neighbors - 1) / 2, n_maj < self.nn_m_.n_neighbors - 1) elif kind == 'noise': # Samples are noise for m = m' @@ -243,93 +237,45 @@ def _make_samples(self, Target values for synthetic samples. """ - - # Check the consistency of X - X = check_array(X) - # Check the random state random_state = check_random_state(self.random_state) - - # A matrix to store the synthetic samples X_new = np.zeros((n_samples, X.shape[1])) - - # # Set seeds - # seeds = random_state.randint(low=0, - # high=100 * len(nn_num.flatten()), - # size=n_samples) - - # Randomly pick samples to construct neighbours from samples = random_state.randint( low=0, high=len(nn_num.flatten()), size=n_samples) - - # Loop over the NN matrix and create new samples - for i, n in enumerate(samples): - # NN lines relate to original sample, columns to its - # nearest neighbours - row, col = divmod(n, nn_num.shape[1]) - - # Take a step of random size (0,1) in the direction of the - # n nearest neighbours - # if self.random_state is None: - # np.random.seed(seeds[i]) - # else: - # np.random.seed(self.random_state) - step = step_size * random_state.uniform() - - # Construct synthetic sample + steps = step_size * random_state.uniform(size=n_samples) + rows = np.floor_divide(samples, nn_num.shape[1]) + cols = np.mod(samples, nn_num.shape[1]) + for i, (sample, row, col, step) in enumerate(zip(samples, rows, + cols, steps)): X_new[i] = X[row] - step * (X[row] - nn_data[nn_num[row, col]]) - - # The returned target vector is simply a repetition of the - # minority label y_new = np.array([y_type] * len(X_new)) - self.logger.info('Generated %s new samples ...', len(X_new)) - return X_new, y_new def _validate_estimator(self): - # --- NN object - # Import the NN object from scikit-learn library. Since in the smote - # variations we must first find samples that are in danger, we - # initialize the NN object differently depending on the method chosen - if self.kind == 'regular': - # Regular smote does not look for samples in danger, instead it - # creates synthetic samples directly from the k-th nearest - # neighbours with not filtering - self.nn_k_ = check_neighbors_object('k_neighbors', - self.k_neighbors, - additional_neighbor=1) - # set the number of jobs - self.nn_k_.set_params(**{'n_jobs': self.n_jobs}) + """Create the necessary objects for SMOTE.""" - else: - # Borderline1, 2 and SVM variations of smote must first look for - # samples that could be considered noise and samples that live - # near the boundary between the classes. Therefore, before - # creating synthetic samples from the k-th nns, it first look - # for m nearest neighbors to decide whether or not a sample is - # noise or near the boundary. 
- self.nn_k_ = check_neighbors_object('k_neighbors', - self.k_neighbors, - additional_neighbor=1) - # set the number of jobs - self.nn_k_.set_params(**{'n_jobs': self.n_jobs}) + # FIXME Deprecated in 0.2, to be removed in 0.4 + deprecate_parameter(self, '0.2', 'k', 'k_neighbors') + deprecate_parameter(self, '0.2', 'm', 'm_neighbors') + if self.kind not in SMOTE_KIND: + raise ValueError('Unknown kind for SMOTE algorithm.' + ' Choices are {}. Got {} instead.'.format( + SMOTE_KIND, self.kind)) + + self.nn_k_ = check_neighbors_object('k_neighbors', + self.k_neighbors, + additional_neighbor=1) + self.nn_k_.set_params(**{'n_jobs': self.n_jobs}) + + if self.kind != 'regular': self.nn_m_ = check_neighbors_object('m_neighbors', self.m_neighbors, additional_neighbor=1) - # set the number of jobs self.nn_m_.set_params(**{'n_jobs': self.n_jobs}) - # --- SVM smote - # Unlike the borderline variations, the SVM variation uses the support - # vectors to decide which samples are in danger (near the boundary). - # Additionally it also introduces extrapolation for samples that are - # considered safe (far from boundary) and interpolation for samples - # in danger (near the boundary). The level of extrapolation is - # controled by the out_step. if self.kind == 'svm': if self.svm_estimator is None: - # Store SVM object with any parameters self.svm_estimator_ = SVC(random_state=self.random_state) elif isinstance(self.svm_estimator, SVC): self.svm_estimator_ = self.svm_estimator @@ -337,8 +283,10 @@ def _validate_estimator(self): raise_isinstance_error('svm_estimator', [SVC], self.svm_estimator) - def fit(self, X, y): - """Find the classes statistics before to perform sampling. + def _sample_regular(self, X, y): + """Resample the dataset using the regular SMOTE implementation. + + Use the regular SMOTE algorithm proposed in [1]_. Parameters ---------- @@ -350,19 +298,44 @@ def fit(self, X, y): Returns ------- - self : object, - Return self. + X_resampled : ndarray, shape (n_samples_new, n_features) + The array containing the resampled data. + + y_resampled : ndarray, shape (n_samples_new) + The corresponding label of `X_resampled`. + + References + ---------- + .. [1] N. V. Chawla, K. W. Bowyer, L. O.Hall, W. P. Kegelmeyer, "SMOTE: + synthetic minority over-sampling technique," Journal of artificial + intelligence research, 321-357, 2002. """ + X_resampled = X.copy() + y_resampled = y.copy() - super(SMOTE, self).fit(X, y) + for class_sample, n_samples in self.ratio_.items(): + if n_samples == 0: + continue + X_class = X[y == class_sample] - self._validate_estimator() + self.nn_k_.fit(X_class) + nns = self.nn_k_.kneighbors(X_class, return_distance=False)[:, 1:] + X_new, y_new = self._make_samples(X_class, class_sample, X_class, + nns, n_samples, 1.0) - return self + X_resampled = np.concatenate((X_resampled, X_new), axis=0) + y_resampled = np.concatenate((y_resampled, y_new), axis=0) - def _sample(self, X, y): - """Resample the dataset. + return X_resampled, y_resampled + + def _sample_borderline(self, X, y): + """Resample the dataset using the borderline SMOTE implementation. + + Use the borderline SMOTE algorithm proposed in [2]_. Two methods can be + used: (i) borderline-1 or (ii) borderline-2. A nearest-neighbours + algorithm is used to determine the samples forming the boundary, and + new samples are created next to those boundary samples depending on the + method chosen. Parameters ---------- @@ -378,225 +351,177 @@ def _sample(self, X, y): The array containing the resampled data. 
y_resampled : ndarray, shape (n_samples_new) - The corresponding label of `X_resampled` - - """ + The corresponding label of `X_resampled`. - if self.kind not in SMOTE_KIND: - raise ValueError('Unknown kind for SMOTE algorithm.' - ' Choices are {}. Got {} instead.'.format( - SMOTE_KIND, self.kind)) - - random_state = check_random_state(self.random_state) - - # Define the number of sample to create - # We handle only two classes problem for the moment. - if self.ratio == 'auto': - num_samples = ( - self.stats_c_[self.maj_c_] - self.stats_c_[self.min_c_]) - else: - num_samples = int((self.ratio * self.stats_c_[self.maj_c_]) - - self.stats_c_[self.min_c_]) - - # Start by separating minority class features and target values. - X_min = X[y == self.min_c_] - - # If regular SMOTE is to be performed - if self.kind == 'regular': - - self.logger.debug('Finding the %s nearest neighbours ...', - self.nn_k_.n_neighbors - 1) - - # Look for k-th nearest neighbours, excluding, of course, the - # point itself. - self.nn_k_.fit(X_min) - - # Matrix with k-th nearest neighbours indexes for each minority - # element. - nns = self.nn_k_.kneighbors(X_min, return_distance=False)[:, 1:] - - self.logger.debug('Create synthetic samples ...') - - # --- Generating synthetic samples - # Use static method make_samples to generate minority samples - X_new, y_new = self._make_samples(X_min, self.min_c_, X_min, nns, - num_samples, 1.0) - - # Concatenate the newly generated samples to the original data set - X_resampled = np.concatenate((X, X_new), axis=0) - y_resampled = np.concatenate((y, y_new), axis=0) - - return X_resampled, y_resampled + References + ---------- + .. [2] H. Han, W. Wen-Yuan, M. Bing-Huan, "Borderline-SMOTE: a new + over-sampling method in imbalanced data sets learning," Advances in + intelligent computing, 878-887, 2005. - if self.kind == 'borderline1' or self.kind == 'borderline2': + """ + X_resampled = X.copy() + y_resampled = y.copy() - self.logger.debug('Finding the %s nearest neighbours ...', - self.nn_m_.n_neighbors - 1) + for class_sample, n_samples in self.ratio_.items(): + if n_samples == 0: + continue + X_class = X[y == class_sample] - # Find the NNs for all samples in the data set. self.nn_m_.fit(X) - - # Boolean array with True for minority samples in danger - danger_index = self._in_danger_noise(X_min, y, kind='danger') - - # If all minority samples are safe, return the original data set. + danger_index = self._in_danger_noise(X_class, class_sample, y, + kind='danger') if not any(danger_index): - self.logger.debug('There are no samples in danger. No' - ' borderline synthetic samples created.') - - # All are safe, nothing to be done here. - return X, y + continue - # If we got here is because some samples are in danger, we need to - # find the NNs among the minority class to create the new synthetic - # samples. - # - # We start by changing the number of NNs to consider from m + 1 - # to k + 1 - self.nn_k_.fit(X_min) - - # nns...# + self.nn_k_.fit(X_class) nns = self.nn_k_.kneighbors( - X_min[danger_index], return_distance=False)[:, 1:] + X_class[danger_index], return_distance=False)[:, 1:] - # B1 and B2 types diverge here!!! + # divergence between borderline-1 and borderline-2 if self.kind == 'borderline1': # Create synthetic samples for borderline points. 
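
The two borderline variants diverge only in where the interpolation partners come from: borderline-1 steps from a danger sample towards same-class neighbours, while borderline-2 additionally takes a halved step towards neighbours from other classes. A toy sketch of the two update rules (all values invented):

    import numpy as np

    rng = np.random.RandomState(0)
    x_danger = np.array([0.5, 0.5])
    x_same = np.array([0.2, 0.4])     # neighbour from the same class
    x_other = np.array([0.9, 0.8])    # neighbour from another class
    # borderline-1: full random step towards a same-class neighbour
    new_b1 = x_danger - rng.uniform() * (x_danger - x_same)
    # borderline-2: additionally, a half step towards a foreign neighbour
    new_b2 = x_danger - 0.5 * rng.uniform() * (x_danger - x_other)
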
- X_new, y_new = self._make_samples( - X_min[danger_index], self.min_c_, X_min, nns, num_samples) - - # Concatenate the newly generated samples to the original - # dataset - X_resampled = np.concatenate((X, X_new), axis=0) - y_resampled = np.concatenate((y, y_new), axis=0) - - return X_resampled, y_resampled + X_new, y_new = self._make_samples(X_class[danger_index], + class_sample, X_class, + nns, n_samples) + X_resampled = np.concatenate((X_resampled, X_new), axis=0) + y_resampled = np.concatenate((y_resampled, y_new), axis=0) else: - # Split the number of synthetic samples between only minority - # (type 1), or minority and majority (with reduced step size) - # (type 2). - # The fraction is sampled from a beta distribution centered - # around 0.5 with variance ~0.01 + random_state = check_random_state(self.random_state) fractions = random_state.beta(10, 10) - # Only minority + # only minority X_new_1, y_new_1 = self._make_samples( - X_min[danger_index], - self.min_c_, - X_min, - nns, - int(fractions * (num_samples + 1)), - step_size=1.) - - # Only majority with smaller step size + X_class[danger_index], class_sample, X_class, nns, + int(fractions * (n_samples + 1)), step_size=1.) + + # we use a one-vs-rest policy to handle the multi-class case, in + # which new samples will be created considering not only the + # majority class but all other classes. X_new_2, y_new_2 = self._make_samples( - X_min[danger_index], - self.min_c_, - X[y != self.min_c_], - nns, - int((1 - fractions) * num_samples), - step_size=0.5) + X_class[danger_index], class_sample, X[y != class_sample], + nns, int((1 - fractions) * n_samples), step_size=0.5) # Concatenate the newly generated samples to the original # data set - X_resampled = np.concatenate((X, X_new_1, X_new_2), axis=0) - y_resampled = np.concatenate((y, y_new_1, y_new_2), axis=0) - - return X_resampled, y_resampled + X_resampled = np.concatenate((X_resampled, X_new_1, X_new_2), + axis=0) + y_resampled = np.concatenate((y_resampled, y_new_1, y_new_2), + axis=0) - if self.kind == 'svm': - # The SVM smote model fits a support vector machine - # classifier to the data and uses the support vector to - # provide a notion of boundary. Unlike regular smote, where - # such notion relies on proportion of nearest neighbours - # belonging to each class. + return X_resampled, y_resampled - # Fit SVM to the full data# - self.svm_estimator_.fit(X, y) + def _sample_svm(self, X, y): + """Resample the dataset using the SVM SMOTE implementation. - # Find the support vectors and their corresponding indexes - support_index = self.svm_estimator_.support_[y[ - self.svm_estimator_.support_] == self.min_c_] - support_vector = X[support_index] + Use the SVM SMOTE algorithm proposed in [3]_. An SVM classifier detects + support vectors to get a notion of the boundary. - # First, find the nn of all the samples to identify samples - # in danger and noisy ones - self.logger.debug('Finding the %s nearest neighbours ...', - self.nn_m_.n_neighbors - 1) + Parameters + ---------- + X : ndarray, shape (n_samples, n_features) + Matrix containing the data which have to be sampled. - # As usual, fit a nearest neighbour model to the data - self.nn_m_.fit(X) + y : ndarray, shape (n_samples, ) + Corresponding label for each sample in X. - # Now, get rid of noisy support vectors - noise_bool = self._in_danger_noise(support_vector, y, kind='noise') + Returns + ------- + X_resampled : ndarray, shape (n_samples_new, n_features) + The array containing the resampled data. 
- # Remove noisy support vectors - support_vector = support_vector[np.logical_not(noise_bool)] - danger_bool = self._in_danger_noise( - support_vector, y, kind='danger') - safety_bool = np.logical_not(danger_bool) + y_resampled : ndarray, shape (n_samples_new) + The corresponding label of `X_resampled`. - self.logger.debug('Out of %s support vectors, %s are noisy, ' - '%s are in danger ' - 'and %s are safe.', support_vector.shape[0], - noise_bool.sum().astype(int), - danger_bool.sum().astype(int), - safety_bool.sum().astype(int)) + References + ---------- + .. [3] H. M. Nguyen, E. W. Cooper, K. Kamei, "Borderline over-sampling + for imbalanced data classification," International Journal of + Knowledge Engineering and Soft Data Paradigms, 3(1), pp.4-21, 2011. - # Proceed to find support vectors NNs among the minority class - self.logger.debug('Finding the %s nearest neighbours ...', - self.nn_k_.n_neighbors - 1) + """ + random_state = check_random_state(self.random_state) + X_resampled = X.copy() + y_resampled = y.copy() - self.nn_k_.fit(X_min) + for class_sample, n_samples in self.ratio_.items(): + if n_samples == 0: + continue + X_class = X[y == class_sample] - self.logger.debug('Create synthetic samples ...') + self.svm_estimator_.fit(X, y) + support_index = self.svm_estimator_.support_[ + y[self.svm_estimator_.support_] == class_sample] + support_vector = X[support_index] - # Split the number of synthetic samples between interpolation and - # extrapolation + self.nn_m_.fit(X) + noise_bool = self._in_danger_noise(support_vector, class_sample, y, + kind='noise') + support_vector = support_vector[np.logical_not(noise_bool)] + danger_bool = self._in_danger_noise(support_vector, class_sample, + y, kind='danger') + safety_bool = np.logical_not(danger_bool) - # The fraction are sampled from a beta distribution with mean - # 0.5 and variance 0.01# + self.nn_k_.fit(X_class) fractions = random_state.beta(10, 10) - - # Interpolate samples in danger if np.count_nonzero(danger_bool) > 0: - nns = self.nn_k_.kneighbors( - support_vector[danger_bool], return_distance=False)[:, 1:] + nns = self.nn_k_.kneighbors(support_vector[danger_bool], + return_distance=False)[:, 1:] X_new_1, y_new_1 = self._make_samples( - support_vector[danger_bool], - self.min_c_, - X_min, - nns, - int(fractions * (num_samples + 1)), - step_size=1.) - - # Extrapolate safe samples + support_vector[danger_bool], class_sample, X_class, + nns, int(fractions * (n_samples + 1)), step_size=1.) 
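
Interpolation (danger) and extrapolation (safe) differ only in the sign of the step passed to `_make_samples`: with `step_size=-self.out_step` the generated point is pushed beyond the support vector, away from its neighbour, which is how the safe branch below enlarges the class region. A toy sketch (numbers invented, with 0.7 standing in for the uniform draw):

    import numpy as np

    sv = np.array([0.5, 0.5])          # a safe support vector
    neighbour = np.array([0.2, 0.4])   # same-class neighbour
    step = -0.5 * 0.7                  # step_size=-out_step times the draw
    x_new = sv - step * (sv - neighbour)
    # equals sv + 0.35 * (sv - neighbour): moves away from the neighbour
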
+ if np.count_nonzero(safety_bool) > 0: - nns = self.nn_k_.kneighbors( - support_vector[safety_bool], return_distance=False)[:, 1:] + nns = self.nn_k_.kneighbors(support_vector[safety_bool], + return_distance=False)[:, 1:] X_new_2, y_new_2 = self._make_samples( - support_vector[safety_bool], - self.min_c_, - X_min, - nns, - int((1 - fractions) * num_samples), + support_vector[safety_bool], class_sample, X_class, + nns, int((1 - fractions) * n_samples), step_size=-self.out_step) - # Concatenate the newly generated samples to the original data set if (np.count_nonzero(danger_bool) > 0 and np.count_nonzero(safety_bool) > 0): - X_resampled = np.concatenate((X, X_new_1, X_new_2), axis=0) - y_resampled = np.concatenate((y, y_new_1, y_new_2), axis=0) - # not any support vectors in danger + X_resampled = np.concatenate((X_resampled, X_new_1, X_new_2), + axis=0) + y_resampled = np.concatenate((y_resampled, y_new_1, y_new_2), + axis=0) elif np.count_nonzero(danger_bool) == 0: - X_resampled = np.concatenate((X, X_new_2), axis=0) - y_resampled = np.concatenate((y, y_new_2), axis=0) - # All the support vector in danger + X_resampled = np.concatenate((X_resampled, X_new_2), axis=0) + y_resampled = np.concatenate((y_resampled, y_new_2), axis=0) elif np.count_nonzero(safety_bool) == 0: - X_resampled = np.concatenate((X, X_new_1), axis=0) - y_resampled = np.concatenate((y, y_new_1), axis=0) + X_resampled = np.concatenate((X_resampled, X_new_1), axis=0) + y_resampled = np.concatenate((y_resampled, y_new_1), axis=0) + + return X_resampled, y_resampled + + def _sample(self, X, y): + """Resample the dataset. + + Parameters + ---------- + X : ndarray, shape (n_samples, n_features) + Matrix containing the data which have to be sampled. - return X_resampled, y_resampled + y : ndarray, shape (n_samples, ) + Corresponding label for each sample in X. + + Returns + ------- + X_resampled : ndarray, shape (n_samples_new, n_features) + The array containing the resampled data. 
+ + y_resampled : ndarray, shape (n_samples_new) + The corresponding label of `X_resampled` + + """ + self._validate_estimator() + + if self.kind == 'regular': + return self._sample_regular(X, y) + elif self.kind == 'borderline1' or self.kind == 'borderline2': + return self._sample_borderline(X, y) + elif self.kind == 'svm': + return self._sample_svm(X, y) diff --git a/imblearn/over_sampling/tests/test_adasyn.py b/imblearn/over_sampling/tests/test_adasyn.py index ab6c7932f..81fdd6261 100644 --- a/imblearn/over_sampling/tests/test_adasyn.py +++ b/imblearn/over_sampling/tests/test_adasyn.py @@ -1,4 +1,8 @@ """Test the module under sampler.""" +# Authors: Guillaume Lemaitre +# Christos Aridas +# License: MIT + from __future__ import print_function import numpy as np @@ -8,7 +12,6 @@ from imblearn.over_sampling import ADASYN -# Generate a global dataset to use RND_SEED = 0 X = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [1.25192108, -0.22367336], [0.53366841, -0.30312976], @@ -25,31 +28,20 @@ def test_ada_init(): - # Define a ratio ratio = 'auto' ada = ADASYN(ratio=ratio, random_state=RND_SEED) - assert_equal(ada.random_state, RND_SEED) def test_ada_fit(): - # Create the object ada = ADASYN(random_state=RND_SEED) - # Fit the data ada.fit(X, Y) - - # Check if the data information have been computed - assert_equal(ada.min_c_, 0) - assert_equal(ada.maj_c_, 1) - assert_equal(ada.stats_c_[0], 8) - assert_equal(ada.stats_c_[1], 12) + assert_equal(ada.ratio_, {0: 4, 1: 0}) def test_ada_fit_sample(): - # Resample the data ada = ADASYN(random_state=RND_SEED) X_resampled, y_resampled = ada.fit_sample(X, Y) - X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [-0.28162401, -2.10400981], @@ -70,11 +62,9 @@ def test_ada_fit_sample(): def test_ada_fit_sample_half(): - # Resample the data ratio = 0.8 ada = ADASYN(ratio=ratio, random_state=RND_SEED) X_resampled, y_resampled = ada.fit_sample(X, Y) - X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [-0.28162401, -2.10400981], @@ -92,11 +82,9 @@ def test_ada_fit_sample_half(): def test_ada_fit_sample_nn_obj(): - # Resample the data nn = NearestNeighbors(n_neighbors=6) ada = ADASYN(random_state=RND_SEED, n_neighbors=nn) X_resampled, y_resampled = ada.fit_sample(X, Y) - X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [-0.28162401, -2.10400981], @@ -117,7 +105,6 @@ def test_ada_fit_sample_nn_obj(): def test_ada_wrong_nn_obj(): - # Resample the data nn = 'rnd' ada = ADASYN(random_state=RND_SEED, n_neighbors=nn) assert_raises_regex(ValueError, "has to be one of", diff --git a/imblearn/over_sampling/tests/test_random_over_sampler.py b/imblearn/over_sampling/tests/test_random_over_sampler.py index d5cc566db..731389246 100644 --- a/imblearn/over_sampling/tests/test_random_over_sampler.py +++ b/imblearn/over_sampling/tests/test_random_over_sampler.py @@ -1,4 +1,8 @@ """Test the module under sampler.""" +# Authors: Guillaume Lemaitre +# Christos Aridas +# License: MIT + from __future__ import print_function from collections import Counter @@ -8,9 +12,7 @@ from imblearn.over_sampling import RandomOverSampler -# Generate a global dataset to use RND_SEED = 0 -# Data generated for the toy example X = np.array([[0.04352327, -0.20515826], [0.92923648, 0.76103773], 
[0.20792588, 1.49407907], [0.47104475, 0.44386323], [0.22950086, 0.33367433], [0.15490546, 0.3130677], @@ -20,59 +22,58 @@ def test_ros_init(): - # Define a ratio ratio = 'auto' ros = RandomOverSampler(ratio=ratio, random_state=RND_SEED) - assert_equal(ros.random_state, RND_SEED) def test_ros_fit_sample(): - """Test the fit sample routine""" - - # Resample the data ros = RandomOverSampler(random_state=RND_SEED) X_resampled, y_resampled = ros.fit_sample(X, Y) - - X_gt = np.array([[0.04352327, -0.20515826], [0.20792588, 1.49407907], - [0.22950086, 0.33367433], [0.15490546, 0.3130677], - [0.09125309, -0.85409574], [0.12372842, 0.6536186], - [0.094035, -2.55298982], [0.92923648, 0.76103773], - [0.47104475, 0.44386323], [0.13347175, 0.12167502], - [0.92923648, 0.76103773], [0.47104475, 0.44386323], - [0.92923648, 0.76103773], [0.47104475, 0.44386323]]) - y_gt = np.array([1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]) + X_gt = np.array([[0.04352327, -0.20515826], + [0.92923648, 0.76103773], + [0.20792588, 1.49407907], + [0.47104475, 0.44386323], + [0.22950086, 0.33367433], + [0.15490546, 0.3130677], + [0.09125309, -0.85409574], + [0.12372842, 0.6536186], + [0.13347175, 0.12167502], + [0.094035, -2.55298982], + [0.92923648, 0.76103773], + [0.47104475, 0.44386323], + [0.92923648, 0.76103773], + [0.47104475, 0.44386323]]) + y_gt = np.array([1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) def test_ros_fit_sample_half(): - # Resample the data ratio = 0.5 ros = RandomOverSampler(ratio=ratio, random_state=RND_SEED) X_resampled, y_resampled = ros.fit_sample(X, Y) - - X_gt = np.array([[0.04352327, -0.20515826], [0.20792588, 1.49407907], - [0.22950086, 0.33367433], [0.15490546, 0.3130677], - [0.09125309, -0.85409574], [0.12372842, 0.6536186], - [0.094035, -2.55298982], [0.92923648, 0.76103773], - [0.47104475, 0.44386323], [0.13347175, 0.12167502]]) - y_gt = np.array([1, 1, 1, 1, 1, 1, 1, 0, 0, 0]) + X_gt = np.array([[0.04352327, -0.20515826], + [0.92923648, 0.76103773], + [0.20792588, 1.49407907], + [0.47104475, 0.44386323], + [0.22950086, 0.33367433], + [0.15490546, 0.3130677], + [0.09125309, -0.85409574], + [0.12372842, 0.6536186], + [0.13347175, 0.12167502], + [0.094035, -2.55298982]]) + y_gt = np.array([1, 0, 1, 0, 1, 1, 1, 1, 0, 1]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) def test_multiclass_fit_sample(): - # Make y to be multiclass y = Y.copy() y[5] = 2 y[6] = 2 - - # Resample the data ros = RandomOverSampler(random_state=RND_SEED) X_resampled, y_resampled = ros.fit_sample(X, y) - - # Check the size of y count_y_res = Counter(y_resampled) assert_equal(count_y_res[0], 5) assert_equal(count_y_res[1], 5) diff --git a/imblearn/over_sampling/tests/test_smote.py b/imblearn/over_sampling/tests/test_smote.py index 77364c99e..3a72c7d37 100644 --- a/imblearn/over_sampling/tests/test_smote.py +++ b/imblearn/over_sampling/tests/test_smote.py @@ -1,4 +1,8 @@ """Test the module SMOTE.""" +# Authors: Guillaume Lemaitre +# Christos Aridas +# License: MIT + from __future__ import print_function import numpy as np @@ -9,7 +13,6 @@ from imblearn.over_sampling import SMOTE -# Generate a global dataset to use RND_SEED = 0 X = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [1.25192108, -0.22367336], [0.53366841, -0.30312976], @@ -33,14 +36,9 @@ def test_smote_wrong_kind(): def test_sample_regular(): - # Create the object kind = 'regular' smote = SMOTE(random_state=RND_SEED, kind=kind) - # Fit the data - 
smote.fit(X, Y) - X_resampled, y_resampled = smote.fit_sample(X, Y) - X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [-0.28162401, -2.10400981], @@ -61,15 +59,10 @@ def test_sample_regular(): def test_sample_regular_half(): - # Create the object ratio = 0.8 kind = 'regular' smote = SMOTE(ratio=ratio, random_state=RND_SEED, kind=kind) - # Fit the data - smote.fit(X, Y) - X_resampled, y_resampled = smote.fit_sample(X, Y) - X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [-0.28162401, -2.10400981], @@ -88,14 +81,9 @@ def test_sample_regular_half(): def test_sample_borderline1(): - # Create the object kind = 'borderline1' smote = SMOTE(random_state=RND_SEED, kind=kind) - # Fit the data - smote.fit(X, Y) - X_resampled, y_resampled = smote.fit_sample(X, Y) - X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [-0.28162401, -2.10400981], @@ -116,14 +104,9 @@ def test_sample_borderline1(): def test_sample_borderline2(): - # Create the object kind = 'borderline2' smote = SMOTE(random_state=RND_SEED, kind=kind) - # Fit the data - smote.fit(X, Y) - X_resampled, y_resampled = smote.fit_sample(X, Y) - X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [-0.28162401, -2.10400981], @@ -143,14 +126,9 @@ def test_sample_borderline2(): def test_sample_svm(): - # Create the object kind = 'svm' smote = SMOTE(random_state=RND_SEED, kind=kind) - # Fit the data - smote.fit(X, Y) - X_resampled, y_resampled = smote.fit_sample(X, Y) - X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [-0.28162401, -2.10400981], @@ -170,15 +148,12 @@ def test_sample_svm(): def test_fit_sample_nn_obj(): - # Create the object kind = 'borderline1' nn_m = NearestNeighbors(n_neighbors=11) nn_k = NearestNeighbors(n_neighbors=6) - smote = SMOTE( - random_state=RND_SEED, kind=kind, k_neighbors=nn_k, m_neighbors=nn_m) - + smote = SMOTE(random_state=RND_SEED, kind=kind, k_neighbors=nn_k, + m_neighbors=nn_m) X_resampled, y_resampled = smote.fit_sample(X, Y) - X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [-0.28162401, -2.10400981], @@ -199,13 +174,10 @@ def test_fit_sample_nn_obj(): def test_sample_regular_with_nn(): - # Create the object kind = 'regular' nn_k = NearestNeighbors(n_neighbors=6) smote = SMOTE(random_state=RND_SEED, kind=kind, k_neighbors=nn_k) - X_resampled, y_resampled = smote.fit_sample(X, Y) - X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [-0.28162401, -2.10400981], @@ -226,42 +198,33 @@ def test_sample_regular_with_nn(): def test_wrong_nn(): - # Create the object kind = 'borderline1' nn_m = 'rnd' nn_k = NearestNeighbors(n_neighbors=6) smote = SMOTE( random_state=RND_SEED, kind=kind, k_neighbors=nn_k, m_neighbors=nn_m) - assert_raises_regex(ValueError, "has to be one of", smote.fit_sample, X, Y) - nn_k = 'rnd' nn_m = NearestNeighbors(n_neighbors=10) smote = SMOTE( random_state=RND_SEED, kind=kind, k_neighbors=nn_k, 
m_neighbors=nn_m) - assert_raises_regex(ValueError, "has to be one of", smote.fit_sample, X, Y) - kind = 'regular' nn_k = 'rnd' smote = SMOTE(random_state=RND_SEED, kind=kind, k_neighbors=nn_k) - assert_raises_regex(ValueError, "has to be one of", smote.fit_sample, X, Y) def test_sample_regular_with_nn_svm(): - # Create the object kind = 'svm' nn_k = NearestNeighbors(n_neighbors=6) svm = SVC(random_state=RND_SEED) smote = SMOTE( random_state=RND_SEED, kind=kind, k_neighbors=nn_k, svm_estimator=svm) - X_resampled, y_resampled = smote.fit_sample(X, Y) - X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [-0.28162401, -2.10400981], @@ -281,7 +244,6 @@ def test_sample_regular_with_nn_svm(): def test_sample_regular_wrong_svm(): - # Create the object kind = 'svm' nn_k = NearestNeighbors(n_neighbors=6) svm = 'rnd' diff --git a/imblearn/tests/test_common.py b/imblearn/tests/test_common.py index 77667c874..7670a3f19 100644 --- a/imblearn/tests/test_common.py +++ b/imblearn/tests/test_common.py @@ -1,3 +1,8 @@ +"""Common tests""" +# Authors: Guillaume Lemaitre +# Christos Aridas +# License: MIT + from sklearn.utils.testing import assert_greater from sklearn.utils.testing import assert_false from sklearn.utils.testing import _named_check diff --git a/imblearn/tests/test_exceptions.py b/imblearn/tests/test_exceptions.py index 5aee51048..9ce907f47 100644 --- a/imblearn/tests/test_exceptions.py +++ b/imblearn/tests/test_exceptions.py @@ -1,3 +1,9 @@ +"""Test for the exceptions modules""" +# Authors: Guillaume Lemaitre +# Christos Aridas +# License: MIT + + from sklearn.utils.testing import assert_raises_regex from imblearn.exceptions import raise_isinstance_error diff --git a/imblearn/tests/test_pipeline.py b/imblearn/tests/test_pipeline.py index 3b3df4874..4a67e112b 100644 --- a/imblearn/tests/test_pipeline.py +++ b/imblearn/tests/test_pipeline.py @@ -1,6 +1,10 @@ """ Test the pipeline module. """ +# Authors: Guillaume Lemaitre +# Christos Aridas +# License: MIT + from tempfile import mkdtemp import shutil diff --git a/imblearn/under_sampling/__init__.py b/imblearn/under_sampling/__init__.py index c38534f96..f8bf577da 100644 --- a/imblearn/under_sampling/__init__.py +++ b/imblearn/under_sampling/__init__.py @@ -16,9 +16,14 @@ from .prototype_selection import AllKNN from .prototype_selection import InstanceHardnessThreshold -__all__ = [ - 'RandomUnderSampler', 'TomekLinks', 'ClusterCentroids', 'NearMiss', - 'CondensedNearestNeighbour', 'OneSidedSelection', - 'NeighbourhoodCleaningRule', 'EditedNearestNeighbours', - 'RepeatedEditedNearestNeighbours', 'AllKNN', 'InstanceHardnessThreshold' -] +__all__ = ['ClusterCentroids', + 'RandomUnderSampler', + 'InstanceHardnessThreshold', + 'NearMiss', + 'TomekLinks', + 'EditedNearestNeighbours', + 'RepeatedEditedNearestNeighbours', + 'AllKNN', + 'OneSidedSelection', + 'CondensedNearestNeighbour', + 'NeighbourhoodCleaningRule'] diff --git a/imblearn/under_sampling/base.py b/imblearn/under_sampling/base.py new file mode 100644 index 000000000..cb476c19a --- /dev/null +++ b/imblearn/under_sampling/base.py @@ -0,0 +1,25 @@ +""" +Base class for the under-sampling method. +""" +# Authors: Guillaume Lemaitre +# License: MIT + +from ..base import BaseSampler + + +class BaseUnderSampler(BaseSampler): + """Base class for under-sampling algorithms. + + Warning: This class should not be used directly. Use the derived classes + instead. 
+ """ + _sampling_type = 'under-sampling' + + +class BaseCleaningSampler(BaseSampler): + """Base class for under-sampling algorithms. + + Warning: This class should not be used directly. Use the derive classes + instead. + """ + _sampling_type = 'clean-sampling' diff --git a/imblearn/under_sampling/prototype_generation/cluster_centroids.py b/imblearn/under_sampling/prototype_generation/cluster_centroids.py index 62fc49767..42faf7437 100644 --- a/imblearn/under_sampling/prototype_generation/cluster_centroids.py +++ b/imblearn/under_sampling/prototype_generation/cluster_centroids.py @@ -8,15 +8,13 @@ from __future__ import division, print_function -from collections import Counter - import numpy as np from sklearn.cluster import KMeans -from ...base import BaseMulticlassSampler +from ..base import BaseUnderSampler -class ClusterCentroids(BaseMulticlassSampler): +class ClusterCentroids(BaseUnderSampler): """Perform under-sampling by generating centroids based on clustering methods. @@ -29,42 +27,38 @@ class ClusterCentroids(BaseMulticlassSampler): Parameters ---------- - ratio : str or float, optional (default='auto') - If 'auto', the ratio will be defined automatically to balance - the dataset. Otherwise, the ratio is defined as the number - of samples in the minority class over the the number of samples - in the majority class. + ratio : str, dict, or callable, optional (default='auto') + Ratio to use for resampling the data set. + + - If ``str``, has to be one of: (i) ``'minority'``: resample the + minority class; (ii) ``'majority'``: resample the majority class, + (iii) ``'not minority'``: resample all classes apart of the minority + class, (iv) ``'all'``: resample all classes, and (v) ``'auto'``: + correspond to ``'all'`` with for over-sampling methods and ``'not + minority'`` for under-sampling methods. The classes targeted will be + over-sampled or under-sampled to achieve an equal number of sample + with the majority or minority class. + - If ``dict``, the keys correspond to the targeted classes. The values + correspond to the desired number of samples. + - If callable, function taking ``y`` and returns a ``dict``. The keys + correspond to the targeted classes. The values correspond to the + desired number of samples. random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by np.random. + If int, ``random_state`` is the seed used by the random number + generator; If ``RandomState`` instance, random_state is the random + number generator; If ``None``, the random number generator is the + ``RandomState`` instance used by ``np.random``. estimator : object, optional(default=KMeans()) - Pass a `sklearn.cluster.KMeans` estimator. + Pass a :class:`sklearn.cluster.KMeans` estimator. n_jobs : int, optional (default=1) The number of threads to open if possible. - Attributes - ---------- - min_c_ : str or int - The identifier of the minority class. - - max_c_ : str or int - The identifier of the majority class. - - stats_c_ : dict of str/int : int - A dictionary in which the number of occurences of each class is - reported. - - X_shape_ : tuple of int - Shape of the data `X` during fitting. - Notes ----- - This class support multi-class. + Supports mutli-class resampling. 
Examples -------- @@ -72,7 +66,7 @@ class ClusterCentroids(BaseMulticlassSampler): >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> from imblearn.under_sampling import \ - ClusterCentroids # doctest: +NORMALIZE_WHITESPACE +ClusterCentroids # doctest: +NORMALIZE_WHITESPACE >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) @@ -96,8 +90,7 @@ def __init__(self, self.n_jobs = n_jobs def _validate_estimator(self): - """Private function to create the NN estimator""" - + """Private function to create the KMeans estimator""" if self.estimator is None: self.estimator_ = KMeans( random_state=self.random_state, n_jobs=self.n_jobs) @@ -107,30 +100,6 @@ def _validate_estimator(self): raise ValueError('`estimator` has to be a KMeans clustering.' ' Got {} instead.'.format(type(self.estimator))) - def fit(self, X, y): - """Find the classes statistics before to perform sampling. - - Parameters - ---------- - X : ndarray, shape (n_samples, n_features) - Matrix containing the data which have to be sampled. - - y : ndarray, shape (n_samples, ) - Corresponding label for each sample in X. - - Returns - ------- - self : object, - Return self. - - """ - - super(ClusterCentroids, self).fit(X, y) - - self._validate_estimator() - - return self - def _sample(self, X, y): """Resample the dataset. @@ -151,40 +120,27 @@ def _sample(self, X, y): The corresponding label of `X_resampled` """ + self._validate_estimator() - # Compute the number of cluster needed - if self.ratio == 'auto': - num_samples = self.stats_c_[self.min_c_] - else: - num_samples = int(self.stats_c_[self.min_c_] / self.ratio) - - # Set the number of sample for the estimator - self.estimator_.set_params(**{'n_clusters': num_samples}) - - # Start with the minority class - X_min = X[y == self.min_c_] - y_min = y[y == self.min_c_] - - # All the minority class samples will be preserved - X_resampled = X_min.copy() - y_resampled = y_min.copy() - - # Loop over the other classes under picking at random - for key in self.stats_c_.keys(): - - # If the minority class is up, skip it. 
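
The mechanism itself is plain k-means compression of each targeted class: fit KMeans with n_clusters equal to the desired number of samples and keep the centroids as the new, synthetic prototypes. A standalone sketch:

    import numpy as np
    from sklearn.cluster import KMeans

    rng = np.random.RandomState(0)
    X_major = rng.normal(size=(100, 2))   # an over-represented class
    km = KMeans(n_clusters=10, random_state=0).fit(X_major)
    X_reduced = km.cluster_centers_       # 10 centroids replace 100 samples
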
- if key == self.min_c_: - continue - - # Find the centroids via k-means - self.estimator_.fit(X[y == key]) - centroids = self.estimator_.cluster_centers_ - - # Concatenate to the minority class - X_resampled = np.concatenate((X_resampled, centroids), axis=0) - y_resampled = np.concatenate( - (y_resampled, np.array([key] * num_samples)), axis=0) - - self.logger.info('Under-sampling performed: %s', Counter(y_resampled)) + X_resampled = np.empty((0, X.shape[1]), dtype=X.dtype) + y_resampled = np.empty((0, ), dtype=y.dtype) + + for target_class in np.unique(y): + if target_class in self.ratio_.keys(): + n_samples = self.ratio_[target_class] + self.estimator_.set_params(**{'n_clusters': n_samples}) + self.estimator_.fit(X[y == target_class]) + centroids = self.estimator_.cluster_centers_ + + X_resampled = np.concatenate((X_resampled, centroids), axis=0) + y_resampled = np.concatenate( + (y_resampled, np.array([target_class] * n_samples)), + axis=0) + else: + + X_resampled = np.concatenate( + (X_resampled, X[y == target_class]), axis=0) + y_resampled = np.concatenate( + (y_resampled, y[y == target_class]), axis=0) return X_resampled, y_resampled diff --git a/imblearn/under_sampling/prototype_generation/tests/test_cluster_centroids.py b/imblearn/under_sampling/prototype_generation/tests/test_cluster_centroids.py index eede5be45..a19f8cbb9 100644 --- a/imblearn/under_sampling/prototype_generation/tests/test_cluster_centroids.py +++ b/imblearn/under_sampling/prototype_generation/tests/test_cluster_centroids.py @@ -10,9 +10,7 @@ from imblearn.under_sampling import ClusterCentroids -# Generate a global dataset to use RND_SEED = 0 -# Data generated for the toy example X = np.array([[0.04352327, -0.20515826], [0.92923648, 0.76103773], [0.20792588, 1.49407907], [0.47104475, 0.44386323], [0.22950086, 0.33367433], [0.15490546, 0.3130677], @@ -22,24 +20,10 @@ R_TOL = 1e-4 -def test_init(): - # Define a ratio - ratio = 1. 
- cc = ClusterCentroids(ratio=ratio, random_state=RND_SEED) - - assert_equal(cc.ratio, ratio) - - def test_fit_sample_auto(): - # Define the parameter for the under-sampling ratio = 'auto' - - # Create the object cc = ClusterCentroids(ratio=ratio, random_state=RND_SEED) - - # Fit and sample X_resampled, y_resampled = cc.fit_sample(X, Y) - X_gt = np.array([[0.92923648, 0.76103773], [0.47104475, 0.44386323], [0.13347175, 0.12167502], [0.06738818, -0.529627], [0.17901516, 0.69860992], [0.094035, -2.55298982]]) @@ -49,15 +33,9 @@ def test_fit_sample_auto(): def test_fit_sample_half(): - # Define the parameter for the under-sampling ratio = .5 - - # Create the object cc = ClusterCentroids(ratio=ratio, random_state=RND_SEED) - - # Fit and sample X_resampled, y_resampled = cc.fit_sample(X, Y) - X_gt = np.array([[0.92923648, 0.76103773], [0.47104475, 0.44386323], [0.13347175, 0.12167502], [0.09125309, -0.85409574], [0.19220316, 0.32337101], [0.094035, -2.55298982], @@ -69,16 +47,11 @@ def test_fit_sample_half(): def test_multiclass_fit_sample(): - # Make y to be multiclass y = Y.copy() y[5] = 2 y[6] = 2 - - # Resample the data cc = ClusterCentroids(random_state=RND_SEED) X_resampled, y_resampled = cc.fit_sample(X, y) - - # Check the size of y count_y_res = Counter(y_resampled) assert_equal(count_y_res[0], 2) assert_equal(count_y_res[1], 2) @@ -86,17 +59,12 @@ def test_multiclass_fit_sample(): def test_fit_sample_object(): - # Define the parameter for the under-sampling ratio = 'auto' - - # Create the object cluster = KMeans(random_state=RND_SEED) cc = ClusterCentroids( ratio=ratio, random_state=RND_SEED, estimator=cluster) - # Fit and sample X_resampled, y_resampled = cc.fit_sample(X, Y) - X_gt = np.array([[0.92923648, 0.76103773], [0.47104475, 0.44386323], [0.13347175, 0.12167502], [0.06738818, -0.529627], [0.17901516, 0.69860992], [0.094035, -2.55298982]]) @@ -106,14 +74,9 @@ def test_fit_sample_object(): def test_fit_sample_wrong_object(): - # Define the parameter for the under-sampling ratio = 'auto' - - # Create the object cluster = 'rnd' cc = ClusterCentroids( ratio=ratio, random_state=RND_SEED, estimator=cluster) - - # Fit and sample assert_raises_regex(ValueError, "has to be a KMeans clustering", cc.fit_sample, X, Y) diff --git a/imblearn/under_sampling/prototype_selection/__init__.py b/imblearn/under_sampling/prototype_selection/__init__.py index d4494b322..5fab3d708 100644 --- a/imblearn/under_sampling/prototype_selection/__init__.py +++ b/imblearn/under_sampling/prototype_selection/__init__.py @@ -14,9 +14,13 @@ from .edited_nearest_neighbours import AllKNN from .instance_hardness_threshold import InstanceHardnessThreshold -__all__ = [ - 'RandomUnderSampler', 'TomekLinks', 'NearMiss', - 'CondensedNearestNeighbour', 'OneSidedSelection', - 'NeighbourhoodCleaningRule', 'EditedNearestNeighbours', - 'RepeatedEditedNearestNeighbours', 'AllKNN', 'InstanceHardnessThreshold' -] +__all__ = ['RandomUnderSampler', + 'InstanceHardnessThreshold', + 'NearMiss', + 'TomekLinks', + 'EditedNearestNeighbours', + 'RepeatedEditedNearestNeighbours', + 'AllKNN', + 'OneSidedSelection', + 'CondensedNearestNeighbour', + 'NeighbourhoodCleaningRule'] diff --git a/imblearn/under_sampling/prototype_selection/condensed_nearest_neighbour.py b/imblearn/under_sampling/prototype_selection/condensed_nearest_neighbour.py index 6d1141807..cc862db9b 100644 --- a/imblearn/under_sampling/prototype_selection/condensed_nearest_neighbour.py +++ b/imblearn/under_sampling/prototype_selection/condensed_nearest_neighbour.py @@ -5,7 
+5,7 @@ # Christos Aridas # License: MIT -from __future__ import division, print_function +from __future__ import division from collections import Counter @@ -13,38 +13,57 @@ from sklearn.neighbors import KNeighborsClassifier from sklearn.utils import check_random_state -from ...base import BaseMulticlassSampler +from ..base import BaseCleaningSampler +from ...utils.deprecation import deprecate_parameter -class CondensedNearestNeighbour(BaseMulticlassSampler): +class CondensedNearestNeighbour(BaseCleaningSampler): """Class to perform under-sampling based on the condensed nearest neighbour method. Parameters ---------- + ratio : str, dict, or callable, optional (default='auto') + Ratio to use for resampling the data set. + + - If ``str``, has to be one of: (i) ``'minority'``: resample the + minority class; (ii) ``'majority'``: resample the majority class, + (iii) ``'not minority'``: resample all classes apart from the minority + class, (iv) ``'all'``: resample all classes, and (v) ``'auto'``: + corresponds to ``'all'`` for over-sampling methods and ``'not + minority'`` for under-sampling methods. The classes targeted will be + over-sampled or under-sampled to achieve an equal number of samples + as the majority or minority class. + - If ``dict``, the keys correspond to the targeted classes. The values + correspond to the desired number of samples. + - If callable, a function taking ``y`` and returning a ``dict``. The keys + correspond to the targeted classes. The values correspond to the + desired number of samples. + return_indices : bool, optional (default=False) Whether or not to return the indices of the samples randomly selected from the majority class. random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by np.random. + If int, ``random_state`` is the seed used by the random number + generator; If ``RandomState`` instance, random_state is the random + number generator; If ``None``, the random number generator is the + ``RandomState`` instance used by ``np.random``. size_ngh : int, optional (default=None) Size of the neighbourhood to consider to compute the average distance to the minority point samples. - NOTE: size_ngh is deprecated from 0.2 and will be replaced in 0.4 - Use ``n_neighbors`` instead. + .. deprecated:: 0.2 + ``size_ngh`` is deprecated from 0.2 and will be replaced in 0.4 + Use ``n_neighbors`` instead. - n_neighbors : int or object, optional (default= - KNeighborsClassifier(n_neighbors=1)) - If int, size of the neighbourhood to consider to compute the average - distance to the minority point samples. - If object, an object inherited from - `sklearn.neigbors.KNeighborsClassifier` should be passed. + n_neighbors : int or object, optional (default=\ +KNeighborsClassifier(n_neighbors=1)) + If ``int``, size of the neighbourhood to consider to compute the + average distance to the minority point samples. If object, an object + inherited from :class:`sklearn.neighbors.KNeighborsClassifier` should be + passed. n_seeds_S : int, optional (default=1) Number of samples to extract in order to build the set S. @@ -52,26 +71,11 @@ class CondensedNearestNeighbour(BaseMulticlassSampler): n_jobs : int, optional (default=1) The number of threads to open if possible. - Attributes - ---------- - min_c_ : str or int - The identifier of the minority class.
- - max_c_ : str or int - The identifier of the majority class. - - stats_c_ : dict of str/int : int - A dictionary in which the number of occurences of each class is - reported. - - X_shape_ : tuple of int - Shape of the data `X` during fitting. - Notes ----- The method is based on [1]_. - This class supports multi-class. + Supports multi-class resampling. Examples -------- @@ -79,7 +83,7 @@ class CondensedNearestNeighbour(BaseMulticlassSampler): >>> from collections import Counter #doctest: +SKIP >>> from sklearn.datasets import fetch_mldata #doctest: +SKIP >>> from imblearn.under_sampling import \ - CondensedNearestNeighbour #doctest: +SKIP +CondensedNearestNeighbour #doctest: +SKIP >>> pima = fetch_mldata('diabetes_scale') #doctest: +SKIP >>> X, y = pima['data'], pima['target'] #doctest: +SKIP >>> print('Original dataset shape {}'.format(Counter(y))) #doctest: +SKIP @@ -99,6 +103,7 @@ class CondensedNearestNeighbour(BaseMulticlassSampler): """ def __init__(self, + ratio='auto', return_indices=False, random_state=None, size_ngh=None, @@ -106,7 +111,7 @@ def __init__(self, n_seeds_S=1, n_jobs=1): super(CondensedNearestNeighbour, self).__init__( - random_state=random_state) + ratio=ratio, random_state=random_state) self.return_indices = return_indices self.size_ngh = size_ngh self.n_neighbors = n_neighbors @@ -115,6 +120,8 @@ def __init__(self, def _validate_estimator(self): """Private function to create the NN estimator""" + # FIXME: Deprecated in 0.2. To be removed in 0.4 + deprecate_parameter(self, '0.2', 'size_ngh', 'n_neighbors') if self.n_neighbors is None: self.estimator_ = KNeighborsClassifier( @@ -129,30 +136,6 @@ def _validate_estimator(self): ' inhereited from KNeighborsClassifier.' ' Got {} instead.'.format(type(self.n_neighbors))) - def fit(self, X, y): - """Find the classes statistics before to perform sampling. - - Parameters - ---------- - X : ndarray, shape (n_samples, n_features) - Matrix containing the data which have to be sampled. - - y : ndarray, shape (n_samples, ) - Corresponding label for each sample in X. - - Returns - ------- - self : object, - Return self. - - """ - - super(CondensedNearestNeighbour, self).fit(X, y) - - self._validate_estimator() - - return self - def _sample(self, X, y): """Resample the dataset. @@ -177,100 +160,100 @@ def _sample(self, X, y): containing the which samples have been selected.
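Before the rewritten ``_sample`` below, it may help to recall the condensed nearest-neighbour procedure itself. This is a simplified sketch with a hypothetical helper name, not the patch's code: seed the set C with all minority samples plus a random majority sample, then grow C with every majority sample that the current 1-NN rule misclassifies.

>>> import numpy as np
>>> from sklearn.neighbors import KNeighborsClassifier
>>> def condense_one_class(X_min, y_min, X_maj, y_maj, rng):
...     # hypothetical helper: return the indices of X_maj kept in C
...     knn = KNeighborsClassifier(n_neighbors=1)
...     keep = list(rng.randint(0, len(X_maj), size=1))  # seed sample
...     for i in range(len(X_maj)):
...         # re-fit on C at every step for clarity (the real code
...         # only re-fits when C grows, plus a speed-up heuristic)
...         knn.fit(np.vstack((X_min, X_maj[keep])),
...                 np.hstack((y_min, y_maj[keep])))
...         if knn.predict(X_maj[i:i + 1])[0] != y_maj[i]:
...             keep.append(i)  # misclassified: absorb it into C
...     return keep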
""" + self._validate_estimator() random_state = check_random_state(self.random_state) + target_stats = Counter(y) + class_minority = min(target_stats, key=target_stats.get) - # Start with the minority class - X_min = X[y == self.min_c_] - y_min = y[y == self.min_c_] - - # All the minority class samples will be preserved - X_resampled = X_min.copy() - y_resampled = y_min.copy() - - # If we need to offer support for the indices + X_resampled = np.empty((0, X.shape[1]), dtype=X.dtype) + y_resampled = np.empty((0, ), dtype=y.dtype) if self.return_indices: - idx_under = np.flatnonzero(y == self.min_c_) - - # Loop over the other classes under picking at random - for key in self.stats_c_.keys(): - - # If the minority class is up, skip it - if key == self.min_c_: - continue - - # Randomly get one sample from the majority class - # Generate the index to select - idx_maj_sample = random_state.randint( - low=0, high=self.stats_c_[key], size=self.n_seeds_S) - maj_sample = X[y == key][idx_maj_sample] - - # Create the set C - One majority samples and all minority - C_x = np.append(X_min, maj_sample, axis=0) - C_y = np.append(y_min, np.array([key] * self.n_seeds_S)) - - # Create the set S - all majority samples - S_x = X[y == key] - S_y = y[y == key] - - # Fit C into the knn - self.estimator_.fit(C_x, C_y) - - good_classif_label = idx_maj_sample.copy() - # Check each sample in S if we keep it or drop it - for idx_sam, (x_sam, y_sam) in enumerate(zip(S_x, S_y)): - - # Do not select sample which are already well classified - if idx_sam in good_classif_label: - continue - - # Classify on S - pred_y = self.estimator_.predict(x_sam.reshape(1, -1)) - - # If the prediction do not agree with the true label - # append it in C_x - if y_sam != pred_y: - # Keep the index for later - idx_maj_sample = np.append(idx_maj_sample, idx_sam) - - # Update C - C_x = np.append(X_min, X[y == key][idx_maj_sample], axis=0) - C_y = np.append(y_min, - np.array([key] * idx_maj_sample.size)) - - # Fit C into the knn - self.estimator_.fit(C_x, C_y) - - # This experimental to speed up the search - # Classify all the element in S and avoid to test the - # well classified elements - pred_S_y = self.estimator_.predict(S_x) - good_classif_label = np.unique( - np.append(idx_maj_sample, - np.flatnonzero(pred_S_y == S_y))) - - # Find the misclassified S_y - sel_x = S_x[idx_maj_sample, :] - sel_y = S_y[idx_maj_sample] - - # The indexes found are relative to the current class, we need to - # find the absolute value - # Build the array with the absolute position - abs_pos = np.flatnonzero(y == key) - idx_maj_sample = abs_pos[idx_maj_sample] - - # If we need to offer support for the indices selected - if self.return_indices: - idx_under = np.concatenate((idx_under, idx_maj_sample), axis=0) - - X_resampled = np.concatenate((X_resampled, sel_x), axis=0) - y_resampled = np.concatenate((y_resampled, sel_y), axis=0) - - self.logger.info('Under-sampling performed: %s', Counter(y_resampled)) + idx_under = np.empty((0, ), dtype=int) + + for target_class in np.unique(y): + if target_class in self.ratio_.keys(): + # Randomly get one sample from the majority class + # Generate the index to select + idx_maj_sample = random_state.randint( + low=0, high=target_stats[target_class], + size=self.n_seeds_S) + maj_sample = X[y == target_class][idx_maj_sample] + + # Create the set C - One majority samples and all minority + C_x = np.append(X[y == class_minority], maj_sample, axis=0) + C_y = np.append(y[y == class_minority], + np.array([target_class] * 
self.n_seeds_S)) + + # Create the set S - all majority samples + S_x = X[y == target_class] + S_y = y[y == target_class] + + # fit knn on C + self.estimator_.fit(C_x, C_y) + + good_classif_label = idx_maj_sample.copy() + # Check each sample in S if we keep it or drop it + for idx_sam, (x_sam, y_sam) in enumerate(zip(S_x, S_y)): + + # Do not select samples which are already well classified + if idx_sam in good_classif_label: + continue + + # Classify on S + pred_y = self.estimator_.predict(x_sam.reshape(1, -1)) + + # If the prediction does not agree with the true label, + # append it to C_x + if y_sam != pred_y: + # Keep the index for later + idx_maj_sample = np.append(idx_maj_sample, idx_sam) + + # Update C + C_x = np.append(X[y == class_minority], + X[y == target_class][idx_maj_sample], + axis=0) + C_y = np.append(y[y == class_minority], + np.array([target_class] * + idx_maj_sample.size)) + + # fit a knn on C + self.estimator_.fit(C_x, C_y) + + # This is experimental, to speed up the search: + # classify all the elements in S and avoid testing the + # well classified elements + pred_S_y = self.estimator_.predict(S_x) + good_classif_label = np.unique( + np.append(idx_maj_sample, + np.flatnonzero(pred_S_y == S_y))) + + # Find the misclassified S_y + sel_x = S_x[idx_maj_sample, :] + sel_y = S_y[idx_maj_sample] + + # The indexes found are relative to the current class; we need + # to find the absolute values. Build the array with the absolute + # positions + abs_pos = np.flatnonzero(y == target_class) + idx_maj_sample = abs_pos[idx_maj_sample] + + # If we need to offer support for the indices selected + if self.return_indices: + idx_under = np.concatenate((idx_under, idx_maj_sample), + axis=0) + X_resampled = np.concatenate((X_resampled, sel_x), axis=0) + y_resampled = np.concatenate((y_resampled, sel_y), axis=0) + else: + X_resampled = np.concatenate( + (X_resampled, X[y == target_class]), axis=0) + y_resampled = np.concatenate( + (y_resampled, y[y == target_class]), axis=0) + if self.return_indices: + idx_under = np.concatenate( + (idx_under, np.flatnonzero(y == target_class)), axis=0) - # Check if the indices of the samples selected should be returned too if self.return_indices: - # Return the indices of interest return X_resampled, y_resampled, idx_under else: return X_resampled, y_resampled diff --git a/imblearn/under_sampling/prototype_selection/edited_nearest_neighbours.py b/imblearn/under_sampling/prototype_selection/edited_nearest_neighbours.py index 0a56dab48..cc4b8f535 100644 --- a/imblearn/under_sampling/prototype_selection/edited_nearest_neighbours.py +++ b/imblearn/under_sampling/prototype_selection/edited_nearest_neighbours.py @@ -7,80 +7,84 @@ # Christos Aridas # License: MIT -from __future__ import division, print_function +from __future__ import division from collections import Counter import numpy as np from scipy.stats import mode -from ...base import BaseMulticlassSampler +from ..base import BaseCleaningSampler from ...utils import check_neighbors_object +from ...utils.deprecation import deprecate_parameter SEL_KIND = ('all', 'mode') -class EditedNearestNeighbours(BaseMulticlassSampler): +class EditedNearestNeighbours(BaseCleaningSampler): """Class to perform under-sampling based on the edited nearest neighbour method. Parameters ---------- + ratio : str, dict, or callable, optional (default='auto') + Ratio to use for resampling the data set.
+ + - If ``str``, has to be one of: (i) ``'minority'``: resample the + minority class; (ii) ``'majority'``: resample the majority class, + (iii) ``'not minority'``: resample all classes apart from the minority + class, (iv) ``'all'``: resample all classes, and (v) ``'auto'``: + corresponds to ``'all'`` for over-sampling methods and ``'not + minority'`` for under-sampling methods. The classes targeted will be + over-sampled or under-sampled to achieve an equal number of samples + as the majority or minority class. + - If ``dict``, the keys correspond to the targeted classes. The values + correspond to the desired number of samples. + - If callable, a function taking ``y`` and returning a ``dict``. The keys + correspond to the targeted classes. The values correspond to the + desired number of samples. + return_indices : bool, optional (default=False) Whether or not to return the indices of the samples randomly selected from the majority class. random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by np.random. + If int, ``random_state`` is the seed used by the random number + generator; If ``RandomState`` instance, random_state is the random + number generator; If ``None``, the random number generator is the + ``RandomState`` instance used by ``np.random``. size_ngh : int, optional (default=None) Size of the neighbourhood to consider to compute the average distance to the minority point samples. - NOTE: size_ngh is deprecated from 0.2 and will be replaced in 0.4 - Use ``n_neighbors`` instead. + .. deprecated:: 0.2 + ``size_ngh`` is deprecated from 0.2 and will be replaced in 0.4 + Use ``n_neighbors`` instead. n_neighbors : int or object, optional (default=3) - If object, size of the neighbourhood to consider to compute the average - distance to the minority point samples. - If object, an estimator that inherits from - `sklearn.neighbors.base.KNeighborsMixin` that will be used to find - the k_neighbors. + If ``int``, size of the neighbourhood to consider to compute the + average distance to the minority point samples. If object, an + estimator that inherits from + :class:`sklearn.neighbors.base.KNeighborsMixin` that will be used to + find the k_neighbors. kind_sel : str, optional (default='all') Strategy to use in order to exclude samples. - - If 'all', all neighbours will have to agree with the samples of - interest to not be excluded. - - If 'mode', the majority vote of the neighbours will be used in - order to exclude a sample. + - If ``'all'``, all neighbours will have to agree with the samples of + interest to not be excluded. + - If ``'mode'``, the majority vote of the neighbours will be used in + order to exclude a sample. n_jobs : int, optional (default=1) The number of threads to open if possible. - Attributes - ---------- - min_c_ : str or int - The identifier of the minority class. - - max_c_ : str or int - The identifier of the majority class. - - stats_c_ : dict of str/int : int - A dictionary in which the number of occurences of each class is - reported. - - X_shape_ : tuple of int - Shape of the data `X` during fitting. - Notes ----- The method is based on [1]_. - This class supports multi-class. + Supports multi-class resampling.
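The two ``kind_sel`` strategies only differ in how strict the neighbour agreement must be; a short usage sketch under the new API (``X`` and ``y`` are assumed to be an already loaded imbalanced dataset):

>>> from imblearn.under_sampling import EditedNearestNeighbours
>>> # 'all': a sample is kept only if every neighbour shares its label
>>> enn_all = EditedNearestNeighbours(kind_sel='all')
>>> # 'mode': a sample is kept if the majority vote of its neighbours agrees
>>> enn_mode = EditedNearestNeighbours(kind_sel='mode')
>>> X_res, y_res = enn_all.fit_sample(X, y)

Since any sample kept by ``'all'`` is also kept by ``'mode'``, the ``'all'`` strategy removes at least as many samples.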
Examples -------- @@ -108,6 +112,7 @@ class EditedNearestNeighbours(BaseMulticlassSampler): """ def __init__(self, + ratio='auto', return_indices=False, random_state=None, size_ngh=None, @@ -115,6 +120,7 @@ def __init__(self, kind_sel='all', n_jobs=1): super(EditedNearestNeighbours, self).__init__( + ratio=ratio, random_state=random_state) self.return_indices = return_indices self.size_ngh = size_ngh @@ -122,31 +128,17 @@ def __init__(self, self.kind_sel = kind_sel self.n_jobs = n_jobs - def fit(self, X, y): - """Find the classes statistics before to perform sampling. - - Parameters - ---------- - X : ndarray, shape (n_samples, n_features) - Matrix containing the data which have to be sampled. - - y : ndarray, shape (n_samples, ) - Corresponding label for each sample in X. - - Returns - ------- - self : object, - Return self. - - """ + def _validate_estimator(self): + """Validate the estimator created in the ENN.""" + # FIXME: Deprecated in 0.2. To be removed in 0.4 + deprecate_parameter(self, '0.2', 'size_ngh', 'n_neighbors') - super(EditedNearestNeighbours, self).fit(X, y) self.nn_ = check_neighbors_object('n_neighbors', self.n_neighbors, additional_neighbor=1) - # set the number of jobs self.nn_.set_params(**{'n_jobs': self.n_jobs}) - return self + if self.kind_sel not in SEL_KIND: + raise NotImplementedError def _sample(self, X, y): """Resample the dataset. @@ -172,145 +164,117 @@ def _sample(self, X, y): containing the which samples have been selected. """ + self._validate_estimator() - if self.kind_sel not in SEL_KIND: - raise NotImplementedError - - # Start with the minority class - X_min = X[y == self.min_c_] - y_min = y[y == self.min_c_] - - # All the minority class samples will be preserved - X_resampled = X_min.copy() - y_resampled = y_min.copy() - - # If we need to offer support for the indices + X_resampled = np.empty((0, X.shape[1]), dtype=X.dtype) + y_resampled = np.empty((0, ), dtype=y.dtype) if self.return_indices: - idx_under = np.flatnonzero(y == self.min_c_) + idx_under = np.empty((0, ), dtype=int) - # Fit the data self.nn_.fit(X) - # Loop over the other classes under picking at random - for key in self.stats_c_.keys(): - - # If the minority class is up, skip it - if key == self.min_c_: - continue - - # Get the sample of the current class - sub_samples_x = X[y == key] - sub_samples_y = y[y == key] - - # Find the NN for the current class - nnhood_idx = self.nn_.kneighbors( - sub_samples_x, return_distance=False)[:, 1:] - - # Get the label of the corresponding to the index - nnhood_label = y[nnhood_idx] - - # Check which one are the same label than the current class - # Make the majority vote - if self.kind_sel == 'mode': - nnhood_label, _ = mode(nnhood_label, axis=1) - nnhood_bool = (np.ravel(nnhood_label) == sub_samples_y) - elif self.kind_sel == 'all': - nnhood_label = (nnhood_label == key) - nnhood_bool = np.all(nnhood_label, axis=1) + for target_class in np.unique(y): + if target_class in self.ratio_.keys(): + X_class = X[y == target_class] + y_class = y[y == target_class] + nnhood_idx = self.nn_.kneighbors( + X_class, return_distance=False)[:, 1:] + nnhood_label = y[nnhood_idx] + if self.kind_sel == 'mode': + nnhood_label, _ = mode(nnhood_label, axis=1) + nnhood_bool = np.ravel(nnhood_label) == y_class + elif self.kind_sel == 'all': + nnhood_label = nnhood_label == target_class + nnhood_bool = np.all(nnhood_label, axis=1) + index_target_class = np.flatnonzero(nnhood_bool) else: - raise NotImplementedError - - # Get the samples which agree all together - sel_x = 
sub_samples_x[np.flatnonzero(nnhood_bool), :] - sel_y = sub_samples_y[np.flatnonzero(nnhood_bool)] - - # If we need to offer support for the indices selected + index_target_class = slice(None) + + X_resampled = np.concatenate( + (X_resampled, X[y == target_class][index_target_class]), + axis=0) + y_resampled = np.concatenate( + (y_resampled, y[y == target_class][index_target_class]), + axis=0) if self.return_indices: - idx_tmp = np.flatnonzero(y == key)[np.flatnonzero(nnhood_bool)] - idx_under = np.concatenate((idx_under, idx_tmp), axis=0) - - self.logger.debug('Shape of the selected feature: %s', sel_x.shape) - self.logger.debug('Shape of current features: %s', - X_resampled.shape) - - X_resampled = np.concatenate((X_resampled, sel_x), axis=0) - y_resampled = np.concatenate((y_resampled, sel_y), axis=0) - - self.logger.info('Under-sampling performed: %s', Counter(y_resampled)) + idx_under = np.concatenate( + (idx_under, np.flatnonzero(y == target_class)[ + index_target_class]), axis=0) - # Check if the indices of the samples selected should be returned too if self.return_indices: - # Return the indices of interest return X_resampled, y_resampled, idx_under else: return X_resampled, y_resampled -class RepeatedEditedNearestNeighbours(BaseMulticlassSampler): +class RepeatedEditedNearestNeighbours(BaseCleaningSampler): """Class to perform under-sampling based on the repeated edited nearest neighbour method. Parameters ---------- + ratio : str, dict, or callable, optional (default='auto') + Ratio to use for resampling the data set. + + - If ``str``, has to be one of: (i) ``'minority'``: resample the + minority class; (ii) ``'majority'``: resample the majority class, + (iii) ``'not minority'``: resample all classes apart from the minority + class, (iv) ``'all'``: resample all classes, and (v) ``'auto'``: + corresponds to ``'all'`` for over-sampling methods and ``'not + minority'`` for under-sampling methods. The classes targeted will be + over-sampled or under-sampled to achieve an equal number of samples + as the majority or minority class. + - If ``dict``, the keys correspond to the targeted classes. The values + correspond to the desired number of samples. + - If callable, a function taking ``y`` and returning a ``dict``. The keys + correspond to the targeted classes. The values correspond to the + desired number of samples. + return_indices : bool, optional (default=False) Whether or not to return the indices of the samples randomly selected from the majority class. random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by np.random. + If int, ``random_state`` is the seed used by the random number + generator; If ``RandomState`` instance, random_state is the random + number generator; If ``None``, the random number generator is the + ``RandomState`` instance used by ``np.random``. size_ngh : int, optional (default=None) Size of the neighbourhood to consider to compute the average distance to the minority point samples. - NOTE: size_ngh is deprecated from 0.2 and will be replaced in 0.4 - Use ``n_neighbors`` instead. + .. deprecated:: 0.2 + ``size_ngh`` is deprecated from 0.2 and will be replaced in 0.4 + Use ``n_neighbors`` instead.
n_neighbors : int or object, optional (default=3) - If int, size of the neighbourhood to consider to compute the average - distance to the minority point samples. - If object, an estimator that inherits from - `sklearn.neighbors.base.KNeighborsMixin` that will be used to find - the k_neighbors. + If ``int``, size of the neighbourhood to consider to compute the + average distance to the minority point samples. If object, an + estimator that inherits from + :class:`sklearn.neighbors.base.KNeighborsMixin` that will be used to + find the k_neighbors. + + max_iter : int, optional (default=100) + Maximum number of iterations of the edited nearest neighbours + algorithm for a single run. kind_sel : str, optional (default='all') Strategy to use in order to exclude samples. - - If 'all', all neighbours will have to agree with the samples of - interest to not be excluded. - - If 'mode', the majority vote of the neighbours will be used in - order to exclude a sample. + - If ``'all'``, all neighbours will have to agree with the samples of + interest to not be excluded. + - If ``'mode'``, the majority vote of the neighbours will be used in + order to exclude a sample. n_jobs : int, optional (default=-1) The number of thread to open when it is possible. - Attributes - ---------- - min_c_ : str or int - The identifier of the minority class. - - max_c_ : str or int - The identifier of the majority class. - - stats_c_ : dict of str/int : int - A dictionary in which the number of occurences of each class is - reported. - - max_iter : int, optional (default=100) - Maximum number of iterations of the edited nearest neighbours - algorithm for a single run. - - X_shape_ : tuple of int - Shape of the data `X` during fitting. - Notes ----- The method is based on [1]_. - This class supports multi-class. + Supports multi-class resampling. Examples -------- @@ -338,6 +302,7 @@ class RepeatedEditedNearestNeighbours(BaseMulticlassSampler): """ def __init__(self, + ratio='auto', return_indices=False, random_state=None, size_ngh=None, @@ -346,7 +311,7 @@ def __init__(self, kind_sel='all', n_jobs=-1): super(RepeatedEditedNearestNeighbours, self).__init__( - random_state=random_state) + ratio=ratio, random_state=random_state) self.return_indices = return_indices self.size_ngh = size_ngh self.n_neighbors = n_neighbors @@ -356,39 +321,19 @@ def __init__(self, def _validate_estimator(self): """Private function to create the NN estimator""" + if self.max_iter < 2: + raise ValueError('max_iter must be greater than 1.' + ' Got {} instead.'.format(self.max_iter)) - self.enn_ = EditedNearestNeighbours( - return_indices=self.return_indices, - random_state=self.random_state, - n_neighbors=self.n_neighbors, - kind_sel=self.kind_sel, - n_jobs=self.n_jobs) - - def fit(self, X, y): - """Find the classes statistics before to perform sampling. - - Parameters - ---------- - X : ndarray, shape (n_samples, n_features) - Matrix containing the data which have to be sampled. - - y : ndarray, shape (n_samples, ) - Corresponding label for each sample in X. - - Returns - ------- - self : object, - Return self.
- - """ - - super(RepeatedEditedNearestNeighbours, self).fit(X, y) - - self._validate_estimator() - - self.enn_.fit(X, y) + self.nn_ = check_neighbors_object('n_neighbors', self.n_neighbors, + additional_neighbor=1) - return self + self.enn_ = EditedNearestNeighbours(ratio=self.ratio, + return_indices=self.return_indices, + random_state=self.random_state, + n_neighbors=self.nn_, + kind_sel=self.kind_sel, + n_jobs=self.n_jobs) def _sample(self, X, y): """Resample the dataset. @@ -415,24 +360,18 @@ def _sample(self, X, y): """ - if self.kind_sel not in SEL_KIND: - raise NotImplementedError - - if self.max_iter < 2: - raise ValueError('max_iter must be greater than 1.' - ' Got {} instead.'.format(type(self.max_iter))) + self._validate_estimator() X_, y_ = X, y - if self.return_indices: idx_under = np.arange(X.shape[0], dtype=int) + target_stats = Counter(y) + class_minority = min(target_stats, key=target_stats.get) prev_len = y.shape[0] for n_iter in range(self.max_iter): - self.logger.debug('Apply ENN iteration #%s', n_iter + 1) - prev_len = y_.shape[0] if self.return_indices: X_enn, y_enn, idx_enn = self.enn_.fit_sample(X_, y_) @@ -450,113 +389,100 @@ def _sample(self, X, y): # Case 2 stats_enn = Counter(y_enn) - self.logger.debug('Current ENN stats: %s', stats_enn) - # Get the number of samples in the non-minority classes count_non_min = np.array([ val for val, key in zip(stats_enn.values(), stats_enn.keys()) - if key != self.min_c_ + if key != class_minority ]) - self.logger.debug('Number of samples in the non-majority' - ' classes: %s', count_non_min) - # Check the minority stop to be the minority - b_min_bec_maj = np.any(count_non_min < self.stats_c_[self.min_c_]) + b_min_bec_maj = np.any(count_non_min < + target_stats[class_minority]) # Case 3 - b_remove_maj_class = (len(stats_enn) < len(self.stats_c_)) + b_remove_maj_class = (len(stats_enn) < len(target_stats)) + + X_, y_, = X_enn, y_enn + if self.return_indices: + idx_under = idx_under[idx_enn] if b_conv or b_min_bec_maj or b_remove_maj_class: - # If this is a normal convergence, get the last data if b_conv: if self.return_indices: X_, y_, = X_enn, y_enn idx_under = idx_under[idx_enn] else: X_, y_, = X_enn, y_enn - # Log the variables to explain the stop of the algorithm - self.logger.debug('RENN converged: %s', b_conv) - self.logger.debug('RENN minority become majority: %s', - b_min_bec_maj) - self.logger.debug('RENN remove one class: %s', - b_remove_maj_class) break - # Update the data for the next iteration - X_, y_, = X_enn, y_enn - if self.return_indices: - idx_under = idx_under[idx_enn] - - self.logger.info('Under-sampling performed: %s', Counter(y_)) - X_resampled, y_resampled = X_, y_ - # Check if the indices of the samples selected should be returned too if self.return_indices: - # Return the indices of interest return X_resampled, y_resampled, idx_under else: return X_resampled, y_resampled -class AllKNN(BaseMulticlassSampler): +class AllKNN(BaseCleaningSampler): """Class to perform under-sampling based on the AllKNN method. Parameters ---------- + ratio : str, dict, or callable, optional (default='auto') + Ratio to use for resampling the data set. 
+ + - If ``str``, has to be one of: (i) ``'minority'``: resample the + minority class; (ii) ``'majority'``: resample the majority class, + (iii) ``'not minority'``: resample all classes apart from the minority + class, (iv) ``'all'``: resample all classes, and (v) ``'auto'``: + corresponds to ``'all'`` for over-sampling methods and ``'not + minority'`` for under-sampling methods. The classes targeted will be + over-sampled or under-sampled to achieve an equal number of samples + as the majority or minority class. + - If ``dict``, the keys correspond to the targeted classes. The values + correspond to the desired number of samples. + - If callable, a function taking ``y`` and returning a ``dict``. The keys + correspond to the targeted classes. The values correspond to the + desired number of samples. + return_indices : bool, optional (default=False) Whether or not to return the indices of the samples randomly selected from the majority class. random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by np.random. + If int, ``random_state`` is the seed used by the random number + generator; If ``RandomState`` instance, random_state is the random + number generator; If ``None``, the random number generator is the + ``RandomState`` instance used by ``np.random``. size_ngh : int, optional (default=None) Size of the neighbourhood to consider to compute the average distance to the minority point samples. - NOTE: size_ngh is deprecated from 0.2 and will be replaced in 0.4 - Use ``n_neighbors`` instead. + .. deprecated:: 0.2 + ``size_ngh`` is deprecated from 0.2 and will be replaced in 0.4 + Use ``n_neighbors`` instead. n_neighbors : int or object, optional (default=3) - If int, size of the neighbourhood to consider to compute the average - distance to the minority point samples. - If object, an estimator that inherits from - `sklearn.neighbors.base.KNeighborsMixin` that will be used to find - the k_neighbors. + If ``int``, size of the neighbourhood to consider to compute the + average distance to the minority point samples. If object, an + estimator that inherits from + :class:`sklearn.neighbors.base.KNeighborsMixin` that will be used to + find the k_neighbors. kind_sel : str, optional (default='all') Strategy to use in order to exclude samples. - - If 'all', all neighbours will have to agree with the samples of - interest to not be excluded. - - If 'mode', the majority vote of the neighbours will be used in - order to exclude a sample. + - If ``'all'``, all neighbours will have to agree with the samples of + interest to not be excluded. + - If ``'mode'``, the majority vote of the neighbours will be used in + order to exclude a sample. n_jobs : int, optional (default=-1) The number of thread to open when it is possible. - Attributes - ---------- - min_c_ : str or int - The identifier of the minority class. - - max_c_ : str or int - The identifier of the majority class. - - stats_c_ : dict of str/int : int - A dictionary in which the number of occurences of each class is - reported. - - X_shape_ : tuple of int - Shape of the data `X` during fitting. - Notes ----- The method is based on [1]_. - This class supports multi-class. + Supports multi-class resampling.
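AllKNN chains ENN passes with a neighbourhood growing from 1 up to ``n_neighbors``, stopping early when the minority class would stop being the minority or a class would disappear. Ignoring those stopping checks, the loop is roughly equivalent to the following sketch (illustrative only; ``X`` and ``y`` assumed defined):

>>> from imblearn.under_sampling import EditedNearestNeighbours
>>> X_cur, y_cur = X, y
>>> for k in range(1, 4):  # 1..n_neighbors with the default n_neighbors=3
...     enn = EditedNearestNeighbours(n_neighbors=k)
...     X_cur, y_cur = enn.fit_sample(X_cur, y_cur)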
Examples -------- @@ -564,7 +490,7 @@ class AllKNN(BaseMulticlassSampler): >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> from imblearn.under_sampling import \ - AllKNN # doctest: +NORMALIZE_WHITESPACE +AllKNN # doctest: +NORMALIZE_WHITESPACE >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) @@ -584,13 +510,14 @@ class AllKNN(BaseMulticlassSampler): """ def __init__(self, + ratio='auto', return_indices=False, random_state=None, size_ngh=None, n_neighbors=3, kind_sel='all', n_jobs=-1): - super(AllKNN, self).__init__(random_state=random_state) + super(AllKNN, self).__init__(ratio=ratio, random_state=random_state) self.return_indices = return_indices self.size_ngh = size_ngh self.n_neighbors = n_neighbors @@ -598,39 +525,19 @@ def __init__(self, self.n_jobs = n_jobs def _validate_estimator(self): - """Private function to create the NN estimator""" - - self.enn_ = EditedNearestNeighbours( - return_indices=self.return_indices, - random_state=self.random_state, - n_neighbors=self.n_neighbors, - kind_sel=self.kind_sel, - n_jobs=self.n_jobs) - - def fit(self, X, y): - """Find the classes statistics before to perform sampling. - - Parameters - ---------- - X : ndarray, shape (n_samples, n_features) - Matrix containing the data which have to be sampled. - - y : ndarray, shape (n_samples, ) - Corresponding label for each sample in X. - - Returns - ------- - self : object, - Return self. - - """ - super(AllKNN, self).fit(X, y) - - self._validate_estimator() + """Create objects required by AllKNN""" + if self.kind_sel not in SEL_KIND: + raise NotImplementedError - self.enn_.fit(X, y) + self.nn_ = check_neighbors_object('n_neighbors', self.n_neighbors, + additional_neighbor=1) - return self + self.enn_ = EditedNearestNeighbours(ratio=self.ratio, + return_indices=self.return_indices, + random_state=self.random_state, + n_neighbors=self.nn_, + kind_sel=self.kind_sel, + n_jobs=self.n_jobs) def _sample(self, X, y): """Resample the dataset. @@ -656,18 +563,16 @@ def _sample(self, X, y): containing the which samples have been selected. """ - - if self.kind_sel not in SEL_KIND: - raise NotImplementedError + self._validate_estimator() X_, y_ = X, y + target_stats = Counter(y) + class_minority = min(target_stats, key=target_stats.get) if self.return_indices: idx_under = np.arange(X.shape[0], dtype=int) - for curr_size_ngh in range(1, self.enn_.nn_.n_neighbors): - self.logger.debug('Apply ENN n_neighbors #%s', curr_size_ngh) - # updating ENN size_ngh + for curr_size_ngh in range(1, self.nn_.n_neighbors): self.enn_.n_neighbors = curr_size_ngh if self.return_indices: @@ -681,40 +586,26 @@ def _sample(self, X, y): # 2. 
If one of the class is disappearing # Case 1 stats_enn = Counter(y_enn) - self.logger.debug('Current ENN stats: %s', stats_enn) - # Get the number of samples in the non-minority classes count_non_min = np.array([ val for val, key in zip(stats_enn.values(), stats_enn.keys()) - if key != self.min_c_ + if key != class_minority ]) - self.logger.debug('Number of samples in the non-majority' - ' classes: %s', count_non_min) - # Check the minority stop to be the minority - b_min_bec_maj = np.any(count_non_min < self.stats_c_[self.min_c_]) + b_min_bec_maj = np.any(count_non_min < + target_stats[class_minority]) # Case 2 - b_remove_maj_class = (len(stats_enn) < len(self.stats_c_)) - - if b_min_bec_maj or b_remove_maj_class: - # Log the variables to explain the stop of the algorithm - self.logger.debug('AllKNN minority become majority: %s', - b_min_bec_maj) - self.logger.debug('AllKNN remove one class: %s', - b_remove_maj_class) - break + b_remove_maj_class = (len(stats_enn) < len(target_stats)) - # Update the data for the next iteration X_, y_, = X_enn, y_enn if self.return_indices: idx_under = idx_under[idx_enn] - self.logger.info('Under-sampling performed: %s', Counter(y_)) + if b_min_bec_maj or b_remove_maj_class: + break X_resampled, y_resampled = X_, y_ - # Check if the indices of the samples selected should be returned too if self.return_indices: - # Return the indices of interest return X_resampled, y_resampled, idx_under else: return X_resampled, y_resampled diff --git a/imblearn/under_sampling/prototype_selection/instance_hardness_threshold.py b/imblearn/under_sampling/prototype_selection/instance_hardness_threshold.py index f3f50b2c2..1b4f1c208 100644 --- a/imblearn/under_sampling/prototype_selection/instance_hardness_threshold.py +++ b/imblearn/under_sampling/prototype_selection/instance_hardness_threshold.py @@ -6,7 +6,7 @@ # Christos Aridas # License: MIT -from __future__ import division, print_function +from __future__ import division import warnings from collections import Counter @@ -17,7 +17,7 @@ from sklearn.ensemble import RandomForestClassifier from sklearn.externals.six import string_types -from ...base import BaseBinarySampler +from ..base import BaseCleaningSampler def _get_cv_splits(X, y, cv, random_state): @@ -33,39 +33,52 @@ def _get_cv_splits(X, y, cv, random_state): return cv_iterator -class InstanceHardnessThreshold(BaseBinarySampler): +class InstanceHardnessThreshold(BaseCleaningSampler): """Class to perform under-sampling based on the instance hardness threshold. Parameters ---------- estimator : object, optional (default=RandomForestClassifier()) - Classifier to be used to estimate instance hardness of the samples. - By default a RandomForestClassifer will be used. - If str, the choices using a string are the following: 'knn', - 'decision-tree', 'random-forest', 'adaboost', 'gradient-boosting' - and 'linear-svm'. - If object, an estimator inherited from `sklearn.base.ClassifierMixin` - and having an attribute `predict_proba`. - - NOTE: `estimator` as a string object is deprecated from 0.2 and will be - replaced in 0.4. Use `ClassifierMixin` object instead. - - ratio : str or float, optional (default='auto') - If 'auto', the ratio will be defined automatically to balance - the dataset. Otherwise, the ratio is defined as the number - of samples in the minority class over the the number of samples - in the majority class. + Classifier to be used to estimate instance hardness of the samples. By + default a :class:`sklearn.ensemble.RandomForestClassifier` will be used.
+ If ``str``, the choices using a string are the following: ``'knn'``, + ``'decision-tree'``, ``'random-forest'``, ``'adaboost'``, + ``'gradient-boosting'`` and ``'linear-svm'``. If object, an estimator + inherited from :class:`sklearn.base.ClassifierMixin` and having an + attribute :func:`predict_proba`. + + .. deprecated:: 0.2 + ``estimator`` as a string object is deprecated from 0.2 and will be + replaced in 0.4. Use :class:`sklearn.base.ClassifierMixin` object + instead. + + ratio : str, dict, or callable, optional (default='auto') + Ratio to use for resampling the data set. + + - If ``str``, has to be one of: (i) ``'minority'``: resample the + minority class; (ii) ``'majority'``: resample the majority class, + (iii) ``'not minority'``: resample all classes apart from the minority + class, (iv) ``'all'``: resample all classes, and (v) ``'auto'``: + corresponds to ``'all'`` for over-sampling methods and ``'not + minority'`` for under-sampling methods. The classes targeted will be + over-sampled or under-sampled to achieve an equal number of samples + as the majority or minority class. + - If ``dict``, the keys correspond to the targeted classes. The values + correspond to the desired number of samples. + - If callable, a function taking ``y`` and returning a ``dict``. The keys + correspond to the targeted classes. The values correspond to the + desired number of samples. return_indices : bool, optional (default=False) Whether or not to return the indices of the samples randomly selected from the majority class. random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by np.random. + If int, ``random_state`` is the seed used by the random number + generator; If ``RandomState`` instance, random_state is the random + number generator; If ``None``, the random number generator is the + ``RandomState`` instance used by ``np.random``. cv : int, optional (default=5) Number of folds to be used when estimating samples' instance hardness. @@ -76,33 +89,16 @@ class InstanceHardnessThreshold(BaseBinarySampler): **kwargs: Option for the different classifier. - NOTE: `**kwargs` has been deprecated from 0.2 and will be replaced in - 0.4. Use `ClassifierMixin` object instead to pass parameter associated - to an estimator. - - Attributes - ---------- - min_c_ : str or int - The identifier of the minority class. - - max_c_ : str or int - The identifier of the majority class. - - stats_c_ : dict of str/int : int - A dictionary in which the number of occurences of each class is - reported. - - cv : int, optional (default=5) - Number of folds used when estimating samples' instance hardness. - - X_shape_ : tuple of int - Shape of the data `X` during fitting. + .. deprecated:: 0.2 + ``**kwargs`` has been deprecated from 0.2 and will be replaced in + 0.4. Use :class:`sklearn.base.ClassifierMixin` object instead to + pass parameters associated with an estimator. Notes ----- The method is based on [1]_. - This class does not support multi-class. + Supports multi-class resampling. Examples -------- @@ -191,30 +187,6 @@ def _validate_estimator(self): raise ValueError('Invalid parameter `estimator`. Got {}.'.format( type(self.estimator))) - def fit(self, X, y): - """Find the classes statistics before to perform sampling.
- - Parameters - ---------- - X : ndarray, shape (n_samples, n_features) - Matrix containing the data which have to be sampled. - - y : ndarray, shape (n_samples, ) - Corresponding label for each sample in X. - - Returns - ------- - self : object, - Return self. - - """ - - super(InstanceHardnessThreshold, self).fit(X, y) - - self._validate_estimator() - - return self - def _sample(self, X, y): """Resample the dataset. @@ -239,10 +211,10 @@ def _sample(self, X, y): containing the which samples have been selected. """ + self._validate_estimator() - # Create the different folds + target_stats = Counter(y) skf = _get_cv_splits(X, y, self.cv, self.random_state) - probabilities = np.zeros(y.shape[0], dtype=float) for train_index, test_index in skf: @@ -258,28 +230,34 @@ def _sample(self, X, y): for l, c in enumerate(y_test) ] - # Compute the number of cluster needed - if self.ratio == 'auto': - num_samples = self.stats_c_[self.min_c_] - else: - num_samples = int(self.stats_c_[self.min_c_] / self.ratio) - - # Find the percentile corresponding to the top num_samples - threshold = np.percentile( - probabilities[y != self.min_c_], - (1. - (num_samples / self.stats_c_[self.maj_c_])) * 100.) - - mask = np.logical_or(probabilities >= threshold, y == self.min_c_) - - # Sample the data - X_resampled = X[mask] - y_resampled = y[mask] - - self.logger.info('Under-sampling performed: %s', Counter(y_resampled)) + X_resampled = np.empty((0, X.shape[1]), dtype=X.dtype) + y_resampled = np.empty((0, ), dtype=y.dtype) + if self.return_indices: + idx_under = np.empty((0, ), dtype=int) + + for target_class in np.unique(y): + if target_class in self.ratio_.keys(): + n_samples = self.ratio_[target_class] + threshold = np.percentile( + probabilities[y == target_class], + (1. - (n_samples / target_stats[target_class])) * 100.) + index_target_class = np.flatnonzero( + probabilities[y == target_class] >= threshold) + else: + index_target_class = slice(None) + + X_resampled = np.concatenate( + (X_resampled, X[y == target_class][index_target_class]), + axis=0) + y_resampled = np.concatenate( + (y_resampled, y[y == target_class][index_target_class]), + axis=0) + if self.return_indices: + idx_under = np.concatenate( + (idx_under, np.flatnonzero(y == target_class)[ + index_target_class]), axis=0) - # If we need to offer support for the indices if self.return_indices: - idx_under = np.flatnonzero(mask) return X_resampled, y_resampled, idx_under else: return X_resampled, y_resampled diff --git a/imblearn/under_sampling/prototype_selection/nearmiss.py b/imblearn/under_sampling/prototype_selection/nearmiss.py index 59b9ba6c3..c7622ee67 100644 --- a/imblearn/under_sampling/prototype_selection/nearmiss.py +++ b/imblearn/under_sampling/prototype_selection/nearmiss.py @@ -4,95 +4,93 @@ # Christos Aridas # License: MIT -from __future__ import division, print_function +from __future__ import division import warnings from collections import Counter import numpy as np -from ...base import BaseMulticlassSampler +from ..base import BaseUnderSampler from ...utils import check_neighbors_object +from ...utils.deprecation import deprecate_parameter -class NearMiss(BaseMulticlassSampler): +class NearMiss(BaseUnderSampler): """Class to perform under-sampling based on NearMiss methods. Parameters ---------- - ratio : str or float, optional (default='auto') - If 'auto', the ratio will be defined automatically to balance - the dataset. 
Otherwise, the ratio is defined as the number - of samples in the minority class over the the number of samples - in the majority class. + ratio : str, dict, or callable, optional (default='auto') + Ratio to use for resampling the data set. + + - If ``str``, has to be one of: (i) ``'minority'``: resample the + minority class; (ii) ``'majority'``: resample the majority class, + (iii) ``'not minority'``: resample all classes apart from the minority + class, (iv) ``'all'``: resample all classes, and (v) ``'auto'``: + corresponds to ``'all'`` for over-sampling methods and ``'not + minority'`` for under-sampling methods. The classes targeted will be + over-sampled or under-sampled to achieve an equal number of samples + as the majority or minority class. + - If ``dict``, the keys correspond to the targeted classes. The values + correspond to the desired number of samples. + - If callable, a function taking ``y`` and returning a ``dict``. The keys + correspond to the targeted classes. The values correspond to the + desired number of samples. return_indices : bool, optional (default=False) Whether or not to return the indices of the samples randomly selected from the majority class. random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by np.random. + If int, ``random_state`` is the seed used by the random number + generator; If ``RandomState`` instance, random_state is the random + number generator; If ``None``, the random number generator is the + ``RandomState`` instance used by ``np.random``. version : int, optional (default=1) - Version of the NearMiss to use. Possible values - are 1, 2 or 3. + Version of the NearMiss to use. Possible values are 1, 2 or 3. size_ngh : int, optional (default=None) Size of the neighbourhood to consider to compute the average distance to the minority point samples. - NOTE: size_ngh is deprecated from 0.2 and will be replaced in 0.4 - Use ``n_neighbors`` instead. + .. deprecated:: 0.2 + ``size_ngh`` is deprecated from 0.2 and will be replaced in 0.4 + Use ``n_neighbors`` instead. n_neighbors : int or object, optional (default=3) - If int, size of the neighbourhood to consider to compute the average - distance to the minority point samples. - If object, an estimator that inherits from - `sklearn.neighbors.base.KNeighborsMixin` that will be used to find - the k_neighbors. + If ``int``, size of the neighbourhood to consider to compute the + average distance to the minority point samples. If object, an + estimator that inherits from + :class:`sklearn.neighbors.base.KNeighborsMixin` that will be used to + find the k_neighbors. ver3_samp_ngh : int, optional (default=3) NearMiss-3 algorithm start by a phase of re-sampling. This parameter correspond to the number of neighbours selected create the sub_set in which the selection will be performed. - NOTE: `ver3_samp_ngh` is deprecated from 0.2 and will be replaced - in 0.4. Use ``n_neighbors_ver3`` instead. + .. deprecated:: 0.2 + ``ver3_samp_ngh`` is deprecated from 0.2 and will be replaced + in 0.4. Use ``n_neighbors_ver3`` instead. n_neighbors_ver3 : int or object, optional (default=3) - If int, NearMiss-3 algorithm start by a phase of re-sampling. This - parameter correspond to the number of neighbours selected - create the sub_set in which the selection will be performed.
- If object, an estimator that inherits from - `sklearn.neighbors.base.KNeighborsMixin` that will be used to find - the k_neighbors. + If ``int``, NearMiss-3 algorithm starts by a phase of re-sampling. This + parameter corresponds to the number of neighbours selected to create the + subset in which the selection will be performed. If object, an + estimator that inherits from + :class:`sklearn.neighbors.base.KNeighborsMixin` that will be used to + find the k_neighbors. n_jobs : int, optional (default=1) The number of threads to open if possible. - Attributes - ---------- - min_c_ : str or int - The identifier of the minority class. - - max_c_ : str or int - The identifier of the majority class. - - stats_c_ : dict of str/int : int - A dictionary in which the number of occurences of each class is - reported. - - X_shape_ : tuple of int - Shape of the data `X` during fitting. - Notes ----- The methods are based on [1]_. - The class support multi-classes. + Supports multi-class resampling. Examples -------- @@ -100,7 +98,7 @@ class NearMiss(BaseMulticlassSampler): >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> from imblearn.under_sampling import \ - NearMiss # doctest: +NORMALIZE_WHITESPACE +NearMiss # doctest: +NORMALIZE_WHITESPACE >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) @@ -183,11 +181,6 @@ def _selection_dist_based(self, # Compute the distance considering the farthest neighbour dist_avg_vec = np.sum(dist_vec[:, -self.nn_.n_neighbors:], axis=1) - self.logger.debug('The size of the distance matrix is %s', - dist_vec.shape) - self.logger.debug('The size of the samples that can be selected is %s', - X[y == key].shape) - if dist_vec.shape[0] != X[y == key].shape[0]: raise RuntimeError('The samples to be selected do not correspond' ' to the distance matrix given. Ensure that' @@ -216,53 +209,27 @@ ' will be returned.') # Select the desired number of samples - sel_idx = sorted_idx[:num_samples] - - return (X[y == key][sel_idx], y[y == key][sel_idx], - np.flatnonzero(y == key)[sel_idx]) + return sorted_idx[:num_samples] def _validate_estimator(self): """Private function to create the NN estimator""" + # FIXME: Deprecated in 0.2. To be removed in 0.4. + deprecate_parameter(self, '0.2', 'size_ngh', 'n_neighbors') if self.version == 3: - # Announce deprecation if needed - if self.ver3_samp_ngh is not None: - warnings.warn('`ver3_samp_ngh` will be replaced in version' - ' 0.4. Use `n_neighbors_ver3` instead.', - DeprecationWarning) - self.n_neighbors_ver3 = self.ver3_samp_ngh - - def fit(self, X, y): - """Find the classes statistics before to perform sampling. - - Parameters - ---------- - X : ndarray, shape (n_samples, n_features) - Matrix containing the data which have to be sampled. - - y : ndarray, shape (n_samples, ) - Corresponding label for each sample in X. - - Returns - ------- - self : object, - Return self.
+ deprecate_parameter(self, '0.2', 'ver3_samp_ngh', + 'n_neighbors_ver3') - """ - - super(NearMiss, self).fit(X, y) self.nn_ = check_neighbors_object('n_neighbors', self.n_neighbors) - # set the number of jobs self.nn_.set_params(**{'n_jobs': self.n_jobs}) - # kept for deprecation purpose it will create the n_neighbors_ver3 - self._validate_estimator() if self.version == 3: self.nn_ver3_ = check_neighbors_object('n_neighbors_ver3', self.n_neighbors_ver3) - # set the number of jobs self.nn_ver3_.set_params(**{'n_jobs': self.n_jobs}) - return self + if self.version not in (1, 2, 3): + raise ValueError('Parameter `version` must be 1, 2 or 3, got' + ' {}'.format(self.version)) def _sample(self, X, y): """Resample the dataset. @@ -288,109 +255,67 @@ def _sample(self, X, y): containing the which samples have been selected. """ + self._validate_estimator() - # Assign the parameter of the element of this class - # Check that the version asked is implemented - if self.version not in (1, 2, 3): - raise ValueError('Parameter `version` must be 1, 2 or 3, got' - ' {}'.format(self.version)) - - # Start with the minority class - X_min = X[y == self.min_c_] - y_min = y[y == self.min_c_] - - # All the minority class samples will be preserved - X_resampled = X_min.copy() - y_resampled = y_min.copy() - - # Compute the number of cluster needed - if self.ratio == 'auto': - num_samples = self.stats_c_[self.min_c_] - else: - num_samples = int(self.stats_c_[self.min_c_] / self.ratio) - - # If we need to offer support for the indices + X_resampled = np.empty((0, X.shape[1]), dtype=X.dtype) + y_resampled = np.empty((0, ), dtype=y.dtype) if self.return_indices: - idx_under = np.flatnonzero(y == self.min_c_) - - # Fit the minority class since that we want to know the distance - # to these point - self.nn_.fit(X[y == self.min_c_]) - - # Loop over the other classes under picking at random - for key in self.stats_c_.keys(): - - # If the minority class is up, skip it - if key == self.min_c_: - continue - - # Get the samples corresponding to the current class - sub_samples_x = X[y == key] - sub_samples_y = y[y == key] - - if self.version == 1: - # Find the NN - dist_vec, idx_vec = self.nn_.kneighbors( - sub_samples_x, n_neighbors=self.nn_.n_neighbors) - - # Select the right samples - sel_x, sel_y, idx_tmp = self._selection_dist_based( - X, y, dist_vec, num_samples, key, sel_strategy='nearest') - - elif self.version == 2: - # Find the NN - dist_vec, idx_vec = self.nn_.kneighbors( - sub_samples_x, n_neighbors=self.stats_c_[self.min_c_]) - - # Select the right samples - sel_x, sel_y, idx_tmp = self._selection_dist_based( - X, y, dist_vec, num_samples, key, sel_strategy='nearest') - - elif self.version == 3: - # We need a new NN object to fit the current class - self.nn_ver3_.fit(sub_samples_x) - - # Find the set of NN to the minority class - dist_vec, idx_vec = self.nn_ver3_.kneighbors(X_min) - - # Create the subset containing the samples found during the NN - # search. 
Linearize the indexes and remove the double values - idx_vec_farthest = np.unique(idx_vec.reshape(-1)) - - # Create the subset - sub_samples_x = sub_samples_x[idx_vec_farthest, :] - sub_samples_y = sub_samples_y[idx_vec_farthest] - - # Compute the NN considering the current class - dist_vec, idx_vec = self.nn_.kneighbors( - sub_samples_x, n_neighbors=self.nn_.n_neighbors) - - sel_x, sel_y, idx_tmp = self._selection_dist_based( - sub_samples_x, - sub_samples_y, - dist_vec, - num_samples, - key, - sel_strategy='farthest') - - # idx_tmp is relative to the feature selected in the - # previous step and we need to find the indirection - idx_tmp = np.flatnonzero(y == key)[idx_vec_farthest[idx_tmp]] + idx_under = np.empty((0, ), dtype=int) + + target_stats = Counter(y) + class_minority = min(target_stats, key=target_stats.get) + + self.nn_.fit(X[y == class_minority]) + + for target_class in np.unique(y): + if target_class in self.ratio_.keys(): + n_samples = self.ratio_[target_class] + X_class = X[y == target_class] + y_class = y[y == target_class] + + if self.version == 1: + dist_vec, idx_vec = self.nn_.kneighbors( + X_class, n_neighbors=self.nn_.n_neighbors) + index_target_class = self._selection_dist_based( + X, y, dist_vec, n_samples, target_class, + sel_strategy='nearest') + elif self.version == 2: + dist_vec, idx_vec = self.nn_.kneighbors( + X_class, n_neighbors=target_stats[class_minority]) + index_target_class = self._selection_dist_based( + X, y, dist_vec, n_samples, target_class, + sel_strategy='nearest') + elif self.version == 3: + self.nn_ver3_.fit(X_class) + dist_vec, idx_vec = self.nn_ver3_.kneighbors( + X[y == class_minority]) + idx_vec_farthest = np.unique(idx_vec.reshape(-1)) + X_class_selected = X_class[idx_vec_farthest, :] + y_class_selected = y_class[idx_vec_farthest] + + dist_vec, idx_vec = self.nn_.kneighbors( + X_class_selected, n_neighbors=self.nn_.n_neighbors) + index_target_class = self._selection_dist_based( + X_class_selected, y_class_selected, dist_vec, + n_samples, target_class, sel_strategy='farthest') + # index_target_class is relative to the samples selected + # in the previous step and we need to find the indirection + index_target_class = idx_vec_farthest[index_target_class] else: - raise NotImplementedError - - # If we need to offer support for the indices selected + index_target_class = slice(None) + + X_resampled = np.concatenate( + (X_resampled, X[y == target_class][index_target_class]), + axis=0) + y_resampled = np.concatenate( + (y_resampled, y[y == target_class][index_target_class]), + axis=0) if self.return_indices: - idx_under = np.concatenate((idx_under, idx_tmp), axis=0) - - X_resampled = np.concatenate((X_resampled, sel_x), axis=0) - y_resampled = np.concatenate((y_resampled, sel_y), axis=0) - - self.logger.info('Under-sampling performed: %s', Counter(y_resampled)) + idx_under = np.concatenate( + (idx_under, np.flatnonzero(y == target_class)[ + index_target_class]), axis=0) - # Check if the indices of the samples selected should be returned too if self.return_indices: - # Return the indices of interest return X_resampled, y_resampled, idx_under else: return X_resampled, y_resampled diff --git a/imblearn/under_sampling/prototype_selection/neighbourhood_cleaning_rule.py b/imblearn/under_sampling/prototype_selection/neighbourhood_cleaning_rule.py index 5e758e6fa..fa998e6cf 100644 --- a/imblearn/under_sampling/prototype_selection/neighbourhood_cleaning_rule.py +++ b/imblearn/under_sampling/prototype_selection/neighbourhood_cleaning_rule.py @@ -9,62 +9,69 @@ from
collections import Counter import numpy as np +from scipy.stats import mode -from ...base import BaseMulticlassSampler +from ..base import BaseCleaningSampler +from .edited_nearest_neighbours import EditedNearestNeighbours from ...utils import check_neighbors_object +SEL_KIND = ('all', 'mode') -class NeighbourhoodCleaningRule(BaseMulticlassSampler): + +class NeighbourhoodCleaningRule(BaseCleaningSampler): """Class performing under-sampling based on the neighbourhood cleaning rule. Parameters ---------- + ratio : str, dict, or callable, optional (default='auto') + Ratio to use for resampling the data set. + + - If ``str``, has to be one of: (i) ``'minority'``: resample the + minority class; (ii) ``'majority'``: resample the majority class, + (iii) ``'not minority'``: resample all classes apart from the minority + class, (iv) ``'all'``: resample all classes, and (v) ``'auto'``: + corresponds to ``'all'`` for over-sampling methods and ``'not + minority'`` for under-sampling methods. The classes targeted will be + over-sampled or under-sampled to achieve an equal number of samples + as the majority or minority class. + - If ``dict``, the keys correspond to the targeted classes. The values + correspond to the desired number of samples. + - If callable, a function taking ``y`` and returning a ``dict``. The keys + correspond to the targeted classes. The values correspond to the + desired number of samples. + return_indices : bool, optional (default=False) Whether or not to return the indices of the samples randomly selected from the majority class. random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by np.random. + If int, ``random_state`` is the seed used by the random number + generator; If ``RandomState`` instance, random_state is the random + number generator; If ``None``, the random number generator is the + ``RandomState`` instance used by ``np.random``. size_ngh : int, optional (default=None) Size of the neighbourhood to consider to compute the average distance to the minority point samples. - NOTE: size_ngh is deprecated from 0.2 and will be replaced in 0.4 - Use ``n_neighbors`` instead. + .. deprecated:: 0.2 + ``size_ngh`` is deprecated from 0.2 and will be replaced in 0.4 + Use ``n_neighbors`` instead. n_neighbors : int or object, optional (default=3) - If int, size of the neighbourhood to consider in order to make + If ``int``, size of the neighbourhood to consider in order to make the comparison between each samples and their NN. If object, an estimator that inherits from - `sklearn.neighbors.base.KNeighborsMixin` that will be used to find - the k_neighbors. + :class:`sklearn.neighbors.base.KNeighborsMixin` that will be used to + find the k_neighbors. n_jobs : int, optional (default=1) The number of threads to open if possible. - Attributes - ---------- - min_c_ : str or int - The identifier of the minority class. - - max_c_ : str or int - The identifier of the majority class. - - stats_c_ : dict of str/int : int - A dictionary in which the number of occurences of each class is - reported. - - X_shape_ : tuple of int - Shape of the data `X` during fitting. - Notes ----- - This class support multi-class. + Supports multi-class resampling.
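A minimal sketch of the cleaning behaviour under the new API (illustrative only; ``X`` and ``y`` as in the Examples section that follows, and the choice of ``ratio='not minority'`` is an assumption about which classes should be cleaned):

>>> from imblearn.under_sampling import NeighbourhoodCleaningRule
>>> ncr = NeighbourhoodCleaningRule(ratio='not minority', n_neighbors=3,
...                                 return_indices=True, random_state=0)
>>> X_res, y_res, idx_under = ncr.fit_sample(X, y)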
Examples -------- @@ -72,7 +79,7 @@ class NeighbourhoodCleaningRule(BaseMulticlassSampler): >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> from imblearn.under_sampling import \ - NeighbourhoodCleaningRule # doctest: +NORMALIZE_WHITESPACE +NeighbourhoodCleaningRule # doctest: +NORMALIZE_WHITESPACE >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) @@ -81,7 +88,7 @@ class NeighbourhoodCleaningRule(BaseMulticlassSampler): >>> ncr = NeighbourhoodCleaningRule(random_state=42) >>> X_res, y_res = ncr.fit_sample(X, y) >>> print('Resampled dataset shape {}'.format(Counter(y_res))) - Resampled dataset shape Counter({1: 889, 0: 100}) + Resampled dataset shape Counter({1: 877, 0: 100}) References ---------- @@ -91,42 +98,37 @@ class NeighbourhoodCleaningRule(BaseMulticlassSampler): """ def __init__(self, + ratio='auto', return_indices=False, random_state=None, size_ngh=None, n_neighbors=3, + kind_sel='all', + threshold_cleaning=0.5, n_jobs=1): super(NeighbourhoodCleaningRule, self).__init__( - random_state=random_state) + ratio=ratio, random_state=random_state) self.return_indices = return_indices self.size_ngh = size_ngh self.n_neighbors = n_neighbors + self.kind_sel = kind_sel + self.threshold_cleaning = threshold_cleaning self.n_jobs = n_jobs - def fit(self, X, y): - """Find the classes statistics before to perform sampling. - - Parameters - ---------- - X : ndarray, shape (n_samples, n_features) - Matrix containing the data which have to be sampled. - - y : ndarray, shape (n_samples, ) - Corresponding label for each sample in X. - - Returns - ------- - self : object, - Return self. - - """ - - super(NeighbourhoodCleaningRule, self).fit(X, y) - self.nn_ = check_neighbors_object('n_neighbors', self.n_neighbors) - # set the number of jobs + def _validate_estimator(self): + """Create the objects required by NCR.""" + # FIXME: Deprecated from 0.2. To be removed in 0.4. + self.nn_ = check_neighbors_object('n_neighbors', self.n_neighbors, + additional_neighbor=1) self.nn_.set_params(**{'n_jobs': self.n_jobs}) - return self + if self.kind_sel not in SEL_KIND: + raise NotImplementedError + + if self.threshold_cleaning > 1 or self.threshold_cleaning < 0: + raise ValueError("'threshold_cleaning' is a value between 0 and 1." + " Got {} instead.".format( + self.threshold_cleaning)) def _sample(self, X, y): """Resample the dataset. @@ -152,79 +154,51 @@ def _sample(self, X, y): containing the which samples have been selected. 
""" - - # Start with the minority class - X_min = X[y == self.min_c_] - y_min = y[y == self.min_c_] - - # All the minority class samples will be preserved - X_resampled = X_min.copy() - y_resampled = y_min.copy() - - # If we need to offer support for the indices - if self.return_indices: - idx_under = np.flatnonzero(y == self.min_c_) - - # Fit the whole dataset + self._validate_estimator() + + enn = EditedNearestNeighbours(ratio=self.ratio, return_indices=True, + random_state=self.random_state, + size_ngh=self.size_ngh, + n_neighbors=self.n_neighbors, + kind_sel='mode', + n_jobs=self.n_jobs) + _, _, index_not_a1 = enn.fit_sample(X, y) + index_a1 = np.ones(y.shape, dtype=bool) + index_a1[index_not_a1] = False + index_a1 = np.flatnonzero(index_a1) + + # clean the neighborhood + target_stats = Counter(y) + class_minority = min(target_stats, key=target_stats.get) + # compute which classes to consider for cleaning for the A2 group + classes_under_sample = [c for c, n_samples in target_stats.items() + if (c in self.ratio_.keys() and + (n_samples > X.shape[0] * + self.threshold_cleaning))] self.nn_.fit(X) + X_class = X[y == class_minority] + y_class = y[y == class_minority] + nnhood_idx = self.nn_.kneighbors( + X_class, return_distance=False)[:, 1:] + nnhood_label = y[nnhood_idx] + if self.kind_sel == 'mode': + nnhood_label_majority, _ = mode(nnhood_label, axis=1) + nnhood_bool = np.ravel(nnhood_label_majority) == y_class + elif self.kind_sel == 'all': + nnhood_label_majority = nnhood_label == class_minority + nnhood_bool = np.all(nnhood_label, axis=1) + # compute a2 group + index_a2 = np.ravel(nnhood_idx[~nnhood_bool]) + index_a2 = np.unique([index for index in index_a2 + if y[index] in classes_under_sample]) + + union_a1_a2 = np.union1d(index_a1, index_a2).astype(int) + selected_samples = np.ones(y.shape, dtype=bool) + selected_samples[union_a1_a2] = False + index_target_class = np.flatnonzero(selected_samples) - idx_to_exclude = [] - # Loop over the other classes under picking at random - for key in self.stats_c_.keys(): - - # Get the sample of the current class - sub_samples_x = X[y == key] - - # Get the samples associated - idx_sub_sample = np.flatnonzero(y == key) - - # Find the NN for the current class - nnhood_idx = self.nn_.kneighbors( - sub_samples_x, return_distance=False) - - # Get the label of the corresponding to the index - nnhood_label = (y[nnhood_idx] == key) - - # Check which one are the same label than the current class - # Make an AND operation through the three neighbours - nnhood_bool = np.logical_not(np.all(nnhood_label, axis=1)) - - # If the minority class remove the majority samples - if key == self.min_c_: - # Get the index to exclude - idx_to_exclude += nnhood_idx[np.nonzero(np.logical_not( - nnhood_label[np.flatnonzero(nnhood_bool)]))].tolist() - else: - # Get the index to exclude - idx_to_exclude += idx_sub_sample[np.nonzero( - nnhood_bool)].tolist() - - idx_to_exclude = np.unique(idx_to_exclude) - - # Create a vector with the sample to select - sel_idx = np.ones(y.shape) - sel_idx[idx_to_exclude] = 0 - # Exclude as well the minority sample since that they will be - # concatenated later - sel_idx[y == self.min_c_] = 0 - - # Get the samples from the majority classes - sel_x = X[np.flatnonzero(sel_idx), :] - sel_y = y[np.flatnonzero(sel_idx)] - - # If we need to offer support for the indices selected - if self.return_indices: - idx_tmp = np.flatnonzero(sel_idx) - idx_under = np.concatenate((idx_under, idx_tmp), axis=0) - - X_resampled = np.concatenate((X_resampled, 
sel_x), axis=0) - y_resampled = np.concatenate((y_resampled, sel_y), axis=0) - - self.logger.info('Under-sampling performed: %s', Counter(y_resampled)) - - # Check if the indices of the samples selected should be returned too if self.return_indices: - # Return the indices of interest - return X_resampled, y_resampled, idx_under + return (X[index_target_class], y[index_target_class], + index_target_class) else: - return X_resampled, y_resampled + return X[index_target_class], y[index_target_class] diff --git a/imblearn/under_sampling/prototype_selection/one_sided_selection.py b/imblearn/under_sampling/prototype_selection/one_sided_selection.py index e62c8851d..ae9867c3d 100644 --- a/imblearn/under_sampling/prototype_selection/one_sided_selection.py +++ b/imblearn/under_sampling/prototype_selection/one_sided_selection.py @@ -4,7 +4,7 @@ # Christos Aridas # License: MIT -from __future__ import division, print_function +from __future__ import division from collections import Counter @@ -12,38 +12,57 @@ from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors from sklearn.utils import check_random_state -from ...base import BaseBinarySampler +from ..base import BaseCleaningSampler from .tomek_links import TomekLinks +from ...utils.deprecation import deprecate_parameter -class OneSidedSelection(BaseBinarySampler): +class OneSidedSelection(BaseCleaningSampler): """Class to perform under-sampling based on one-sided selection method. Parameters ---------- + ratio : str, dict, or callable, optional (default='auto') + Ratio to use for resampling the data set. + + - If ``str``, has to be one of: (i) ``'minority'``: resample the + minority class; (ii) ``'majority'``: resample the majority class, + (iii) ``'not minority'``: resample all classes apart from the minority + class, (iv) ``'all'``: resample all classes, and (v) ``'auto'``: + corresponds to ``'all'`` for over-sampling methods and ``'not + minority'`` for under-sampling methods. The classes targeted will be + over-sampled or under-sampled to achieve an equal number of samples + as the majority or minority class. + - If ``dict``, the keys correspond to the targeted classes. The values + correspond to the desired number of samples. + - If callable, a function taking ``y`` and returning a ``dict``. The keys + correspond to the targeted classes. The values correspond to the + desired number of samples. + return_indices : bool, optional (default=False) Whether or not to return the indices of the samples randomly selected from the majority class. random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by np.random. + If int, ``random_state`` is the seed used by the random number + generator; If ``RandomState`` instance, random_state is the random + number generator; If ``None``, the random number generator is the + ``RandomState`` instance used by ``np.random``. size_ngh : int, optional (default=None) Size of the neighbourhood to consider to compute the average distance to the minority point samples. - NOTE: size_ngh is deprecated from 0.2 and will be replaced in 0.4 - Use ``n_neighbors`` instead. + .. deprecated:: 0.2 + ``size_ngh`` is deprecated from 0.2 and will be replaced in 0.4 + Use ``n_neighbors`` instead.
- n_neighbors : int or object, optional (default= - KNeighborsClassifier(n_neighbors=1)) - If int, size of the neighbourhood to consider to compute the average - distance to the minority point samples. - If object, an object inherited from - `sklearn.neigbors.KNeighborsClassifier` should be passed. + n_neighbors : int or object, optional (default=\ +KNeighborsClassifier(n_neighbors=1)) + If ``int``, size of the neighbourhood to consider to compute the + average distance to the minority point samples. If object, an object + inherited from :class:`sklearn.neighbors.KNeighborsClassifier` should + be passed. n_seeds_S : int, optional (default=1) Number of samples to extract in order to build the set S. @@ -51,26 +70,11 @@ class OneSidedSelection(BaseBinarySampler): n_jobs : int, optional (default=1) The number of threads to open if possible. - Attributes - ---------- - min_c_ : str or int - The identifier of the minority class. - - max_c_ : str or int - The identifier of the majority class. - - stats_c_ : dict of str/int : int - A dictionary in which the number of occurences of each class is - reported. - - X_shape_ : tuple of int - Shape of the data `X` during fitting. - Notes ----- The method is based on [1]_. - This method support multiclass. + Supports multi-class resampling. Examples -------- @@ -97,13 +101,15 @@ class OneSidedSelection(BaseBinarySampler): """ def __init__(self, + ratio='auto', return_indices=False, random_state=None, size_ngh=None, n_neighbors=None, n_seeds_S=1, n_jobs=1): - super(OneSidedSelection, self).__init__(random_state=random_state) + super(OneSidedSelection, self).__init__(ratio=ratio, + random_state=random_state) self.return_indices = return_indices self.size_ngh = size_ngh self.n_neighbors = n_neighbors @@ -112,7 +118,8 @@ def __init__(self, def _validate_estimator(self): """Private function to create the NN estimator""" - + # FIXME: Deprecated in 0.2. To be removed in 0.4. + deprecate_parameter(self, '0.2', 'size_ngh', 'n_neighbors') if self.n_neighbors is None: self.estimator_ = KNeighborsClassifier( n_neighbors=1, n_jobs=self.n_jobs) @@ -126,30 +133,6 @@ def _validate_estimator(self): ' inhereited from KNeighborsClassifier.' ' Got {} instead.'.format(type(self.n_neighbors))) - def fit(self, X, y): - """Find the classes statistics before to perform sampling. - - Parameters - ---------- - X : ndarray, shape (n_samples, n_features) - Matrix containing the data which have to be sampled. - - y : ndarray, shape (n_samples, ) - Corresponding label for each sample in X. - - Returns - ------- - self : object, - Return self. - - """ - - super(OneSidedSelection, self).fit(X, y) - - self._validate_estimator() - - return self - def _sample(self, X, y): """Resample the dataset. @@ -174,90 +157,74 @@ def _sample(self, X, y): containing the which samples have been selected.
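A small usage sketch consistent with the parameters above (illustrative; ``X`` and ``y`` as in the class docstring example, and the same pattern is exercised by the tests further down in this patch):

>>> from sklearn.neighbors import KNeighborsClassifier
>>> from imblearn.under_sampling import OneSidedSelection
>>> oss = OneSidedSelection(n_neighbors=KNeighborsClassifier(n_neighbors=1),
...                         n_seeds_S=1, random_state=0)
>>> X_res, y_res = oss.fit_sample(X, y)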
""" + self._validate_estimator() random_state = check_random_state(self.random_state) + target_stats = Counter(y) + class_minority = min(target_stats, key=target_stats.get) - # Start with the minority class - X_min = X[y == self.min_c_] - y_min = y[y == self.min_c_] - - # All the minority class samples will be preserved - X_resampled = X_min.copy() - y_resampled = y_min.copy() - - # If we need to offer support for the indices + X_resampled = np.empty((0, X.shape[1]), dtype=X.dtype) + y_resampled = np.empty((0, ), dtype=y.dtype) if self.return_indices: - idx_under = np.flatnonzero(y == self.min_c_) - - # Loop over the other classes under picking at random - for key in self.stats_c_.keys(): - - # If the minority class is up, skip it - if key == self.min_c_: - continue - - # Randomly get one sample from the majority class - # Generate the index to select - idx_maj = np.flatnonzero(y == key) - idx_maj_sample = idx_maj[ - random_state.randint( - low=0, - high=self.stats_c_[key], - size=self.n_seeds_S)] - maj_sample = X[idx_maj_sample] - - # Create the set C - C_x = np.append(X_min, maj_sample, axis=0) - C_y = np.append(y_min, [key] * self.n_seeds_S) - - # Create the set S with removing the seed from S - # since that it will be added anyway - idx_maj_extracted = np.delete(idx_maj, idx_maj_sample, axis=0) - S_x = X[idx_maj_extracted] - S_y = y[idx_maj_extracted] - - # Fit C into the knn - self.estimator_.fit(C_x, C_y) - - # Classify on S - pred_S_y = self.estimator_.predict(S_x) - - # Find the misclassified S_y - sel_x = S_x[np.flatnonzero(pred_S_y != S_y), :] - sel_y = S_y[np.flatnonzero(pred_S_y != S_y)] - - # If we need to offer support for the indices selected - # We concatenate the misclassified samples with the seed and the - # minority samples - if self.return_indices: - idx_tmp = idx_maj_extracted[np.flatnonzero(pred_S_y != S_y)] - idx_under = np.concatenate( - (idx_under, idx_maj_sample, idx_tmp), axis=0) - - X_resampled = np.concatenate( - (X_resampled, maj_sample, sel_x), axis=0) - y_resampled = np.concatenate( - (y_resampled, [key] * self.n_seeds_S, sel_y), axis=0) - - # Find the nearest neighbour of every point + idx_under = np.empty((0, ), dtype=int) + + for target_class in np.unique(y): + if target_class in self.ratio_.keys(): + # select a sample from the current class + idx_maj = np.flatnonzero(y == target_class) + idx_maj_sample = idx_maj[random_state.randint( + low=0, high=target_stats[target_class], + size=self.n_seeds_S)] + maj_sample = X[idx_maj_sample] + + # create the set composed of all minority samples and one + # sample from the current class. 
+ C_x = np.append(X[y == class_minority], maj_sample, axis=0) + C_y = np.append(y[y == class_minority], [target_class] * + self.n_seeds_S) + + # create the set S by removing the seed from S + # since it will be added anyway + idx_maj_extracted = np.delete(idx_maj, idx_maj_sample, axis=0) + S_x = X[idx_maj_extracted] + S_y = y[idx_maj_extracted] + self.estimator_.fit(C_x, C_y) + pred_S_y = self.estimator_.predict(S_x) + + sel_x = S_x[np.flatnonzero(pred_S_y != S_y), :] + sel_y = S_y[np.flatnonzero(pred_S_y != S_y)] + if self.return_indices: + idx_tmp = idx_maj_extracted[ + np.flatnonzero(pred_S_y != S_y)] + idx_under = np.concatenate( + (idx_under, idx_maj_sample, idx_tmp), axis=0) + X_resampled = np.concatenate( + (X_resampled, maj_sample, sel_x), axis=0) + y_resampled = np.concatenate( + (y_resampled, [target_class] * self.n_seeds_S, sel_y), + axis=0) + else: + X_resampled = np.concatenate( + (X_resampled, X[y == target_class]), axis=0) + y_resampled = np.concatenate( + (y_resampled, y[y == target_class]), axis=0) + if self.return_indices: + idx_under = np.concatenate( + (idx_under, np.flatnonzero(y == target_class)), axis=0) + + # find the nearest neighbour of every point nn = NearestNeighbors(n_neighbors=2, n_jobs=self.n_jobs) nn.fit(X_resampled) nns = nn.kneighbors(X_resampled, return_distance=False)[:, 1] - # Send the information to is_tomek function to get boolean vector back - self.logger.debug('Looking for majority Tomek links ...') - links = TomekLinks.is_tomek(y_resampled, nns, self.min_c_) - - self.logger.info('Under-sampling performed: %s', - Counter(y_resampled[np.logical_not(links)])) - - # Check if the indices of the samples selected should be returned too + links = TomekLinks.is_tomek(y_resampled, nns, + [c for c in np.unique(y) + if (c != class_minority and + c in self.ratio_.keys())]) if self.return_indices: - # Return the indices of interest return (X_resampled[np.logical_not(links)], y_resampled[np.logical_not(links)], idx_under[np.logical_not(links)]) else: - # Return data set without majority Tomek links. return (X_resampled[np.logical_not(links)], y_resampled[np.logical_not(links)]) diff --git a/imblearn/under_sampling/prototype_selection/random_under_sampler.py b/imblearn/under_sampling/prototype_selection/random_under_sampler.py index e634c2064..9817d4db0 100644 --- a/imblearn/under_sampling/prototype_selection/random_under_sampler.py +++ b/imblearn/under_sampling/prototype_selection/random_under_sampler.py @@ -4,17 +4,15 @@ # Christos Aridas # License: MIT -from __future__ import division, print_function - -from collections import Counter +from __future__ import division import numpy as np from sklearn.utils import check_random_state -from ...base import BaseMulticlassSampler +from ..base import BaseUnderSampler -class RandomUnderSampler(BaseMulticlassSampler): +class RandomUnderSampler(BaseUnderSampler): """Class to perform random under-sampling. Under-sample the majority class(es) by randomly picking samples @@ -22,43 +20,39 @@ class RandomUnderSampler(BaseMulticlassSampler): Parameters ---------- - ratio : str or float, optional (default='auto') - If 'auto', the ratio will be defined automatically to balance - the dataset. Otherwise, the ratio is defined as the number - of samples in the minority class over the the number of samples - in the majority class. + ratio : str, dict, or callable, optional (default='auto') + Ratio to use for resampling the data set.
+ + - If ``str``, has to be one of: (i) ``'minority'``: resample the + minority class; (ii) ``'majority'``: resample the majority class, + (iii) ``'not minority'``: resample all classes apart from the minority + class, (iv) ``'all'``: resample all classes, and (v) ``'auto'``: + corresponds to ``'all'`` for over-sampling methods and ``'not + minority'`` for under-sampling methods. The classes targeted will be + over-sampled or under-sampled to achieve an equal number of samples + as the majority or minority class. + - If ``dict``, the keys correspond to the targeted classes. The values + correspond to the desired number of samples. + - If callable, a function taking ``y`` and returning a ``dict``. The keys + correspond to the targeted classes. The values correspond to the + desired number of samples. return_indices : bool, optional (default=False) Whether or not to return the indices of the samples randomly selected from the majority class. random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by np.random. + If int, ``random_state`` is the seed used by the random number + generator; If ``RandomState`` instance, random_state is the random + number generator; If ``None``, the random number generator is the + ``RandomState`` instance used by ``np.random``. - replacement : boolean, optional (default=True) - Whether the sample is with (default) or without replacement. + replacement : boolean, optional (default=False) + Whether the sample is with or without replacement. - - Attributes - ---------- - min_c_ : str or int - The identifier of the minority class. - - max_c_ : str or int - The identifier of the majority class. - - stats_c_ : dict of str/int : int - A dictionary containing the number of occurences of each class. - - X_shape_ : tuple of int - Shape of the data `X` during fitting. - Notes ----- - This class supports multi-class. + Supports multi-class resampling. Examples -------- @@ -66,7 +60,7 @@ class RandomUnderSampler(BaseMulticlassSampler): >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> from imblearn.under_sampling import \ - RandomUnderSampler # doctest: +NORMALIZE_WHITESPACE +RandomUnderSampler # doctest: +NORMALIZE_WHITESPACE >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) @@ -83,7 +77,7 @@ def __init__(self, ratio='auto', return_indices=False, random_state=None, - replacement=True): + replacement=False): super(RandomUnderSampler, self).__init__( ratio=ratio, random_state=random_state) self.return_indices = return_indices @@ -114,52 +108,35 @@ def _sample(self, X, y): that sample was selected or not.
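To make the ``dict`` form of ``ratio`` concrete, a minimal sketch (the class labels and target counts here are hypothetical, and the requested counts must not exceed the original per-class counts since ``replacement`` now defaults to ``False``):

>>> from collections import Counter
>>> from imblearn.under_sampling import RandomUnderSampler
>>> # keep 50 samples of class 0 and 100 of class 1 (hypothetical counts)
>>> rus = RandomUnderSampler(ratio={0: 50, 1: 100}, random_state=0)
>>> X_res, y_res = rus.fit_sample(X, y)
>>> sorted(Counter(y_res).items())  # doctest: +SKIP
[(0, 50), (1, 100)]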
""" - random_state = check_random_state(self.random_state) - # Compute the number of clusters needed - if self.ratio == 'auto': - num_samples = self.stats_c_[self.min_c_] - else: - num_samples = int(self.stats_c_[self.min_c_] / self.ratio) - - # All the minority class samples will be preserved - X_resampled = X[y == self.min_c_] - y_resampled = y[y == self.min_c_] - - # If we need to offer support for the indices + X_resampled = np.empty((0, X.shape[1]), dtype=X.dtype) + y_resampled = np.empty((0, ), dtype=y.dtype) if self.return_indices: - idx_under = np.nonzero(y == self.min_c_)[0] - - # Loop over the other classes under-picking at random - for key in self.stats_c_.keys(): - - # If the minority class is up, skip it - if key == self.min_c_: - continue - - # Pick some elements at random - indx = range(np.count_nonzero(y == key)) - indx = random_state.choice( - indx, size=num_samples, replace=self.replacement) + idx_under = np.empty((0, ), dtype=int) + + for target_class in np.unique(y): + if target_class in self.ratio_.keys(): + n_samples = self.ratio_[target_class] + index_target_class = random_state.choice( + range(np.count_nonzero(y == target_class)), + size=n_samples, + replace=self.replacement) + else: + index_target_class = slice(None) - # If we need to offer support for the indices selected - if self.return_indices: - idx_tmp = np.nonzero(y == key)[0][indx] - idx_under = np.concatenate((idx_under, idx_tmp), axis=0) - - # Concatenate to the minority class X_resampled = np.concatenate( - (X_resampled, X[y == key][indx]), axis=0) + (X_resampled, X[y == target_class][index_target_class]), + axis=0) y_resampled = np.concatenate( - (y_resampled, y[y == key][indx]), axis=0) - - self.logger.info('Under-sampling performed: %s', Counter(y_resampled)) + (y_resampled, y[y == target_class][index_target_class]), + axis=0) + if self.return_indices: + idx_under = np.concatenate( + (idx_under, np.flatnonzero(y == target_class)[ + index_target_class]), axis=0) - # Check if the indices of the samples selected should be returned as - # well if self.return_indices: - # Return the indices of interest return X_resampled, y_resampled, idx_under else: return X_resampled, y_resampled diff --git a/imblearn/under_sampling/prototype_selection/tests/test_allknn.py b/imblearn/under_sampling/prototype_selection/tests/test_allknn.py index 8584bae64..2b15c6bbf 100644 --- a/imblearn/under_sampling/prototype_selection/tests/test_allknn.py +++ b/imblearn/under_sampling/prototype_selection/tests/test_allknn.py @@ -1,14 +1,17 @@ """Test the module repeated edited nearest neighbour.""" +# Authors: Guillaume Lemaitre +# Christos Aridas +# License: MIT + from __future__ import print_function import numpy as np from numpy.testing import (assert_allclose, assert_array_equal, - assert_equal, assert_raises) + assert_raises) from sklearn.neighbors import NearestNeighbors from imblearn.under_sampling import AllKNN -# Generate a global dataset to use RND_SEED = 0 X = np.array([[-0.12840393, 0.66446571], [1.32319756, -0.13181616], [0.04296502, -0.37981873], [0.83631853, 0.18569783], @@ -37,18 +40,7 @@ R_TOL = 1e-4 -def test_allknn_init(): - # Define a ratio - allknn = AllKNN(random_state=RND_SEED) - - assert_equal(allknn.n_neighbors, 3) - assert_equal(allknn.kind_sel, 'all') - assert_equal(allknn.n_jobs, -1) - assert_equal(allknn.random_state, RND_SEED) - - def test_allknn_fit_sample(): - # Resample the data allknn = AllKNN(random_state=RND_SEED) X_resampled, y_resampled = allknn.fit_sample(X, Y) @@ -75,7 +67,6 @@ def 
test_allknn_fit_sample(): def test_allknn_fit_sample_with_indices(): - # Resample the data allknn = AllKNN(return_indices=True, random_state=RND_SEED) X_resampled, y_resampled, idx_under = allknn.fit_sample(X, Y) @@ -107,7 +98,6 @@ def test_allknn_fit_sample_with_indices(): def test_allknn_fit_sample_mode(): - # Resample the data allknn = AllKNN(random_state=RND_SEED, kind_sel='mode') X_resampled, y_resampled = allknn.fit_sample(X, Y) @@ -136,7 +126,6 @@ def test_allknn_fit_sample_mode(): def test_allknn_fit_sample_with_nn_object(): - # Resample the data nn = NearestNeighbors(n_neighbors=4) allknn = AllKNN(n_neighbors=nn, random_state=RND_SEED, kind_sel='mode') X_resampled, y_resampled = allknn.fit_sample(X, Y) diff --git a/imblearn/under_sampling/prototype_selection/tests/test_condensed_nearest_neighbour.py b/imblearn/under_sampling/prototype_selection/tests/test_condensed_nearest_neighbour.py index aa97d4e05..9f274b808 100644 --- a/imblearn/under_sampling/prototype_selection/tests/test_condensed_nearest_neighbour.py +++ b/imblearn/under_sampling/prototype_selection/tests/test_condensed_nearest_neighbour.py @@ -1,4 +1,8 @@ """Test the module condensed nearest neighbour.""" +# Authors: Guillaume Lemaitre +# Christos Aridas +# License: MIT + from __future__ import print_function import numpy as np @@ -8,7 +12,6 @@ from imblearn.under_sampling import CondensedNearestNeighbour -# Generate a global dataset to use RND_SEED = 0 X = np.array([[2.59928271, 0.93323465], [0.25738379, 0.95564169], [1.42772181, 0.526027], [1.92365863, 0.82718767], @@ -24,7 +27,6 @@ def test_cnn_init(): - # Define a ratio cnn = CondensedNearestNeighbour(random_state=RND_SEED) assert_equal(cnn.n_seeds_S, 1) @@ -32,7 +34,6 @@ def test_cnn_init(): def test_cnn_fit_sample(): - # Resample the data cnn = CondensedNearestNeighbour(random_state=RND_SEED) X_resampled, y_resampled = cnn.fit_sample(X, Y) @@ -47,7 +48,6 @@ def test_cnn_fit_sample(): def test_cnn_fit_sample_with_indices(): - # Resample the data cnn = CondensedNearestNeighbour(return_indices=True, random_state=RND_SEED) X_resampled, y_resampled, idx_under = cnn.fit_sample(X, Y) @@ -64,7 +64,6 @@ def test_cnn_fit_sample_with_indices(): def test_cnn_fit_sample_with_object(): - # Resample the data knn = KNeighborsClassifier(n_neighbors=1) cnn = CondensedNearestNeighbour(random_state=RND_SEED, n_neighbors=knn) X_resampled, y_resampled = cnn.fit_sample(X, Y) @@ -85,7 +84,6 @@ def test_cnn_fit_sample_with_object(): def test_cnn_fit_sample_with_wrong_object(): - # Resample the data knn = 'rnd' cnn = CondensedNearestNeighbour(random_state=RND_SEED, n_neighbors=knn) assert_raises_regex(ValueError, "has to be a int or an ", diff --git a/imblearn/under_sampling/prototype_selection/tests/test_edited_nearest_neighbours.py b/imblearn/under_sampling/prototype_selection/tests/test_edited_nearest_neighbours.py index 6556d75b9..c926473f1 100644 --- a/imblearn/under_sampling/prototype_selection/tests/test_edited_nearest_neighbours.py +++ b/imblearn/under_sampling/prototype_selection/tests/test_edited_nearest_neighbours.py @@ -1,4 +1,8 @@ """Test the module edited nearest neighbour.""" +# Authors: Guillaume Lemaitre +# Christos Aridas +# License: MIT + from __future__ import print_function import numpy as np @@ -9,7 +13,6 @@ from imblearn.under_sampling import EditedNearestNeighbours -# Generate a global dataset to use RND_SEED = 0 X = np.array([[2.59928271, 0.93323465], [0.25738379, 0.95564169], [1.42772181, 0.526027], [1.92365863, 0.82718767], @@ -25,7 +28,6 @@ def test_enn_init(): - 
# Define a ratio enn = EditedNearestNeighbours(random_state=RND_SEED) assert_equal(enn.n_neighbors, 3) @@ -35,7 +37,6 @@ def test_enn_init(): def test_enn_fit_sample(): - # Resample the data enn = EditedNearestNeighbours(random_state=RND_SEED) X_resampled, y_resampled = enn.fit_sample(X, Y) @@ -49,7 +50,6 @@ def test_enn_fit_sample(): def test_enn_fit_sample_with_indices(): - # Resample the data enn = EditedNearestNeighbours(return_indices=True, random_state=RND_SEED) X_resampled, y_resampled, idx_under = enn.fit_sample(X, Y) @@ -65,7 +65,6 @@ def test_enn_fit_sample_with_indices(): def test_enn_fit_sample_mode(): - # Resample the data enn = EditedNearestNeighbours(random_state=RND_SEED, kind_sel='mode') X_resampled, y_resampled = enn.fit_sample(X, Y) @@ -82,7 +81,6 @@ def test_enn_fit_sample_mode(): def test_enn_fit_sample_with_nn_object(): - # Resample the data nn = NearestNeighbors(n_neighbors=4) enn = EditedNearestNeighbours( n_neighbors=nn, random_state=RND_SEED, kind_sel='mode') @@ -101,7 +99,6 @@ def test_enn_fit_sample_with_nn_object(): def test_enn_not_good_object(): - # Resample the data nn = 'rnd' enn = EditedNearestNeighbours( n_neighbors=nn, random_state=RND_SEED, kind_sel='mode') diff --git a/imblearn/under_sampling/prototype_selection/tests/test_instance_hardness_threshold.py b/imblearn/under_sampling/prototype_selection/tests/test_instance_hardness_threshold.py index 728761f13..18ebeb7cb 100644 --- a/imblearn/under_sampling/prototype_selection/tests/test_instance_hardness_threshold.py +++ b/imblearn/under_sampling/prototype_selection/tests/test_instance_hardness_threshold.py @@ -1,4 +1,8 @@ """Test the module .""" +# Authors: Guillaume Lemaitre +# Christos Aridas +# License: MIT + from __future__ import print_function import numpy as np @@ -8,7 +12,6 @@ from imblearn.under_sampling import InstanceHardnessThreshold -# Generate a global dataset to use RND_SEED = 0 X = np.array([[-0.3879569, 0.6894251], [-0.09322739, 1.28177189], [-0.77740357, 0.74097941], [0.91542919, -0.65453327], @@ -23,7 +26,6 @@ def test_iht_wrong_estimator(): - # Resample the data ratio = 0.7 est = 'rnd' iht = InstanceHardnessThreshold( @@ -32,7 +34,6 @@ def test_iht_wrong_estimator(): def test_iht_init(): - # Define a ratio ratio = 'auto' iht = InstanceHardnessThreshold( ESTIMATOR, ratio=ratio, random_state=RND_SEED) @@ -42,181 +43,231 @@ def test_iht_init(): def test_iht_fit_sample(): - # Resample the data iht = InstanceHardnessThreshold(ESTIMATOR, random_state=RND_SEED) X_resampled, y_resampled = iht.fit_sample(X, Y) - X_gt = np.array([[-0.3879569, 0.6894251], [-0.09322739, 1.28177189], - [-0.77740357, 0.74097941], [0.91542919, -0.65453327], - [-0.43877303, 1.07366684], [-0.85795321, 0.82980738], - [-0.18430329, 0.52328473], [-0.65571327, 0.42412021], - [-0.28305528, 0.30284991], [1.06446472, -1.09279772], - [0.30543283, -0.02589502], [-0.00717161, 0.00318087]]) - y_gt = np.array([0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0]) + X_gt = np.array([[-0.3879569, 0.6894251], + [0.91542919, -0.65453327], + [-0.65571327, 0.42412021], + [1.06446472, -1.09279772], + [0.30543283, -0.02589502], + [-0.00717161, 0.00318087], + [-0.09322739, 1.28177189], + [-0.77740357, 0.74097941], + [-0.43877303, 1.07366684], + [-0.85795321, 0.82980738], + [-0.18430329, 0.52328473], + [-0.28305528, 0.30284991]]) + y_gt = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) def test_iht_fit_sample_with_indices(): - # Resample the data iht = InstanceHardnessThreshold( 
ESTIMATOR, return_indices=True, random_state=RND_SEED) X_resampled, y_resampled, idx_under = iht.fit_sample(X, Y) - X_gt = np.array([[-0.3879569, 0.6894251], [-0.09322739, 1.28177189], - [-0.77740357, 0.74097941], [0.91542919, -0.65453327], - [-0.43877303, 1.07366684], [-0.85795321, 0.82980738], - [-0.18430329, 0.52328473], [-0.65571327, 0.42412021], - [-0.28305528, 0.30284991], [1.06446472, -1.09279772], - [0.30543283, -0.02589502], [-0.00717161, 0.00318087]]) - y_gt = np.array([0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0]) - idx_gt = np.array([0, 1, 2, 3, 5, 6, 7, 9, 10, 12, 13, 14]) + X_gt = np.array([[-0.3879569, 0.6894251], + [0.91542919, -0.65453327], + [-0.65571327, 0.42412021], + [1.06446472, -1.09279772], + [0.30543283, -0.02589502], + [-0.00717161, 0.00318087], + [-0.09322739, 1.28177189], + [-0.77740357, 0.74097941], + [-0.43877303, 1.07366684], + [-0.85795321, 0.82980738], + [-0.18430329, 0.52328473], + [-0.28305528, 0.30284991]]) + y_gt = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]) + idx_gt = np.array([0, 3, 9, 12, 13, 14, 1, 2, 5, 6, 7, 10]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) assert_array_equal(idx_under, idx_gt) def test_iht_fit_sample_half(): - # Resample the data ratio = 0.7 iht = InstanceHardnessThreshold( ESTIMATOR, ratio=ratio, random_state=RND_SEED) X_resampled, y_resampled = iht.fit_sample(X, Y) - X_gt = np.array([[-0.3879569, 0.6894251], [-0.09322739, 1.28177189], - [-0.77740357, 0.74097941], [0.91542919, -0.65453327], - [-0.03852113, 0.40910479], [-0.43877303, 1.07366684], - [-0.85795321, 0.82980738], [-0.18430329, 0.52328473], - [-0.30126957, -0.66268378], [-0.65571327, 0.42412021], - [-0.28305528, 0.30284991], [1.06446472, -1.09279772], - [0.30543283, -0.02589502], [-0.00717161, 0.00318087]]) - y_gt = np.array([0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0]) + X_gt = np.array([[-0.3879569, 0.6894251], + [0.91542919, -0.65453327], + [-0.65571327, 0.42412021], + [1.06446472, -1.09279772], + [0.30543283, -0.02589502], + [-0.00717161, 0.00318087], + [-0.09322739, 1.28177189], + [-0.77740357, 0.74097941], + [-0.03852113, 0.40910479], + [-0.43877303, 1.07366684], + [-0.85795321, 0.82980738], + [-0.18430329, 0.52328473], + [-0.30126957, -0.66268378], + [-0.28305528, 0.30284991]]) + y_gt = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) def test_iht_fit_sample_knn(): - # Resample the data est = 'knn' iht = InstanceHardnessThreshold(est, random_state=RND_SEED) X_resampled, y_resampled = iht.fit_sample(X, Y) - X_gt = np.array([[-0.3879569, 0.6894251], [-0.09322739, 1.28177189], - [-0.77740357, 0.74097941], [0.91542919, -0.65453327], - [-0.43877303, 1.07366684], [-0.85795321, 0.82980738], - [-0.30126957, -0.66268378], [-0.65571327, 0.42412021], - [0.20246714, -0.34727125], [1.06446472, -1.09279772], - [0.30543283, -0.02589502], [-0.00717161, 0.00318087]]) - y_gt = np.array([0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0]) + X_gt = np.array([[-0.3879569, 0.6894251], + [0.91542919, -0.65453327], + [-0.65571327, 0.42412021], + [1.06446472, -1.09279772], + [0.30543283, -0.02589502], + [-0.00717161, 0.00318087], + [-0.09322739, 1.28177189], + [-0.77740357, 0.74097941], + [-0.43877303, 1.07366684], + [-0.85795321, 0.82980738], + [-0.30126957, -0.66268378], + [0.20246714, -0.34727125]]) + y_gt = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) def test_iht_fit_sample_decision_tree(): - # Resample the data est = 
'decision-tree' iht = InstanceHardnessThreshold(est, random_state=RND_SEED) X_resampled, y_resampled = iht.fit_sample(X, Y) - X_gt = np.array([[-0.3879569, 0.6894251], [-0.09322739, 1.28177189], - [-0.77740357, 0.74097941], [0.91542919, -0.65453327], - [-0.43877303, 1.07366684], [-0.85795321, 0.82980738], - [-0.18430329, 0.52328473], [-0.65571327, 0.42412021], - [-0.28305528, 0.30284991], [1.06446472, -1.09279772], - [0.30543283, -0.02589502], [-0.00717161, 0.00318087]]) - y_gt = np.array([0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0]) + X_gt = np.array([[-0.3879569, 0.6894251], + [0.91542919, -0.65453327], + [-0.65571327, 0.42412021], + [1.06446472, -1.09279772], + [0.30543283, -0.02589502], + [-0.00717161, 0.00318087], + [-0.09322739, 1.28177189], + [-0.77740357, 0.74097941], + [-0.43877303, 1.07366684], + [-0.85795321, 0.82980738], + [-0.18430329, 0.52328473], + [-0.28305528, 0.30284991]]) + y_gt = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) def test_iht_fit_sample_random_forest(): - # Resample the data est = 'random-forest' iht = InstanceHardnessThreshold(est, random_state=RND_SEED) X_resampled, y_resampled = iht.fit_sample(X, Y) - X_gt = np.array([[-0.3879569, 0.6894251], [-0.09322739, 1.28177189], - [-0.77740357, 0.74097941], [0.91542919, -0.65453327], - [-0.03852113, 0.40910479], [-0.43877303, 1.07366684], - [-0.85795321, 0.82980738], [-0.18430329, 0.52328473], - [-0.65571327, 0.42412021], [-0.28305528, 0.30284991], - [1.06446472, -1.09279772], [0.30543283, -0.02589502], - [-0.00717161, 0.00318087]]) - y_gt = np.array([0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0]) + X_gt = np.array([[-0.3879569, 0.6894251], + [0.91542919, -0.65453327], + [-0.65571327, 0.42412021], + [1.06446472, -1.09279772], + [0.30543283, -0.02589502], + [-0.00717161, 0.00318087], + [-0.09322739, 1.28177189], + [-0.77740357, 0.74097941], + [-0.03852113, 0.40910479], + [-0.43877303, 1.07366684], + [-0.85795321, 0.82980738], + [-0.18430329, 0.52328473], + [-0.28305528, 0.30284991]]) + y_gt = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) def test_iht_fit_sample_adaboost(): - # Resample the data est = 'adaboost' iht = InstanceHardnessThreshold(est, random_state=RND_SEED) X_resampled, y_resampled = iht.fit_sample(X, Y) - X_gt = np.array([[-0.3879569, 0.6894251], [-0.09322739, 1.28177189], - [-0.77740357, 0.74097941], [0.91542919, -0.65453327], - [-0.43877303, 1.07366684], [-0.85795321, 0.82980738], - [-0.18430329, 0.52328473], [-0.65571327, 0.42412021], - [-0.28305528, 0.30284991], [1.06446472, -1.09279772], - [0.30543283, -0.02589502], [-0.00717161, 0.00318087]]) - y_gt = np.array([0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0]) + X_gt = np.array([[-0.3879569, 0.6894251], + [0.91542919, -0.65453327], + [-0.65571327, 0.42412021], + [1.06446472, -1.09279772], + [0.30543283, -0.02589502], + [-0.00717161, 0.00318087], + [-0.09322739, 1.28177189], + [-0.77740357, 0.74097941], + [-0.43877303, 1.07366684], + [-0.85795321, 0.82980738], + [-0.18430329, 0.52328473], + [-0.28305528, 0.30284991]]) + y_gt = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) def test_iht_fit_sample_gradient_boosting(): - # Resample the data est = 'gradient-boosting' iht = InstanceHardnessThreshold(est, random_state=RND_SEED) X_resampled, y_resampled = iht.fit_sample(X, Y) - X_gt = np.array([[-0.3879569, 0.6894251], [-0.09322739, 1.28177189], - 
[-0.77740357, 0.74097941], [0.91542919, -0.65453327], - [-0.43877303, 1.07366684], [-0.85795321, 0.82980738], - [-0.18430329, 0.52328473], [-0.65571327, 0.42412021], - [-0.28305528, 0.30284991], [1.06446472, -1.09279772], - [0.30543283, -0.02589502], [-0.00717161, 0.00318087]]) - y_gt = np.array([0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0]) + X_gt = np.array([[-0.3879569, 0.6894251], + [0.91542919, -0.65453327], + [-0.65571327, 0.42412021], + [1.06446472, -1.09279772], + [0.30543283, -0.02589502], + [-0.00717161, 0.00318087], + [-0.09322739, 1.28177189], + [-0.77740357, 0.74097941], + [-0.43877303, 1.07366684], + [-0.85795321, 0.82980738], + [-0.18430329, 0.52328473], + [-0.28305528, 0.30284991]]) + y_gt = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) def test_iht_fit_sample_linear_svm(): - # Resample the data est = 'linear-svm' iht = InstanceHardnessThreshold(est, random_state=RND_SEED) X_resampled, y_resampled = iht.fit_sample(X, Y) - X_gt = np.array([[-0.3879569, 0.6894251], [-0.09322739, 1.28177189], - [-0.77740357, 0.74097941], [0.91542919, -0.65453327], - [-0.03852113, 0.40910479], [-0.43877303, 1.07366684], - [-0.18430329, 0.52328473], [-0.65571327, 0.42412021], - [-0.28305528, 0.30284991], [1.06446472, -1.09279772], - [0.30543283, -0.02589502], [-0.00717161, 0.00318087]]) - y_gt = np.array([0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0]) + X_gt = np.array([[-0.3879569, 0.6894251], + [0.91542919, -0.65453327], + [-0.65571327, 0.42412021], + [1.06446472, -1.09279772], + [0.30543283, -0.02589502], + [-0.00717161, 0.00318087], + [-0.09322739, 1.28177189], + [-0.77740357, 0.74097941], + [-0.03852113, 0.40910479], + [-0.43877303, 1.07366684], + [-0.18430329, 0.52328473], + [-0.28305528, 0.30284991]]) + y_gt = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) def test_iht_fit_sample_class_obj(): - # Resample the data est = GradientBoostingClassifier(random_state=RND_SEED) iht = InstanceHardnessThreshold(estimator=est, random_state=RND_SEED) X_resampled, y_resampled = iht.fit_sample(X, Y) - X_gt = np.array([[-0.3879569, 0.6894251], [-0.09322739, 1.28177189], - [-0.77740357, 0.74097941], [0.91542919, -0.65453327], - [-0.43877303, 1.07366684], [-0.85795321, 0.82980738], - [-0.18430329, 0.52328473], [-0.65571327, 0.42412021], - [-0.28305528, 0.30284991], [1.06446472, -1.09279772], - [0.30543283, -0.02589502], [-0.00717161, 0.00318087]]) - y_gt = np.array([0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0]) + X_gt = np.array([[-0.3879569, 0.6894251], + [0.91542919, -0.65453327], + [-0.65571327, 0.42412021], + [1.06446472, -1.09279772], + [0.30543283, -0.02589502], + [-0.00717161, 0.00318087], + [-0.09322739, 1.28177189], + [-0.77740357, 0.74097941], + [-0.43877303, 1.07366684], + [-0.85795321, 0.82980738], + [-0.18430329, 0.52328473], + [-0.28305528, 0.30284991]]) + y_gt = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) def test_iht_fit_sample_wrong_class_obj(): - # Resample the data from sklearn.cluster import KMeans est = KMeans() iht = InstanceHardnessThreshold(estimator=est, random_state=RND_SEED) diff --git a/imblearn/under_sampling/prototype_selection/tests/test_nearmiss.py b/imblearn/under_sampling/prototype_selection/tests/test_nearmiss.py index 9baa4fa66..256f2995e 100644 --- a/imblearn/under_sampling/prototype_selection/tests/test_nearmiss.py +++ 
b/imblearn/under_sampling/prototype_selection/tests/test_nearmiss.py @@ -1,4 +1,8 @@ """Test the module nearmiss.""" +# Authors: Guillaume Lemaitre +# Christos Aridas +# License: MIT + from __future__ import print_function import numpy as np @@ -8,7 +12,6 @@ from imblearn.under_sampling import NearMiss -# Generate a global dataset to use for the 3 version of nearmiss RND_SEED = 0 X = np.array([[1.17737838, -0.2002118], [0.4960075, 0.86130762], @@ -33,7 +36,7 @@ # FIXME remove at the end of the deprecation 0.4 def test_nearmiss_deprecation(): nm = NearMiss(ver3_samp_ngh=3, version=3) - assert_warns(DeprecationWarning, nm.fit, X, Y) + assert_warns(DeprecationWarning, nm.fit_sample, X, Y) def test_nearmiss_wrong_version(): @@ -52,8 +55,6 @@ def test_nm_wrong_nn_obj(): n_neighbors=nn) assert_raises_regex(ValueError, "has to be one of", nm.fit_sample, X, Y) - - # Create the object nn3 = 'rnd' nn = NearestNeighbors(n_neighbors=3) nm3 = NearMiss(ratio=ratio, random_state=RND_SEED, @@ -92,11 +93,9 @@ def test_nm_fit_sample_auto(): [1.15157493, -1.2981518], [-0.54619583, 1.73009918], [0.99272351, -0.11631728]])] - y_gt = [np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])] - for version_idx, version in enumerate(VERSION_NEARMISS): nm = NearMiss(ratio=ratio, random_state=RND_SEED, version=version) @@ -134,15 +133,12 @@ def test_nm_fit_sample_auto_indices(): [1.15157493, -1.2981518], [-0.54619583, 1.73009918], [0.99272351, -0.11631728]])] - y_gt = [np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])] - idx_gt = [np.array([3, 10, 11, 2, 8, 5, 9, 1, 6]), np.array([3, 10, 11, 2, 8, 5, 9, 1, 6]), np.array([3, 10, 11, 0, 5, 8, 14, 4, 12])] - for version_idx, version in enumerate(VERSION_NEARMISS): nm = NearMiss(ratio=ratio, random_state=RND_SEED, version=version, return_indices=True) @@ -154,7 +150,6 @@ def test_nm_fit_sample_auto_indices(): def test_nm_fit_sample_float_ratio(): ratio = .7 - X_gt = [np.array([[0.91464286, 1.61369212], [-0.80809175, -1.09917302], [-0.20497017, -0.26630228], @@ -188,7 +183,6 @@ def test_nm_fit_sample_float_ratio(): [-0.54619583, 1.73009918], [0.99272351, -0.11631728], [0.45713638, 1.31069295]])] - y_gt = [np.array([0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]), np.array([0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]), np.array([0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2])] @@ -231,11 +225,9 @@ def test_nm_fit_sample_nn_obj(): [1.15157493, -1.2981518], [-0.54619583, 1.73009918], [0.99272351, -0.11631728]])] - y_gt = [np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]), np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])] - for version_idx, version in enumerate(VERSION_NEARMISS): nm = NearMiss(ratio=ratio, random_state=RND_SEED, version=version, n_neighbors=nn) diff --git a/imblearn/under_sampling/prototype_selection/tests/test_neighbourhood_cleaning_rule.py b/imblearn/under_sampling/prototype_selection/tests/test_neighbourhood_cleaning_rule.py index f6c43a1a4..bc899f310 100644 --- a/imblearn/under_sampling/prototype_selection/tests/test_neighbourhood_cleaning_rule.py +++ b/imblearn/under_sampling/prototype_selection/tests/test_neighbourhood_cleaning_rule.py @@ -1,14 +1,17 @@ """Test the module neighbourhood cleaning rule.""" +# Authors: Guillaume Lemaitre +# Christos Aridas +# License: MIT + from __future__ import print_function import numpy as np -from numpy.testing import assert_array_equal, assert_equal, assert_raises_regex +from numpy.testing import assert_array_equal, 
assert_raises_regex from sklearn.neighbors import NearestNeighbors from imblearn.under_sampling import NeighbourhoodCleaningRule -# Generate a global dataset to use RND_SEED = 0 X = np.array([[1.57737838, 0.1997882], [0.8960075, 0.46130762], [0.34096173, 0.50947647], [-0.91735824, 0.93110278], @@ -21,65 +24,103 @@ Y = np.array([1, 2, 1, 1, 2, 1, 2, 2, 1, 2, 0, 0, 2, 1, 2]) -def test_ncr_init(): - # Define a ratio - ncr = NeighbourhoodCleaningRule(random_state=RND_SEED) - - assert_equal(ncr.n_neighbors, 3) - assert_equal(ncr.n_jobs, 1) - assert_equal(ncr.random_state, RND_SEED) +def test_ncr_error(): + threshold_cleaning = -10 + assert_raises_regex(ValueError, "'threshold_cleaning' is a value between" + " 0 and 1.", NeighbourhoodCleaningRule( + threshold_cleaning=threshold_cleaning).fit_sample, + X, Y) + threshold_cleaning = 10 + assert_raises_regex(ValueError, "'threshold_cleaning' is a value between" + " 0 and 1.", NeighbourhoodCleaningRule( + threshold_cleaning=threshold_cleaning).fit_sample, + X, Y) def test_ncr_fit_sample(): - # Resample the data ncr = NeighbourhoodCleaningRule(random_state=RND_SEED) X_resampled, y_resampled = ncr.fit_sample(X, Y) - X_gt = np.array([[-1.20809175, -1.49917302], [-0.60497017, -0.66630228], + X_gt = np.array([[0.34096173, 0.50947647], + [-0.91735824, 0.93110278], + [-0.20413357, 0.64628718], + [0.35967591, 2.61186964], + [0.90701028, -0.57636928], + [-1.20809175, -1.49917302], + [-0.60497017, -0.66630228], + [1.39272351, -0.51631728], + [-1.55581933, 1.09609604], + [1.55157493, -1.6981518]]) + y_gt = np.array([1, 1, 1, 2, 2, 0, 0, 2, 1, 2]) + assert_array_equal(X_resampled, X_gt) + assert_array_equal(y_resampled, y_gt) + + +def test_ncr_fit_sample_mode(): + ncr = NeighbourhoodCleaningRule(random_state=RND_SEED, + kind_sel='mode') + X_resampled, y_resampled = ncr.fit_sample(X, Y) + + X_gt = np.array([[0.34096173, 0.50947647], [-0.91735824, 0.93110278], - [0.35967591, 2.61186964], [-1.55581933, 1.09609604], + [-0.20413357, 0.64628718], + [0.35967591, 2.61186964], + [0.90701028, -0.57636928], + [-1.20809175, -1.49917302], + [-0.60497017, -0.66630228], + [1.39272351, -0.51631728], + [-1.55581933, 1.09609604], [1.55157493, -1.6981518]]) - y_gt = np.array([0, 0, 1, 2, 1, 2]) + y_gt = np.array([1, 1, 1, 2, 2, 0, 0, 2, 1, 2]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) def test_ncr_fit_sample_with_indices(): - # Resample the data ncr = NeighbourhoodCleaningRule(return_indices=True, random_state=RND_SEED) X_resampled, y_resampled, idx_under = ncr.fit_sample(X, Y) - X_gt = np.array([[-1.20809175, -1.49917302], [-0.60497017, -0.66630228], + X_gt = np.array([[0.34096173, 0.50947647], [-0.91735824, 0.93110278], - [0.35967591, 2.61186964], [-1.55581933, 1.09609604], + [-0.20413357, 0.64628718], + [0.35967591, 2.61186964], + [0.90701028, -0.57636928], + [-1.20809175, -1.49917302], + [-0.60497017, -0.66630228], + [1.39272351, -0.51631728], + [-1.55581933, 1.09609604], [1.55157493, -1.6981518]]) - y_gt = np.array([0, 0, 1, 2, 1, 2]) - idx_gt = np.array([10, 11, 3, 7, 13, 14]) + y_gt = np.array([1, 1, 1, 2, 2, 0, 0, 2, 1, 2]) + idx_gt = np.array([2, 3, 5, 7, 9, 10, 11, 12, 13, 14]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) assert_array_equal(idx_under, idx_gt) def test_ncr_fit_sample_nn_obj(): - # Resample the data - nn = NearestNeighbors(n_neighbors=3) + nn = NearestNeighbors(n_neighbors=4) ncr = NeighbourhoodCleaningRule( return_indices=True, random_state=RND_SEED, n_neighbors=nn) X_resampled, y_resampled, 
idx_under = ncr.fit_sample(X, Y) - X_gt = np.array([[-1.20809175, -1.49917302], [-0.60497017, -0.66630228], + X_gt = np.array([[0.34096173, 0.50947647], [-0.91735824, 0.93110278], - [0.35967591, 2.61186964], [-1.55581933, 1.09609604], + [-0.20413357, 0.64628718], + [0.35967591, 2.61186964], + [0.90701028, -0.57636928], + [-1.20809175, -1.49917302], + [-0.60497017, -0.66630228], + [1.39272351, -0.51631728], + [-1.55581933, 1.09609604], [1.55157493, -1.6981518]]) - y_gt = np.array([0, 0, 1, 2, 1, 2]) - idx_gt = np.array([10, 11, 3, 7, 13, 14]) + y_gt = np.array([1, 1, 1, 2, 2, 0, 0, 2, 1, 2]) + idx_gt = np.array([2, 3, 5, 7, 9, 10, 11, 12, 13, 14]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) assert_array_equal(idx_under, idx_gt) def test_ncr_wrong_nn_obj(): - # Resample the data nn = 'rnd' ncr = NeighbourhoodCleaningRule( return_indices=True, random_state=RND_SEED, n_neighbors=nn) diff --git a/imblearn/under_sampling/prototype_selection/tests/test_one_sided_selection.py b/imblearn/under_sampling/prototype_selection/tests/test_one_sided_selection.py index 7d79586bd..2cf7ab903 100644 --- a/imblearn/under_sampling/prototype_selection/tests/test_one_sided_selection.py +++ b/imblearn/under_sampling/prototype_selection/tests/test_one_sided_selection.py @@ -1,4 +1,8 @@ """Test the module one-sided selection.""" +# Authors: Guillaume Lemaitre +# Christos Aridas +# License: MIT + from __future__ import print_function import numpy as np @@ -8,7 +12,6 @@ from imblearn.under_sampling import OneSidedSelection -# Generate a global dataset to use RND_SEED = 0 X = np.array([[-0.3879569, 0.6894251], [-0.09322739, 1.28177189], [-0.77740357, 0.74097941], [0.91542919, -0.65453327], @@ -22,7 +25,6 @@ def test_oss_init(): - # Define a ratio oss = OneSidedSelection(random_state=RND_SEED) assert_equal(oss.n_seeds_S, 1) @@ -31,7 +33,6 @@ def test_oss_init(): def test_oss_fit_sample(): - # Resample the data oss = OneSidedSelection(random_state=RND_SEED) X_resampled, y_resampled = oss.fit_sample(X, Y) @@ -47,7 +48,6 @@ def test_oss_fit_sample(): def test_oss_fit_sample_with_indices(): - # Resample the data oss = OneSidedSelection(return_indices=True, random_state=RND_SEED) X_resampled, y_resampled, idx_under = oss.fit_sample(X, Y) @@ -65,7 +65,6 @@ def test_oss_fit_sample_with_indices(): def test_oss_with_object(): - # Resample the data knn = KNeighborsClassifier(n_neighbors=1) oss = OneSidedSelection(random_state=RND_SEED, n_neighbors=knn) X_resampled, y_resampled = oss.fit_sample(X, Y) @@ -79,7 +78,6 @@ def test_oss_with_object(): y_gt = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) - # Resample the data knn = 1 oss = OneSidedSelection(random_state=RND_SEED, n_neighbors=knn) X_resampled, y_resampled = oss.fit_sample(X, Y) @@ -88,7 +86,6 @@ def test_oss_with_object(): def test_oss_with_wrong_object(): - # Resample the data knn = 'rnd' oss = OneSidedSelection(random_state=RND_SEED, n_neighbors=knn) assert_raises_regex(ValueError, "has to be a int", diff --git a/imblearn/under_sampling/prototype_selection/tests/test_random_under_sampler.py b/imblearn/under_sampling/prototype_selection/tests/test_random_under_sampler.py index a2d3c6022..76f5becbe 100644 --- a/imblearn/under_sampling/prototype_selection/tests/test_random_under_sampler.py +++ b/imblearn/under_sampling/prototype_selection/tests/test_random_under_sampler.py @@ -1,4 +1,8 @@ """Test the module random under sampler.""" +# Authors: Guillaume Lemaitre +# 
Christos Aridas +# License: MIT + from __future__ import print_function from collections import Counter @@ -8,9 +12,7 @@ from imblearn.under_sampling import RandomUnderSampler -# Generate a global dataset to use RND_SEED = 0 -# Data generated for the toy example X = np.array([[0.04352327, -0.20515826], [0.92923648, 0.76103773], [0.20792588, 1.49407907], [0.47104475, 0.44386323], [0.22950086, 0.33367433], [0.15490546, 0.3130677], @@ -19,30 +21,23 @@ Y = np.array([1, 0, 1, 0, 1, 1, 1, 1, 0, 1]) -def test_rus_init(): - # Define a ratio - ratio = 'auto' - rus = RandomUnderSampler(ratio=ratio, random_state=RND_SEED) - - assert_equal(rus.random_state, RND_SEED) - - def test_rus_fit_sample(): - # Resample the data - rus = RandomUnderSampler(random_state=RND_SEED) + rus = RandomUnderSampler(random_state=RND_SEED, + replacement=True) X_resampled, y_resampled = rus.fit_sample(X, Y) X_gt = np.array([[0.92923648, 0.76103773], [0.47104475, 0.44386323], [0.13347175, 0.12167502], [0.09125309, -0.85409574], [0.12372842, 0.6536186], [0.04352327, -0.20515826]]) y_gt = np.array([0, 0, 0, 1, 1, 1]) + assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) def test_rus_fit_sample_with_indices(): - # Resample the data - rus = RandomUnderSampler(return_indices=True, random_state=RND_SEED) + rus = RandomUnderSampler(return_indices=True, random_state=RND_SEED, + replacement=True) X_resampled, y_resampled, idx_under = rus.fit_sample(X, Y) X_gt = np.array([[0.92923648, 0.76103773], [0.47104475, 0.44386323], @@ -56,9 +51,9 @@ def test_rus_fit_sample_with_indices(): def test_rus_fit_sample_half(): - # Resample the data ratio = 0.5 - rus = RandomUnderSampler(ratio=ratio, random_state=RND_SEED) + rus = RandomUnderSampler(ratio=ratio, random_state=RND_SEED, + replacement=True) X_resampled, y_resampled = rus.fit_sample(X, Y) X_gt = np.array([[0.92923648, 0.76103773], [0.47104475, 0.44386323], @@ -72,16 +67,11 @@ def test_rus_fit_sample_half(): def test_multiclass_fit_sample(): - # Make y to be multiclass y = Y.copy() y[5] = 2 y[6] = 2 - - # Resample the data rus = RandomUnderSampler(random_state=RND_SEED) X_resampled, y_resampled = rus.fit_sample(X, y) - - # Check the size of y count_y_res = Counter(y_resampled) assert_equal(count_y_res[0], 2) assert_equal(count_y_res[1], 2) diff --git a/imblearn/under_sampling/prototype_selection/tests/test_repeated_edited_nearest_neighbours.py b/imblearn/under_sampling/prototype_selection/tests/test_repeated_edited_nearest_neighbours.py index efb50b9e5..390034dbb 100644 --- a/imblearn/under_sampling/prototype_selection/tests/test_repeated_edited_nearest_neighbours.py +++ b/imblearn/under_sampling/prototype_selection/tests/test_repeated_edited_nearest_neighbours.py @@ -1,4 +1,8 @@ """Test the module repeated edited nearest neighbour.""" +# Authors: Guillaume Lemaitre +# Christos Aridas +# License: MIT + from __future__ import print_function import numpy as np @@ -8,7 +12,6 @@ from imblearn.under_sampling import RepeatedEditedNearestNeighbours -# Generate a global dataset to use RND_SEED = 0 X = np.array([[-0.12840393, 0.66446571], [1.32319756, -0.13181616], [0.04296502, -0.37981873], [0.83631853, 0.18569783], @@ -37,7 +40,6 @@ def test_renn_init(): - # Define a ratio renn = RepeatedEditedNearestNeighbours(random_state=RND_SEED) assert_equal(renn.n_neighbors, 3) @@ -47,7 +49,6 @@ def test_renn_init(): def test_renn_iter_wrong(): - # Create the object max_iter = -1 renn = RepeatedEditedNearestNeighbours( max_iter=max_iter, random_state=RND_SEED) @@ -55,7 +56,6 @@ def 
test_renn_iter_wrong(): def test_renn_fit_sample(): - # Resample the data renn = RepeatedEditedNearestNeighbours(random_state=RND_SEED) X_resampled, y_resampled = renn.fit_sample(X, Y) @@ -81,7 +81,6 @@ def test_renn_fit_sample(): def test_renn_fit_sample_with_indices(): - # Resample the data renn = RepeatedEditedNearestNeighbours( return_indices=True, random_state=RND_SEED) X_resampled, y_resampled, idx_under = renn.fit_sample(X, Y) @@ -113,7 +112,6 @@ def test_renn_fit_sample_with_indices(): def test_renn_fit_sample_mode_object(): - # Resample the data renn = RepeatedEditedNearestNeighbours( random_state=RND_SEED, kind_sel='mode') X_resampled, y_resampled = renn.fit_sample(X, Y) @@ -144,7 +142,6 @@ def test_renn_fit_sample_mode_object(): def test_renn_fit_sample_mode(): - # Resample the data nn = NearestNeighbors(n_neighbors=4) renn = RepeatedEditedNearestNeighbours( n_neighbors=nn, random_state=RND_SEED, kind_sel='mode') @@ -176,7 +173,6 @@ def test_renn_fit_sample_mode(): def test_renn_not_good_object(): - # Resample the data nn = 'rnd' renn = RepeatedEditedNearestNeighbours( n_neighbors=nn, random_state=RND_SEED, kind_sel='mode') diff --git a/imblearn/under_sampling/prototype_selection/tests/test_tomek_links.py b/imblearn/under_sampling/prototype_selection/tests/test_tomek_links.py index 565ec4b33..fbf76f10a 100644 --- a/imblearn/under_sampling/prototype_selection/tests/test_tomek_links.py +++ b/imblearn/under_sampling/prototype_selection/tests/test_tomek_links.py @@ -1,4 +1,8 @@ """Test the module Tomek's links.""" +# Authors: Guillaume Lemaitre +# Christos Aridas +# License: MIT + from __future__ import print_function import numpy as np @@ -6,7 +10,6 @@ from imblearn.under_sampling import TomekLinks -# Generate a global dataset to use RND_SEED = 0 X = np.array([[0.31230513, 0.1216318], [0.68481731, 0.51935141], [1.34192108, -0.13367336], [0.62366841, -0.21312976], @@ -22,7 +25,6 @@ def test_tl_init(): - # Define a ratio tl = TomekLinks(random_state=RND_SEED) assert_equal(tl.n_jobs, 1) @@ -30,7 +32,6 @@ def test_tl_init(): def test_tl_fit_sample(): - # Resample the data tl = TomekLinks(random_state=RND_SEED) X_resampled, y_resampled = tl.fit_sample(X, Y) @@ -49,7 +50,6 @@ def test_tl_fit_sample(): def test_tl_fit_sample_with_indices(): - # Resample the data tl = TomekLinks(return_indices=True, random_state=RND_SEED) X_resampled, y_resampled, idx_under = tl.fit_sample(X, Y) diff --git a/imblearn/under_sampling/prototype_selection/tomek_links.py b/imblearn/under_sampling/prototype_selection/tomek_links.py index 276efd812..75133ec4c 100644 --- a/imblearn/under_sampling/prototype_selection/tomek_links.py +++ b/imblearn/under_sampling/prototype_selection/tomek_links.py @@ -7,52 +7,52 @@ from __future__ import division, print_function -from collections import Counter - import numpy as np from sklearn.neighbors import NearestNeighbors -from ...base import BaseBinarySampler +from ..base import BaseCleaningSampler -class TomekLinks(BaseBinarySampler): +class TomekLinks(BaseCleaningSampler): """Class to perform under-sampling by removing Tomek's links. Parameters ---------- + ratio : str, dict, or callable, optional (default='auto') + Ratio to use for resampling the data set. 
+ + - If ``str``, has to be one of: (i) ``'minority'``: resample the + minority class; (ii) ``'majority'``: resample the majority class, + (iii) ``'not minority'``: resample all classes apart from the minority + class, (iv) ``'all'``: resample all classes, and (v) ``'auto'``: + corresponds to ``'all'`` for over-sampling methods and ``'not + minority'`` for under-sampling methods. The classes targeted will be + over-sampled or under-sampled to achieve an equal number of samples + as the majority or minority class. + - If ``dict``, the keys correspond to the targeted classes. The values + correspond to the desired number of samples. + - If callable, a function taking ``y`` and returning a ``dict``. The keys + correspond to the targeted classes. The values correspond to the + desired number of samples. + return_indices : bool, optional (default=False) Whether or not to return the indices of the samples randomly selected from the majority class. random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by np.random. + If int, ``random_state`` is the seed used by the random number + generator; If ``RandomState`` instance, random_state is the random + number generator; If ``None``, the random number generator is the + ``RandomState`` instance used by ``np.random``. n_jobs : int, optional (default=1) The number of threads to open if possible. - Attributes - ---------- - min_c_ : str or int - The identifier of the minority class. - - max_c_ : str or int - The identifier of the majority class. - - stats_c_ : dict of str/int : int - A dictionary in which the number of occurences of each class is - reported. - - X_shape_ : tuple of int - Shape of the data `X` during fitting. - Notes ----- This method is based on [1]_. - It does not support multi-class sampling. + Supports multi-class resampling. Examples -------- @@ -60,7 +60,7 @@ class TomekLinks(BaseBinarySampler): >>> from collections import Counter >>> from sklearn.datasets import make_classification >>> from imblearn.under_sampling import \ - TomekLinks # doctest: +NORMALIZE_WHITESPACE +TomekLinks # doctest: +NORMALIZE_WHITESPACE >>> X, y = make_classification(n_classes=2, class_sep=2, ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10) @@ -78,8 +78,10 @@ class TomekLinks(BaseBinarySampler): """ - def __init__(self, return_indices=False, random_state=None, n_jobs=1): - super(TomekLinks, self).__init__(random_state=random_state) + def __init__(self, ratio='auto', return_indices=False, + random_state=None, n_jobs=1): + super(TomekLinks, self).__init__(ratio=ratio, + random_state=random_state) self.return_indices = return_indices self.n_jobs = n_jobs @@ -108,25 +110,20 @@ def is_tomek(y, nn_index, class_type): that are Tomek links. """ - - # Initialize the boolean result as false. links = np.zeros(len(y), dtype=bool) - # Loop through each sample and looks whether it belongs to the minority - # class. If it does, we don't consider it since we want to keep all - # minority samples. If, however, it belongs to the majority sample we - # look at its first neighbour. If its closest neighbour also has the - # current sample as its closest neighbour, the two form a Tomek link. 
- for ind, ele in enumerate(y): + # find which classes should not be considered + class_excluded = [c for c in np.unique(y) if c not in class_type] - if ele == class_type: + # there is a Tomek link between two samples if they are both nearest + # neighbors of each other. + for index_sample, target_sample in enumerate(y): if target_sample in class_excluded: continue - if y[nn_index[ind]] == class_type: - - # If they form a tomek link, put a True marker on this sample. - if nn_index[nn_index[ind]] == ind: - links[ind] = True + if y[nn_index[index_sample]] != target_sample: + if nn_index[nn_index[index_sample]] == index_sample: + links[index_sample] = True return links @@ -160,18 +157,10 @@ def _sample(self, X, y): nn.fit(X) nns = nn.kneighbors(X, return_distance=False)[:, 1] - # Send the information to is_tomek function to get boolean vector back - self.logger.debug('Looking for majority Tomek links ...') - links = self.is_tomek(y, nns, self.min_c_) - - self.logger.info('Under-sampling performed: %s', - Counter(y[np.logical_not(links)])) + links = self.is_tomek(y, nns, self.ratio_) - # Check if the indices of the samples selected should be returned too if self.return_indices: - # Return the indices of interest return (X[np.logical_not(links)], y[np.logical_not(links)], np.flatnonzero(np.logical_not(links))) else: - # Return data set without majority Tomek links. return X[np.logical_not(links)], y[np.logical_not(links)] diff --git a/imblearn/utils/__init__.py b/imblearn/utils/__init__.py index d9fab5e5e..a767c20aa 100644 --- a/imblearn/utils/__init__.py +++ b/imblearn/utils/__init__.py @@ -3,6 +3,12 @@ """ from .validation import check_neighbors_object +from .validation import check_target_type +from .validation import hash_X_y +from .validation import check_ratio -__all__ = ['check_neighbors_object'] +__all__ = ['check_neighbors_object', + 'check_target_type', + 'hash_X_y', + 'check_ratio'] diff --git a/imblearn/utils/deprecation.py b/imblearn/utils/deprecation.py new file mode 100644 index 000000000..18a782f89 --- /dev/null +++ b/imblearn/utils/deprecation.py @@ -0,0 +1,51 @@ +"""Utilities for deprecation""" + +# Authors: Guillaume Lemaitre +# License: MIT + +import warnings + + +def deprecate_parameter(sampler, version_deprecation, param_deprecated, + new_param=None): + """Helper to deprecate a parameter in favor of another one. + + Parameters + ---------- + sampler : object, + The object which will be inspected. + + version_deprecation : str, + The version from which the parameter will be deprecated. The format + should be ``'x.y'``. + + param_deprecated : str, + The parameter being deprecated. + + new_param : str, + The parameter used instead of the deprecated parameter. By default, no + parameter is expected. + + Returns + ------- + None + + """ + warnings.simplefilter("always", DeprecationWarning) + if new_param is None: + if getattr(sampler, param_deprecated) is not None: + warnings.warn("'{}' is deprecated from {} and will be removed in" + " {}.".format(param_deprecated, + version_deprecation, + str(float(version_deprecation) + 0.2)), + category=DeprecationWarning) + else: + if getattr(sampler, param_deprecated) is not None: + warnings.warn("'{}' is deprecated from {} and will be removed in" + " {}. 
Use '{}' instead.".format( + param_deprecated, + version_deprecation, + str(float(version_deprecation) + 0.2), + new_param), + category=DeprecationWarning) + setattr(sampler, new_param, getattr(sampler, param_deprecated)) diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index 76f30b4a4..cbc223f13 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -8,35 +8,34 @@ import sys import traceback + +from collections import Counter + import numpy as np +from sklearn.datasets import make_classification from sklearn.utils.estimator_checks import _yield_all_checks \ as sklearn_yield_all_checks, check_estimator \ as sklearn_check_estimator, check_parameters_default_constructible from sklearn.exceptions import NotFittedError from sklearn.utils.testing import (assert_warns, assert_raises_regex, - assert_equal, assert_true, - set_random_state) + assert_true, set_random_state, + assert_equal) from imblearn.base import SamplerMixin -from imblearn.utils.testing import binary_estimators, multiclass_estimators +from imblearn.over_sampling.base import BaseOverSampler +from imblearn.under_sampling.base import BaseCleaningSampler, BaseUnderSampler +from imblearn.ensemble.base import BaseEnsembleSampler def _yield_sampler_checks(name, Estimator): - # Get only the name of binary and multiclass samplers - binary_samplers = tuple([c[0] for c in binary_estimators()]) - multiclass_samplers = tuple([c[0] for c in multiclass_estimators()]) - if name in binary_samplers: - yield check_continuous_warning - yield check_multiclass_warning - if name in multiclass_samplers: - yield check_continuous_warning - if 'ratio' in Estimator().get_params().keys(): - yield check_samplers_ratio_error + yield check_target_type yield check_samplers_one_label yield check_samplers_no_fit_error yield check_samplers_X_consistancy_sample yield check_samplers_fit + yield check_samplers_fit_sample + yield check_samplers_ratio_fit_sample def _yield_all_checks(name, Estimator): @@ -72,7 +71,7 @@ def check_estimator(Estimator): check(name, Estimator) -def check_continuous_warning(name, Estimator): +def check_target_type(name, Estimator): X = np.random.random((20, 2)) y = np.linspace(0, 1, 20) estimator = Estimator() @@ -177,50 +176,80 @@ def check_samplers_no_fit_error(name, Sampler): sampler.sample, X, y) -def check_samplers_ratio_error(name, Sampler): - sampler = Sampler() - X = np.random.random((20, 2)) - y = np.array([1] * 5 + [0] * 15) - - ratio = 1000 - sampler.set_params(**{'ratio': ratio}) - assert_raises_regex(ValueError, "Ratio cannot be greater than one.", - sampler.fit, X, y) - ratio = -1.0 - sampler.set_params(**{'ratio': ratio}) - assert_raises_regex(ValueError, "Ratio cannot be negative.", - sampler.fit, X, y) - ratio = 'rnd' - sampler.set_params(**{'ratio': ratio}) - assert_raises_regex(ValueError, "Unknown string for the parameter ratio.", - sampler.fit, X, y) - ratio = [.5, .5] - sampler.set_params(**{'ratio': ratio}) - assert_raises_regex(ValueError, "Unknown parameter type for ratio.", - sampler.fit, X, y) - ratio = 1 / 1000 - sampler.set_params(**{'ratio': ratio}) - assert_raises_regex(RuntimeError, "The ratio requested at initialisation", - sampler.fit, X, y) - - def check_samplers_X_consistancy_sample(name, Sampler): sampler = Sampler() - X = np.random.random((20, 2)) - y = np.array([1] * 15 + [0] * 5) + X = np.random.random((30, 2)) + y = np.array([1] * 20 + [0] * 10) sampler.fit(X, y) - X_different = np.random.random((30, 2)) - y_different = y = 
np.array([1] * 15 + [0] * 15) - assert_raises_regex(RuntimeError, "to be the one earlier fitted", + X_different = np.random.random((40, 2)) + y_different = np.array([1] * 25 + [0] * 15) + assert_raises_regex(RuntimeError, "X and y need to be same array earlier", sampler.sample, X_different, y_different) def check_samplers_fit(name, Sampler): sampler = Sampler() - X = np.random.random((20, 2)) - y = np.array([1] * 15 + [0] * 5) + X = np.random.random((30, 2)) + y = np.array([1] * 20 + [0] * 10) sampler.fit(X, y) - assert_equal(sampler.min_c_, 0) - assert_equal(sampler.maj_c_, 1) - assert_equal(sampler.stats_c_[0], 5) - assert_equal(sampler.stats_c_[1], 15) + assert_true(hasattr(sampler, 'ratio_')) + + +def check_samplers_fit_sample(name, Sampler): + sampler = Sampler(random_state=0) + X, y = make_classification(n_samples=1000, n_classes=3, + n_informative=4, weights=[0.2, 0.3, 0.5], + random_state=0) + target_stats = Counter(y) + X_res, y_res = sampler.fit_sample(X, y) + if isinstance(sampler, BaseOverSampler): + target_stats_res = Counter(y_res) + n_samples = max(target_stats.values()) + assert_true(all(value >= n_samples + for value in target_stats_res.values())) + elif isinstance(sampler, BaseUnderSampler): + n_samples = min(target_stats.values()) + assert_true(all(value == n_samples + for value in Counter(y_res).values())) + elif isinstance(sampler, BaseCleaningSampler): + target_stats_res = Counter(y_res) + class_minority = min(target_stats, key=target_stats.get) + assert_true( + all(target_stats[class_sample] > target_stats_res[class_sample] + for class_sample in target_stats.keys() + if class_sample != class_minority)) + elif isinstance(sampler, BaseEnsembleSampler): + y_ensemble = y_res[0] + n_samples = min(target_stats.values()) + assert_true(all(value == n_samples + for value in Counter(y_ensemble).values())) + + +def check_samplers_ratio_fit_sample(name, Sampler): + # in this test we will force all samplers to leave class 1 untouched + X, y = make_classification(n_samples=1000, n_classes=3, + n_informative=4, weights=[0.2, 0.3, 0.5], + random_state=0) + target_stats = Counter(y) + sampler = Sampler(random_state=0) + if isinstance(sampler, BaseOverSampler): + ratio = {2: 498, 0: 498} + sampler.set_params(ratio=ratio) + X_res, y_res = sampler.fit_sample(X, y) + assert_equal(target_stats[1], Counter(y_res)[1]) + elif isinstance(sampler, BaseUnderSampler): + ratio = {2: 201, 0: 201} + sampler.set_params(ratio=ratio) + X_res, y_res = sampler.fit_sample(X, y) + assert_equal(target_stats[1], Counter(y_res)[1]) + elif isinstance(sampler, BaseCleaningSampler): + ratio = {2: 201, 0: 201} + sampler.set_params(ratio=ratio) + X_res, y_res = sampler.fit_sample(X, y) + assert_equal(target_stats[1], Counter(y_res)[1]) + elif isinstance(sampler, BaseEnsembleSampler): + ratio = {2: 201, 0: 201} + sampler.set_params(ratio=ratio) + X_res, y_res = sampler.fit_sample(X, y) + y_ensemble = y_res[0] + assert_equal(target_stats[1], Counter(y_ensemble)[1]) diff --git a/imblearn/utils/testing.py b/imblearn/utils/testing.py index e7dda795d..d6190c1c7 100644 --- a/imblearn/utils/testing.py +++ b/imblearn/utils/testing.py @@ -21,11 +21,6 @@ # some strange ones DONT_TEST = [] -# binary samplers -BINARY_ESTIMATORS = ["BalanceCascade", "ADASYN", "SMOTE", "SMOTEENN", - "SMOTETomek", "InstanceHardnessThreshold", - "OneSidedSelection", "TomekLinks"] - def all_estimators(include_meta_estimators=False, include_other=False, type_filter=None, @@ -125,31 +120,3 @@ def is_abstract(c): # itemgetter is used to ensure the 
sort does not extend to the 2nd item of # the tuple return sorted(set(estimators), key=itemgetter(0)) - - -def binary_estimators(): - """Get a list of the binary estimators from imblearn. - - Returns - ------- - estimators : list of tuples - List of (name, class), where ``name`` is the class as string and - ``class`` is the actual type of the class. - """ - estimators = list(all_estimators()) - # remove the estimators which are not marked as binary - return tuple([c for c in estimators if c[0] in BINARY_ESTIMATORS]) - - -def multiclass_estimators(): - """Get a list of the multiclass estimators from imblearn. - - Returns - ------- - estimators : list of tuples - List of (name, class), where ``name`` is the class as string and - ``class`` is the actual type of the class. - """ - estimators = list(all_estimators()) - # remove the estimators which are not marked as binary - return tuple([c for c in estimators if c[0] not in BINARY_ESTIMATORS]) diff --git a/imblearn/utils/tests/test_deprecation.py b/imblearn/utils/tests/test_deprecation.py new file mode 100644 index 000000000..2fbc903a7 --- /dev/null +++ b/imblearn/utils/tests/test_deprecation.py @@ -0,0 +1,21 @@ +"""Test for the deprecation helper""" + +# Authors: Guillaume Lemaitre +# License: MIT + +from sklearn.utils.testing import assert_warns_message + +from imblearn.utils.deprecation import deprecate_parameter + + +class Sampler(object): + def __init__(self): + self.a = 'something' + self.b = 'something' + + +def test_deprecate_parameter(): + assert_warns_message(DeprecationWarning, "is deprecated from", + deprecate_parameter, Sampler(), '0.2', 'a') + assert_warns_message(DeprecationWarning, "Use 'b' instead.", + deprecate_parameter, Sampler(), '0.2', 'a', 'b') diff --git a/imblearn/utils/tests/test_estimator_checks.py b/imblearn/utils/tests/test_estimator_checks.py index 482b0c14a..599039795 100644 --- a/imblearn/utils/tests/test_estimator_checks.py +++ b/imblearn/utils/tests/test_estimator_checks.py @@ -1,3 +1,4 @@ +"""Estimator tests - adapted from scikit-learn""" import scipy.sparse as sp import numpy as np import sys diff --git a/imblearn/utils/tests/test_testing.py b/imblearn/utils/tests/test_testing.py index acdf7256a..d116018ec 100644 --- a/imblearn/utils/tests/test_testing.py +++ b/imblearn/utils/tests/test_testing.py @@ -1,3 +1,8 @@ +"""Test for the testing module""" +# Authors: Guillaume Lemaitre +# Christos Aridas +# License: MIT + from sklearn.utils.testing import assert_raises_regex from imblearn.base import SamplerMixin diff --git a/imblearn/utils/tests/test_validation.py b/imblearn/utils/tests/test_validation.py index eb5940e4b..a17337de1 100644 --- a/imblearn/utils/tests/test_validation.py +++ b/imblearn/utils/tests/test_validation.py @@ -1,8 +1,20 @@ -from imblearn.utils import check_neighbors_object +"""Test for the validation helper""" +# Authors: Guillaume Lemaitre +# Christos Aridas +# License: MIT + +from collections import Counter + +import numpy as np from sklearn.neighbors.base import KNeighborsMixin from sklearn.neighbors import NearestNeighbors -from sklearn.utils.testing import assert_equal, assert_raises_regex + +from sklearn.utils.testing import (assert_equal, assert_raises_regex, + assert_warns_message) + +from imblearn.utils import check_neighbors_object +from imblearn.utils import check_ratio def test_check_neighbors_object(): @@ -19,3 +31,142 @@ def test_check_neighbors_object(): n_neighbors = 'rnd' assert_raises_regex(ValueError, "has to be one of", check_neighbors_object, name, n_neighbors) + + +def 
test_check_ratio_error(): + assert_raises_regex(ValueError, "'sampling_type' should be one of", + check_ratio, 'auto', np.array([1, 2, 3]), + 'rnd') + assert_raises_regex(ValueError, "The target 'y' needs to have more than 1" + " class.", check_ratio, 'auto', np.ones((10, )), + 'over-sampling') + assert_raises_regex(ValueError, "When 'ratio' is a string, it needs to be" + " one of", check_ratio, 'rnd', np.array([1, 2, 3]), + 'over-sampling') + + +def test_ratio_all_over_sampling(): + y = np.array([1] * 50 + [2] * 100 + [3] * 25) + ratio = check_ratio('all', y, 'over-sampling') + assert_equal(ratio, {1: 50, 2: 0, 3: 75}) + ratio = check_ratio('auto', y, 'over-sampling') + assert_equal(ratio, {1: 50, 2: 0, 3: 75}) + + +def test_ratio_all_under_sampling(): + y = np.array([1] * 50 + [2] * 100 + [3] * 25) + ratio = check_ratio('all', y, 'under-sampling') + assert_equal(ratio, {1: 25, 2: 25, 3: 25}) + + +def test_ratio_majority_over_sampling(): + assert_raises_regex(ValueError, "'ratio'='majority' cannot be used with" + " over-sampler.", check_ratio, 'majority', + np.array([1, 2, 3]), 'over-sampling') + + +def test_ratio_majority_under_sampling(): + y = np.array([1] * 50 + [2] * 100 + [3] * 25) + ratio = check_ratio('majority', y, 'under-sampling') + assert_equal(ratio, {2: 25}) + + +def test_ratio_not_minority_over_sampling(): + y = np.array([1] * 50 + [2] * 100 + [3] * 25) + ratio = check_ratio('not minority', y, 'over-sampling') + assert_equal(ratio, {1: 50, 2: 0}) + + +def test_ratio_not_minority_under_sampling(): + y = np.array([1] * 50 + [2] * 100 + [3] * 25) + ratio = check_ratio('not minority', y, 'under-sampling') + assert_equal(ratio, {1: 25, 2: 25}) + ratio = check_ratio('auto', y, 'under-sampling') + assert_equal(ratio, {1: 25, 2: 25}) + + +def test_ratio_minority_over_sampling(): + y = np.array([1] * 50 + [2] * 100 + [3] * 25) + ratio = check_ratio('minority', y, 'over-sampling') + assert_equal(ratio, {3: 75}) + + +def test_ratio_minority_under_sampling(): + assert_raises_regex(ValueError, "'ratio'='minority' cannot be used with" + " under-sampler.", check_ratio, 'minority', + np.array([1, 2, 3]), 'under-sampling') + + +def test_ratio_dict_error(): + y = np.array([1] * 50 + [2] * 100 + [3] * 25) + ratio = {10: 10} + assert_raises_regex(ValueError, "are not present in the data.", + check_ratio, ratio, y, 'over-sampling') + ratio = {1: 45, 2: 100, 3: 70} + assert_raises_regex(ValueError, "With over-sampling methods, the number" + " of samples in a class should be greater than or" + " equal to the original number of samples." + " Originally, there are 50 samples and 45 samples" + " are asked.", check_ratio, ratio, y, 'over-sampling') + assert_raises_regex(ValueError, "With under-sampling methods, the number" + " of samples in a class should be less than or equal" + " to the original number of samples. 
Originally," + " there are 25 samples and 70 samples are asked.", + check_ratio, ratio, y, 'under-sampling') + + +def test_ratio_dict_over_sampling(): + y = np.array([1] * 50 + [2] * 100 + [3] * 25) + ratio = {1: 70, 2: 100, 3: 70} + ratio_ = check_ratio(ratio, y, 'over-sampling') + assert_equal(ratio_, {1: 20, 2: 0, 3: 45}) + ratio = {1: 70, 2: 140, 3: 70} + assert_warns_message(UserWarning, "After over-sampling, the number of" + " samples (140) in class 2 will be larger than the" + " number of samples in the majority class (class #2" + " -> 100)", check_ratio, ratio, y, 'over-sampling') + + +def test_ratio_dict_under_sampling(): + y = np.array([1] * 50 + [2] * 100 + [3] * 25) + ratio = {1: 30, 2: 45, 3: 25} + ratio_ = check_ratio(ratio, y, 'under-sampling') + assert_equal(ratio_, ratio) + + +def test_ratio_float_error(): + y = np.array([1] * 50 + [2] * 100 + [3] * 25) + ratio = -10 + assert_raises_regex(ValueError, "When 'ratio' is a float, it should be in" + " the range", check_ratio, ratio, y, 'under-sampling') + ratio = 10 + assert_raises_regex(ValueError, "When 'ratio' is a float, it should be in" + " the range", check_ratio, ratio, y, 'under-sampling') + + +def test_ratio_float_over_sampling(): + y = np.array([1] * 50 + [2] * 100 + [3] * 25) + ratio = 0.5 + ratio_ = check_ratio(ratio, y, 'over-sampling') + assert_equal(ratio_, {1: 0, 3: 25}) + + +def test_ratio_float_under_sampling(): + y = np.array([1] * 50 + [2] * 100 + [3] * 25) + ratio = 0.5 + ratio_ = check_ratio(ratio, y, 'under-sampling') + assert_equal(ratio_, {1: 50, 2: 50}) + + +def test_ratio_callable(): + y = np.array([1] * 50 + [2] * 100 + [3] * 25) + + def ratio_func(y): + # request an equal number of samples (the majority count) for + # every class + target_stats = Counter(y) + n_samples = max(target_stats.values()) + return {key: int(n_samples) + for key in target_stats.keys()} + + ratio_ = check_ratio(ratio_func, y, 'over-sampling') + assert_equal(ratio_, {1: 50, 2: 0, 3: 75}) diff --git a/imblearn/utils/validation.py b/imblearn/utils/validation.py index 3d3c19aff..002c05abf 100644 --- a/imblearn/utils/validation.py +++ b/imblearn/utils/validation.py @@ -2,12 +2,24 @@ # Authors: Guillaume Lemaitre # License: MIT +import warnings +from collections import Counter +from numbers import Real, Integral + +import numpy as np from sklearn.neighbors.base import KNeighborsMixin from sklearn.neighbors import NearestNeighbors +from sklearn.externals import six, joblib +from sklearn.utils import deprecated +from sklearn.utils.multiclass import type_of_target from ..exceptions import raise_isinstance_error +SAMPLING_KIND = ('over-sampling', 'under-sampling', 'clean-sampling', + 'ensemble') +TARGET_KIND = ('binary', 'multiclass') + def check_neighbors_object(nn_name, nn_object, additional_neighbor=0): """Check the objects is consistent to be a NN. @@ -33,9 +45,285 @@ def check_neighbors_object(nn_name, nn_object, additional_neighbor=0): nn_object : KNeighborsMixin The k-NN object. """ - if isinstance(nn_object, int): + if isinstance(nn_object, Integral): return NearestNeighbors(n_neighbors=nn_object + additional_neighbor) elif isinstance(nn_object, KNeighborsMixin): return nn_object else: raise_isinstance_error(nn_name, [int, KNeighborsMixin], nn_object) + + +def check_target_type(y): + """Check that the target type is supported by the current samplers. + + The current samplers should be compatible with ``'binary'`` and + ``'multiclass'`` targets only. 
+ + Parameters + ---------- + y : ndarray, + The array containing the target. + + Returns + ------- + y : ndarray, + The returned target. + + """ + if type_of_target(y) not in TARGET_KIND: + # FIXME: ideally we should raise an error but the sklearn API does + # not allow for it + warnings.warn("'y' should be of types {} only. Got {} instead.".format( + TARGET_KIND, type_of_target(y))) + return y + + +def hash_X_y(X, y, n_samples=1000): + """Compute hash of the input arrays. + + Parameters + ---------- + X : ndarray, shape (n_samples, n_features) + The ``X`` array. + + y : ndarray, shape (n_samples,) + The ``y`` array. + + n_samples : int, optional (default=1000) + The number of entries randomly drawn (with replacement) from ``X`` + and ``y`` to compute the hash. + + Returns + ------- + X_hash : str + Hash identifier of the ``X`` array. + + y_hash : str + Hash identifier of the ``y`` array. + """ + rng = np.random.RandomState(0) + row_idx = rng.randint(X.shape[0], size=n_samples) + col_idx = rng.randint(X.shape[1], size=n_samples) + + return joblib.hash(X[row_idx, col_idx]), joblib.hash(y[row_idx]) + + +def _ratio_all(y, sampling_type): + """Returns ratio by targeting all classes.""" + target_stats = Counter(y) + if sampling_type == 'over-sampling': + n_sample_majority = max(target_stats.values()) + ratio = {key: n_sample_majority - value + for (key, value) in target_stats.items()} + elif (sampling_type == 'under-sampling' or + sampling_type == 'clean-sampling'): + n_sample_minority = min(target_stats.values()) + ratio = {key: n_sample_minority for key in target_stats.keys()} + + return ratio + + +def _ratio_majority(y, sampling_type): + """Returns ratio by targeting the majority class only.""" + if sampling_type == 'over-sampling': + raise ValueError("'ratio'='majority' cannot be used with" + " over-sampler.") + elif (sampling_type == 'under-sampling' or + sampling_type == 'clean-sampling'): + target_stats = Counter(y) + class_majority = max(target_stats, key=target_stats.get) + n_sample_minority = min(target_stats.values()) + ratio = {key: n_sample_minority + for key in target_stats.keys() + if key == class_majority} + + return ratio + + +def _ratio_not_minority(y, sampling_type): + """Returns ratio by targeting all classes but not the minority.""" + target_stats = Counter(y) + if sampling_type == 'over-sampling': + n_sample_majority = max(target_stats.values()) + class_minority = min(target_stats, key=target_stats.get) + ratio = {key: n_sample_majority - value + for (key, value) in target_stats.items() + if key != class_minority} + elif (sampling_type == 'under-sampling' or + sampling_type == 'clean-sampling'): + n_sample_minority = min(target_stats.values()) + class_minority = min(target_stats, key=target_stats.get) + ratio = {key: n_sample_minority + for key in target_stats.keys() + if key != class_minority} + + return ratio + + +def _ratio_minority(y, sampling_type): + """Returns ratio by targeting the minority class only.""" + target_stats = Counter(y) + if sampling_type == 'over-sampling': + n_sample_majority = max(target_stats.values()) + class_minority = min(target_stats, key=target_stats.get) + ratio = {key: n_sample_majority - value + for (key, value) in target_stats.items() + if key == class_minority} + elif (sampling_type == 'under-sampling' or + sampling_type == 'clean-sampling'): + raise ValueError("'ratio'='minority' cannot be used with" + " under-sampler and clean-sampler.") + + return ratio + + +def _ratio_auto(y, sampling_type): + """Returns ratio auto for over-sampling and not-minority for + under-sampling.""" + if sampling_type == 'over-sampling': + return _ratio_all(y, sampling_type) + elif (sampling_type == 'under-sampling' or 
sampling_type == 'clean-sampling'): + return _ratio_not_minority(y, sampling_type) + + +def _ratio_dict(ratio, y, sampling_type): + """Returns ratio by converting the dictionary depending on the sampling type.""" + target_stats = Counter(y) + # check that all keys in ratio are also in y + set_diff_ratio_target = set(ratio.keys()) - set(target_stats.keys()) + if len(set_diff_ratio_target) > 0: + raise ValueError("The {} target class is/are not present in the" + " data.".format(set_diff_ratio_target)) + ratio_ = {} + if sampling_type == 'over-sampling': + n_samples_majority = max(target_stats.values()) + class_majority = max(target_stats, key=target_stats.get) + for class_sample, n_samples in ratio.items(): + if n_samples < target_stats[class_sample]: + raise ValueError("With over-sampling methods, the number" + " of samples in a class should be greater" + " than or equal to the original number of samples." + " Originally, there are {} samples and {}" + " samples are asked.".format( + target_stats[class_sample], n_samples)) + if n_samples > n_samples_majority: + warnings.warn("After over-sampling, the number of samples ({})" + " in class {} will be larger than the number of" + " samples in the majority class (class #{} ->" + " {})".format(n_samples, class_sample, + class_majority, + n_samples_majority)) + ratio_[class_sample] = n_samples - target_stats[class_sample] + elif sampling_type == 'under-sampling': + for class_sample, n_samples in ratio.items(): + if n_samples > target_stats[class_sample]: + raise ValueError("With under-sampling methods, the number of" + " samples in a class should be less than or equal" + " to the original number of samples." + " Originally, there are {} samples and {}" + " samples are asked.".format( + target_stats[class_sample], n_samples)) + ratio_[class_sample] = n_samples + elif sampling_type == 'clean-sampling': + # clean-sampling can be more permissive since those samplers do not + # enforce an exact number of samples to be selected + for class_sample, n_samples in ratio.items(): + ratio_[class_sample] = n_samples + + return ratio_ + + +@deprecated("Using a float for 'ratio' is deprecated from version 0.2." + " The support will be removed in 0.4. Use a dict, str," + " or a callable instead.") +def _ratio_float(ratio, y, sampling_type): + """TODO: Deprecated in 0.2. Remove in 0.4.""" + target_stats = Counter(y) + if sampling_type == 'over-sampling': + n_sample_majority = max(target_stats.values()) + class_majority = max(target_stats, key=target_stats.get) + ratio = {key: int(n_sample_majority * ratio - value) + for (key, value) in target_stats.items() + if key != class_majority} + elif (sampling_type == 'under-sampling' or + sampling_type == 'clean-sampling'): + n_sample_minority = min(target_stats.values()) + class_minority = min(target_stats, key=target_stats.get) + ratio = {key: int(n_sample_minority / ratio) + for (key, value) in target_stats.items() + if key != class_minority} + + return ratio + + +def check_ratio(ratio, y, sampling_type): + """Ratio validation for samplers. + + Checks ratio for consistent type and returns a dictionary + containing each targeted class with its corresponding number of + samples. + + Parameters + ---------- + ratio : str, dict or callable, + Ratio to use for resampling the data set. 
+ + - If ``str``, has to be one of: (i) ``'minority'``: resample the + minority class; (ii) ``'majority'``: resample the majority class, + (iii) ``'not minority'``: resample all classes apart from the minority + class, (iv) ``'all'``: resample all classes, and (v) ``'auto'``: + corresponds to ``'all'`` for over-sampling methods and ``'not + minority'`` for under-sampling methods. The classes targeted will be + over-sampled or under-sampled to achieve an equal number of samples + as the majority or minority class. + - If ``dict``, the keys correspond to the targeted classes. The values + correspond to the desired number of samples. + - If callable, a function taking ``y`` and returning a ``dict``. The keys + correspond to the targeted classes. The values correspond to the + desired number of samples. + + y : ndarray, shape (n_samples,) + The target array. + + sampling_type : str, + The type of sampling. Can be either ``'over-sampling'``, + ``'under-sampling'``, ``'clean-sampling'``, or ``'ensemble'``. + + Returns + ------- + ratio_converted : dict, + The converted and validated ratio. Returns a dictionary with + the targeted classes as keys and the desired number of samples + as values. + + """ + if sampling_type not in SAMPLING_KIND: + raise ValueError("'sampling_type' should be one of {}. Got '{}'" + " instead.".format(SAMPLING_KIND, sampling_type)) + + if np.unique(y).size <= 1: + raise ValueError("The target 'y' needs to have more than 1 class." + " Got {} class instead.".format(np.unique(y).size)) + + if sampling_type == 'ensemble': + return ratio + + if isinstance(ratio, six.string_types): + if ratio not in RATIO_KIND.keys(): + raise ValueError("When 'ratio' is a string, it needs to be one of" + " {}. Got '{}' instead.".format( + list(RATIO_KIND.keys()), ratio)) + return RATIO_KIND[ratio](y, sampling_type) + elif isinstance(ratio, dict): + return _ratio_dict(ratio, y, sampling_type) + elif isinstance(ratio, Real): + if ratio <= 0 or ratio > 1: + raise ValueError("When 'ratio' is a float, it should be in the" + " range (0, 1]. Got {} instead.".format(ratio)) + return _ratio_float(ratio, y, sampling_type) + elif callable(ratio): + ratio_ = ratio(y) + return _ratio_dict(ratio_, y, sampling_type) + + +RATIO_KIND = {'minority': _ratio_minority, + 'majority': _ratio_majority, + 'not minority': _ratio_not_minority, + 'all': _ratio_all, + 'auto': _ratio_auto}
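The rewritten ``is_tomek`` above reduces to a mutual-nearest-neighbour test between samples of different classes. A minimal standalone sketch of that core rule, with made-up toy points (illustrative only, not part of the patch):

    import numpy as np
    from sklearn.neighbors import NearestNeighbors

    X = np.array([[0.0, 0.0], [0.1, 0.0], [5.0, 5.0], [5.1, 5.0]])
    y = np.array([0, 1, 0, 0])

    # column 0 of kneighbors is the sample itself, so keep column 1 only
    nn = NearestNeighbors(n_neighbors=2).fit(X)
    nn_index = nn.kneighbors(X, return_distance=False)[:, 1]

    links = np.zeros(len(y), dtype=bool)
    for index_sample, target_sample in enumerate(y):
        # a Tomek link: two samples of different classes which are
        # each other's nearest neighbour
        if (y[nn_index[index_sample]] != target_sample and
                nn_index[nn_index[index_sample]] == index_sample):
            links[index_sample] = True

    print(links)  # [ True  True False False]: only the close mixed pair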
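``deprecate_parameter`` is meant to be called from inside a sampler's ``fit``. A hypothetical sketch of that pattern (the ``k``/``n_neighbors`` names are invented for illustration and do not come from the patch):

    from imblearn.utils.deprecation import deprecate_parameter

    class MySampler(object):
        def __init__(self, k=None, n_neighbors=3):
            self.k = k  # deprecated alias of n_neighbors
            self.n_neighbors = n_neighbors

        def fit(self, X, y):
            # warns with a DeprecationWarning and copies self.k into
            # self.n_neighbors whenever the deprecated 'k' was set
            deprecate_parameter(self, '0.2', 'k', new_param='n_neighbors')
            return self

    sampler = MySampler(k=5).fit(None, None)
    print(sampler.n_neighbors)  # 5, carried over from the deprecated 'k'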
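``hash_X_y`` gives samplers a cheap way to verify at ``sample`` time that they receive the arrays seen at ``fit`` time: it hashes a fixed pseudo-random subset of entries rather than the full arrays. A small usage sketch, assuming the function is exported as shown in ``imblearn/utils/__init__.py``:

    import numpy as np
    from imblearn.utils import hash_X_y

    rng = np.random.RandomState(42)
    X = rng.random_sample((1000, 20))
    y = rng.randint(0, 3, size=1000)

    # deterministic: the same data always yields the same pair of hashes
    assert hash_X_y(X, y) == hash_X_y(X, y)

    # different data almost surely yields different hashes; note that a
    # single modified entry can go undetected, since only a random subset
    # of (row, column) pairs is inspected
    y_other = np.zeros_like(y)
    print(hash_X_y(X, y_other) == hash_X_y(X, y))  # False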
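Finally, tying the ``check_ratio`` pieces together, a minimal sketch of the conversions it performs (the expected values are taken directly from the tests above):

    import numpy as np
    from imblearn.utils import check_ratio

    y = np.array([1] * 50 + [2] * 100 + [3] * 25)

    # over-sampling: the returned dict counts the samples to *add* so
    # that each targeted class reaches the majority count (class 2, 100)
    print(check_ratio('auto', y, 'over-sampling'))   # {1: 50, 2: 0, 3: 75}

    # under-sampling: 'auto' falls back to 'not minority' and counts the
    # samples to *keep*, i.e. the minority count (class 3, 25)
    print(check_ratio('auto', y, 'under-sampling'))  # {1: 25, 2: 25}

    # a dict of absolute target counts is validated and, for
    # over-sampling, converted into per-class numbers of samples to add
    print(check_ratio({1: 70, 2: 100, 3: 70}, y, 'over-sampling'))
    # {1: 20, 2: 0, 3: 45}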