EHN: Add a collection of imbalanced datasets (#249)

glemaitre · chkoar · commit 3c54ea66b784 · 2017-04-06T13:24:26.000+03:00
diff --git a/doc/api.rst b/doc/api.rst
@@ -158,7 +158,7 @@ Functions
    :toctree: generated/
 
    datasets.make_imbalance
-
+   datasets.fetch_datasets
 
 Utilities
 =========
diff --git a/doc/whats_new.rst b/doc/whats_new.rst
@@ -20,6 +20,8 @@ New features
 
 - Turn off steps in :class:`pipeline.Pipeline` using the `None`
   object. By `Christos Aridas`_.
+- Add a fetching function `datasets.fetch_datasets` in order to get some
+  imbalanced datasets useful for benchmarking. By `Guillaume Lemaitre`_.
 
 Enhancement
 ~~~~~~~~~~~
diff --git a/imblearn/datasets/__init__.py b/imblearn/datasets/__init__.py
@@ -5,4 +5,7 @@
 
 from .imbalance import make_imbalance
 
-__all__ = ['make_imbalance']
+from .zenodo import fetch_datasets
+
+__all__ = ['make_imbalance',
+           'fetch_datasets']
diff --git a/imblearn/datasets/tests/test_zenodo.py b/imblearn/datasets/tests/test_zenodo.py
@@ -0,0 +1,92 @@
+"""Test the datasets loader.
+
+The fetch tests are skipped when loading the datasets raises an IOError.
+"""
+from imblearn.datasets import fetch_datasets
+from sklearn.utils.testing import (assert_equal, assert_allclose,
+                                   assert_raises_regex, SkipTest)
+
+DATASET_SHAPE = {'ecoli': (336, 7),
+                 'optical_digits': (5620, 64),
+                 'satimage': (6435, 36),
+                 'pen_digits': (10992, 16),
+                 'abalone': (4177, 10),
+                 'sick_euthyroid': (3163, 42),
+                 'spectrometer': (531, 93),
+                 'car_eval_34': (1728, 21),
+                 'isolet': (7797, 617),
+                 'us_crime': (1994, 100),
+                 'yeast_ml8': (2417, 103),
+                 'scene': (2407, 294),
+                 'libras_move': (360, 90),
+                 'thyroid_sick': (3772, 52),
+                 'coil_2000': (9822, 85),
+                 'arrhythmia': (452, 278),
+                 'solar_flare_m0': (1389, 32),
+                 'oil': (937, 49),
+                 'car_eval_4': (1728, 21),
+                 'wine_quality': (4898, 11),
+                 'letter_img': (20000, 16),
+                 'yeast_me2': (1484, 8),
+                 'webpage': (34780, 300),
+                 'ozone_level': (2536, 72),
+                 'mammography': (11183, 6),
+                 'protein_homo': (145751, 74),
+                 'abalone_19': (4177, 10)}
+
+
+def fetch(*args, **kwargs):
+    return fetch_datasets(*args, download_if_missing=True, **kwargs)
+
+
+def test_fetch():
+    try:
+        datasets1 = fetch(shuffle=True, random_state=42)
+    except IOError:
+        raise SkipTest("Zenodo dataset can not be loaded.")
+
+    datasets2 = fetch(shuffle=True, random_state=37)
+
+    for k in DATASET_SHAPE.keys():
+
+        X1, X2 = datasets1[k].data, datasets2[k].data
+        assert_equal(DATASET_SHAPE[k], X1.shape)
+        assert_equal(X1.shape, X2.shape)
+
+        assert_allclose(X1.sum(), X2.sum())
+
+        y1, y2 = datasets1[k].target, datasets2[k].target
+        assert_equal((X1.shape[0],), y1.shape)
+        assert_equal((X1.shape[0],), y2.shape)
+
+
+def test_fetch_filter():
+    try:
+        datasets1 = fetch(filter_data=tuple([1]), shuffle=True,
+                          random_state=42)
+    except IOError:
+        raise SkipTest("Zenodo dataset can not be loaded.")
+
+    datasets2 = fetch(filter_data=tuple(['ecoli']), shuffle=True,
+                      random_state=37)
+
+    X1, X2 = datasets1['ecoli'].data, datasets2['ecoli'].data
+    assert_equal(DATASET_SHAPE['ecoli'], X1.shape)
+    assert_equal(X1.shape, X2.shape)
+
+    assert_allclose(X1.sum(), X2.sum())
+
+    y1, y2 = datasets1['ecoli'].target, datasets2['ecoli'].target
+    assert_equal((X1.shape[0],), y1.shape)
+    assert_equal((X1.shape[0],), y2.shape)
+
+
+def test_fetch_error():
+    assert_raises_regex(ValueError, 'is not a dataset available.',
+                        fetch_datasets, filter_data=tuple(['rnd']))
+    assert_raises_regex(ValueError, 'dataset with the ID=',
+                        fetch_datasets, filter_data=tuple([-1]))
+    assert_raises_regex(ValueError, 'dataset with the ID=',
+                        fetch_datasets, filter_data=tuple([100]))
+    assert_raises_regex(ValueError, 'value in the tuple',
+                        fetch_datasets, filter_data=tuple([1.00]))
diff --git a/imblearn/datasets/zenodo.py b/imblearn/datasets/zenodo.py
@@ -0,0 +1,279 @@
+"""Collection of imbalanced datasets.
+
+This collection of datasets has been proposed in [1]_. The
+characteristics of the available datasets are presented in the table
+below.
+
+ ID    Name           Repository & Target           Ratio  #S       #F
+ 1     ecoli          UCI, target: imU              8.6:1  336      7
+ 2     optical_digits UCI, target: 8                9.1:1  5,620    64
+ 3     satimage       UCI, target: 4                9.3:1  6,435    36
+ 4     pen_digits     UCI, target: 5                9.4:1  10,992   16
+ 5     abalone        UCI, target: 7                9.7:1  4,177    10
+ 6     sick_euthyroid UCI, target: sick euthyroid   9.8:1  3,163    42
+ 7     spectrometer   UCI, target: >=44             11:1   531      93
+ 8     car_eval_34    UCI, target: good, v good     12:1   1,728    21
+ 9     isolet         UCI, target: A, B             12:1   7,797    617
+ 10    us_crime       UCI, target: >0.65            12:1   1,994    100
+ 11    yeast_ml8      LIBSVM, target: 8             13:1   2,417    103
+ 12    scene          LIBSVM, target: >one label    13:1   2,407    294
+ 13    libras_move    UCI, target: 1                14:1   360      90
+ 14    thyroid_sick   UCI, target: sick             15:1   3,772    52
+ 15    coil_2000      KDD, CoIL, target: minority   16:1   9,822    85
+ 16    arrhythmia     UCI, target: 06               17:1   452      278
+ 17    solar_flare_m0 UCI, target: M->0             19:1   1,389    32
+ 18    oil            UCI, target: minority         22:1   937      49
+ 19    car_eval_4     UCI, target: vgood            26:1   1,728    21
+ 20    wine_quality   UCI, wine, target: <=4        26:1   4,898    11
+ 21    letter_img     UCI, target: Z                26:1   20,000   16
+ 22    yeast_me2      UCI, target: ME2              28:1   1,484    8
+ 23    webpage        LIBSVM, w7a, target: minority 33:1   34,780   300
+ 24    ozone_level    UCI, ozone, data              34:1   2,536    72
+ 25    mammography    UCI, target: minority         42:1   11,183   6
+ 26    protein_homo   KDD CUP 2004, minority        111:1  145,751  74
+ 27    abalone_19     UCI, target: 19               130:1  4,177    10
+
+References
+----------
+.. [1] Ding, Zejin, "Diversified Ensemble Classifiers for Highly
+   Imbalanced Data Learning and their Application in Bioinformatics."
+   Dissertation, Georgia State University, (2011).
+
+"""
+
+# Author: Guillaume Lemaitre
+# License: BSD 3 clause
+
+from collections import OrderedDict
+import tarfile
+from io import BytesIO
+import logging
+from os.path import join, isfile
+try:
+    from urllib2 import urlopen
+except ImportError:
+    from urllib.request import urlopen
+
+import numpy as np
+
+from sklearn.datasets import get_data_home
+from sklearn.datasets.base import Bunch
+from sklearn.utils.fixes import makedirs
+from sklearn.externals import six
+from sklearn.utils import check_random_state
+
+URL = ('https://zenodo.org/record/61452/files/'
+       'benchmark-imbalanced-learn.tar.gz')
+PRE_FILENAME = 'x'
+POST_FILENAME = 'data.npz'
+
+MAP_NAME_ID_KEYS = ['ecoli',
+                    'optical_digits',
+                    'satimage',
+                    'pen_digits',
+                    'abalone',
+                    'sick_euthyroid',
+                    'spectrometer',
+                    'car_eval_34',
+                    'isolet',
+                    'us_crime',
+                    'yeast_ml8',
+                    'scene',
+                    'libras_move',
+                    'thyroid_sick',
+                    'coil_2000',
+                    'arrhythmia',
+                    'solar_flare_m0',
+                    'oil',
+                    'car_eval_4',
+                    'wine_quality',
+                    'letter_img',
+                    'yeast_me2',
+                    'webpage',
+                    'ozone_level',
+                    'mammography',
+                    'protein_homo',
+                    'abalone_19']
+
+MAP_NAME_ID = OrderedDict()
+MAP_ID_NAME = OrderedDict()
+for v, k in enumerate(MAP_NAME_ID_KEYS):
+    MAP_NAME_ID[k] = v + 1
+    MAP_ID_NAME[v + 1] = k
+
+logger = logging.getLogger()
+
+
+def fetch_datasets(data_home=None,
+                 filter_data=None,
+                 download_if_missing=True,
+                 random_state=None,
+                 shuffle=False):
+    """Load the benchmark datasets from Zenodo, downloading it if necessary.
+
+    Parameters
+    ----------
+    data_home : string, optional (default=None)
+        Specify another download and cache folder for the datasets. By default
+        all scikit-learn data is stored in '~/scikit_learn_data' subfolders.
+
+    filter_data : tuple of str/int or None, optional (default=None)
+        A tuple containing the ID or the name of the datasets to be returned.
+        Refer to the above table to get the ID and name of the datasets.
+
+    download_if_missing : boolean, optional (default=True)
+        If False, raise an IOError if the data is not locally available
+        instead of trying to download the data from the source site.
+
+    random_state : int, RandomState instance or None, optional (default=None)
+        Random state for shuffling the dataset.
+        If int, random_state is the seed used by the random number generator;
+        If RandomState instance, random_state is the random number generator;
+        If None, the random number generator is the RandomState instance used
+        by `np.random`.
+
+    shuffle : bool, optional (default=False)
+        Whether to shuffle dataset.
+
+    Returns
+    -------
+    datasets : OrderedDict of Bunch objects
+        The order is defined by ``filter_data``. Each Bunch object ---
+        referred to as dataset --- has the following attributes:
+
+    dataset.data : ndarray, shape (n_samples, n_features)
+
+    dataset.target : ndarray, shape (n_samples, )
+
+    dataset.DESCR : string
+        Description of each dataset.
+
+    Notes
+    -----
+    This collection of datasets has been proposed in [1]_. The
+    characteristics of the available datasets are presented in the table
+    below.
+
+    +--+--------------+-------------------------------+-------+---------+-----+
+    |ID|Name          | Repository & Target           | Ratio | #S      | #F  |
+    +==+==============+===============================+=======+=========+=====+
+    |1 |ecoli         | UCI, target: imU              | 8.6:1 | 336     | 7   |
+    +--+--------------+-------------------------------+-------+---------+-----+
+    |2 |optical_digits| UCI, target: 8                | 9.1:1 | 5,620   | 64  |
+    +--+--------------+-------------------------------+-------+---------+-----+
+    |3 |satimage      | UCI, target: 4                | 9.3:1 | 6,435   | 36  |
+    +--+--------------+-------------------------------+-------+---------+-----+
+    |4 |pen_digits    | UCI, target: 5                | 9.4:1 | 10,992  | 16  |
+    +--+--------------+-------------------------------+-------+---------+-----+
+    |5 |abalone       | UCI, target: 7                | 9.7:1 | 4,177   | 10  |
+    +--+--------------+-------------------------------+-------+---------+-----+
+    |6 |sick_euthyroid| UCI, target: sick euthyroid   | 9.8:1 | 3,163   | 42  |
+    +--+--------------+-------------------------------+-------+---------+-----+
+    |7 |spectrometer  | UCI, target: >=44             | 11:1  | 531     | 93  |
+    +--+--------------+-------------------------------+-------+---------+-----+
+    |8 |car_eval_34   | UCI, target: good, v good     | 12:1  | 1,728   | 21  |
+    +--+--------------+-------------------------------+-------+---------+-----+
+    |9 |isolet        | UCI, target: A, B             | 12:1  | 7,797   | 617 |
+    +--+--------------+-------------------------------+-------+---------+-----+
+    |10|us_crime      | UCI, target: >0.65            | 12:1  | 1,994   | 100 |
+    +--+--------------+-------------------------------+-------+---------+-----+
+    |11|yeast_ml8     | LIBSVM, target: 8             | 13:1  | 2,417   | 103 |
+    +--+--------------+-------------------------------+-------+---------+-----+
+    |12|scene         | LIBSVM, target: >one label    | 13:1  | 2,407   | 294 |
+    +--+--------------+-------------------------------+-------+---------+-----+
+    |13|libras_move   | UCI, target: 1                | 14:1  | 360     | 90  |
+    +--+--------------+-------------------------------+-------+---------+-----+
+    |14|thyroid_sick  | UCI, target: sick             | 15:1  | 3,772   | 52  |
+    +--+--------------+-------------------------------+-------+---------+-----+
+    |15|coil_2000     | KDD, CoIL, target: minority   | 16:1  | 9,822   | 85  |
+    +--+--------------+-------------------------------+-------+---------+-----+
+    |16|arrhythmia    | UCI, target: 06               | 17:1  | 452     | 278 |
+    +--+--------------+-------------------------------+-------+---------+-----+
+    |17|solar_flare_m0| UCI, target: M->0             | 19:1  | 1,389   | 32  |
+    +--+--------------+-------------------------------+-------+---------+-----+
+    |18|oil           | UCI, target: minority         | 22:1  | 937     | 49  |
+    +--+--------------+-------------------------------+-------+---------+-----+
+    |19|car_eval_4    | UCI, target: vgood            | 26:1  | 1,728   | 21  |
+    +--+--------------+-------------------------------+-------+---------+-----+
+    |20|wine_quality  | UCI, wine, target: <=4        | 26:1  | 4,898   | 11  |
+    +--+--------------+-------------------------------+-------+---------+-----+
+    |21|letter_img    | UCI, target: Z                | 26:1  | 20,000  | 16  |
+    +--+--------------+-------------------------------+-------+---------+-----+
+    |22|yeast_me2     | UCI, target: ME2              | 28:1  | 1,484   | 8   |
+    +--+--------------+-------------------------------+-------+---------+-----+
+    |23|webpage       | LIBSVM, w7a, target: minority | 33:1  | 34,780  | 300 |
+    +--+--------------+-------------------------------+-------+---------+-----+
+    |24|ozone_level   | UCI, ozone, data              | 34:1  | 2,536   | 72  |
+    +--+--------------+-------------------------------+-------+---------+-----+
+    |25|mammography   | UCI, target: minority         | 42:1  | 11,183  | 6   |
+    +--+--------------+-------------------------------+-------+---------+-----+
+    |26|protein_homo  | KDD CUP 2004, minority        | 111:1 | 145,751 | 74  |
+    +--+--------------+-------------------------------+-------+---------+-----+
+    |27|abalone_19    | UCI, target: 19               | 130:1 | 4,177   | 10  |
+    +--+--------------+-------------------------------+-------+---------+-----+
+
+    References
+    ----------
+    .. [1] Ding, Zejin, "Diversified Ensemble Classifiers for Highly
+       Imbalanced Data Learning and their Application in Bioinformatics."
+       Dissertation, Georgia State University, (2011).
+    """
+
+    data_home = get_data_home(data_home=data_home)
+    zenodo_dir = join(data_home, "zenodo")
+    datasets = OrderedDict()
+
+    if filter_data is None:
+        filter_data_ = MAP_NAME_ID.keys()
+    else:
+        list_data = MAP_NAME_ID.keys()
+        filter_data_ = []
+        for it in filter_data:
+            if isinstance(it, six.string_types):
+                if it not in list_data:
+                    raise ValueError('{} is not a dataset available. '
+                                     'The available datasets are {}'.format(
+                                         it, list_data))
+                else:
+                    filter_data_.append(it)
+            elif isinstance(it, int):
+                if it < 1 or it > 27:
+                    raise ValueError('The dataset with the ID={} is not an '
+                                     'available dataset. The IDs are '
+                                     '{}'.format(it, range(1, 28)))
+                else:
+                    # IDs are 1-based and MAP_ID_NAME is keyed by those
+                    # same IDs, so the ID is used directly as the key.
+                    filter_data_.append(MAP_ID_NAME[it])
+            else:
+                raise ValueError('The value in the tuple should be str or int.'
+                                 ' Got {} instead.'.format(type(it)))
+
+    # go through the list and check if the data are available
+    for it in filter_data_:
+        filename = PRE_FILENAME + str(MAP_NAME_ID[it]) + POST_FILENAME
+        filename = join(zenodo_dir, filename)
+        available = isfile(filename)
+
+        if download_if_missing and not available:
+            makedirs(zenodo_dir, exist_ok=True)
+            logger.warning("Downloading %s" % URL)
+            f = BytesIO(urlopen(URL).read())
+            tar = tarfile.open(fileobj=f)
+            tar.extractall(path=zenodo_dir)
+        elif not download_if_missing and not available:
+            raise IOError("Data not found and `download_if_missing` is False")
+
+        data = np.load(filename)
+        X, y = data['data'], data['label']
+
+        if shuffle:
+            ind = np.arange(X.shape[0])
+            rng = check_random_state(random_state)
+            rng.shuffle(ind)
+            X = X[ind]
+            y = y[ind]
+
+        datasets[it] = Bunch(data=X, target=y, DESCR=it)
+
+    return datasets