Skip to content

Commit 3c54ea6

Browse files
glemaitre authored and chkoar committed
ENH: Add a collection of imbalanced datasets (#249)
1 parent e7e0baa commit 3c54ea6

File tree

5 files changed

+378
-2
lines changed

5 files changed

+378
-2
lines changed

doc/api.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,7 @@ Functions
158158
:toctree: generated/
159159

160160
datasets.make_imbalance
161-
161+
datasets.fetch_datasets
162162

163163
Utilities
164164
=========

doc/whats_new.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ New features
2020

2121
- Turn off steps in :class:`pipeline.Pipeline` using the `None`
2222
object. By `Christos Aridas`_.
23+
- Add a fetching function `datasets.fetch_datasets` in order to get some
24+
imbalanced datasets useful for benchmarking. By `Guillaume Lemaitre`_.
2325

2426
Enhancement
2527
~~~~~~~~~~~

imblearn/datasets/__init__.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,7 @@
55

66
from .imbalance import make_imbalance
77

8-
__all__ = ['make_imbalance']
8+
from .zenodo import fetch_datasets
9+
10+
__all__ = ['make_imbalance',
11+
'fetch_datasets']
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
"""Test the datasets loader.
2+
3+
Skipped if the datasets cannot be loaded (an IOError is raised while fetching).
4+
"""
5+
from imblearn.datasets import fetch_datasets
6+
from sklearn.utils.testing import (assert_equal, assert_allclose,
7+
assert_raises_regex, SkipTest)
8+
9+
# Expected ``(n_samples, n_features)`` shape of the ``data`` array of each
# dataset, keyed by dataset name.
DATASET_SHAPE = {
    'ecoli': (336, 7),
    'optical_digits': (5620, 64),
    'satimage': (6435, 36),
    'pen_digits': (10992, 16),
    'abalone': (4177, 10),
    'sick_euthyroid': (3163, 42),
    'spectrometer': (531, 93),
    'car_eval_34': (1728, 21),
    'isolet': (7797, 617),
    'us_crime': (1994, 100),
    'yeast_ml8': (2417, 103),
    'scene': (2407, 294),
    'libras_move': (360, 90),
    'thyroid_sick': (3772, 52),
    'coil_2000': (9822, 85),
    'arrhythmia': (452, 278),
    'solar_flare_m0': (1389, 32),
    'oil': (937, 49),
    'car_eval_4': (1728, 21),
    'wine_quality': (4898, 11),
    'letter_img': (20000, 16),
    'yeast_me2': (1484, 8),
    'webpage': (34780, 300),
    'ozone_level': (2536, 72),
    'mammography': (11183, 6),
    'protein_homo': (145751, 74),
    'abalone_19': (4177, 10),
}
36+
37+
38+
def fetch(*args, **kwargs):
    """Call ``fetch_datasets`` with ``download_if_missing`` forced to True.

    Positional and keyword arguments are forwarded unchanged.
    """
    loader_options = dict(download_if_missing=True, **kwargs)
    return fetch_datasets(*args, **loader_options)
40+
41+
42+
def test_fetch():
    """Fetch every dataset twice with different seeds and compare them.

    The test is skipped when the first fetch raises an ``IOError``.
    """
    try:
        first_run = fetch(shuffle=True, random_state=42)
    except IOError:
        raise SkipTest("Zenodo dataset can not be loaded.")

    second_run = fetch(shuffle=True, random_state=37)

    for name, expected_shape in DATASET_SHAPE.items():
        data_a = first_run[name].data
        data_b = second_run[name].data
        assert_equal(expected_shape, data_a.shape)
        assert_equal(data_a.shape, data_b.shape)

        # Shuffling only reorders the rows, so the grand total must match.
        assert_allclose(data_a.sum(), data_b.sum())

        n_samples = data_a.shape[0]
        target_a = first_run[name].target
        target_b = second_run[name].target
        assert_equal((n_samples,), target_a.shape)
        assert_equal((n_samples,), target_b.shape)
61+
62+
63+
def test_fetch_filter():
    """Select the same dataset once by ID and once by name, then compare.

    The test is skipped when the first fetch raises an ``IOError``.
    """
    try:
        by_id = fetch(filter_data=(1,), shuffle=True, random_state=42)
    except IOError:
        raise SkipTest("Zenodo dataset can not be loaded.")

    by_name = fetch(filter_data=('ecoli',), shuffle=True, random_state=37)

    data_a = by_id['ecoli'].data
    data_b = by_name['ecoli'].data
    assert_equal(DATASET_SHAPE['ecoli'], data_a.shape)
    assert_equal(data_a.shape, data_b.shape)

    # Shuffling only reorders the rows, so the grand total must match.
    assert_allclose(data_a.sum(), data_b.sum())

    n_samples = data_a.shape[0]
    target_a = by_id['ecoli'].target
    target_b = by_name['ecoli'].target
    assert_equal((n_samples,), target_a.shape)
    assert_equal((n_samples,), target_b.shape)
82+
83+
84+
def test_fetch_error():
    """Check that each kind of invalid ``filter_data`` raises ValueError."""
    # (expected message pattern, invalid ``filter_data`` value)
    invalid_cases = (
        ('is not a dataset available.', ('rnd',)),
        ('dataset with the ID=', (-1,)),
        ('dataset with the ID=', (100,)),
        ('value in the tuple', (1.00,)),
    )
    for pattern, bad_filter in invalid_cases:
        assert_raises_regex(ValueError, pattern,
                            fetch_datasets, filter_data=bad_filter)

imblearn/datasets/zenodo.py

Lines changed: 279 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,279 @@
1+
"""Collection of imbalanced datasets.
2+
3+
This collection of datasets has been proposed in [1]_. The
4+
characteristics of the available datasets are presented in the table
5+
below.
6+
7+
ID Name Repository & Target Ratio #S #F
8+
1 ecoli UCI, target: imU 8.6:1 336 7
9+
2 optical_digits UCI, target: 8 9.1:1 5,620 64
10+
3 satimage UCI, target: 4 9.3:1 6,435 36
11+
4 pen_digits UCI, target: 5 9.4:1 10,992 16
12+
5 abalone UCI, target: 7 9.7:1 4,177 10
13+
6 sick_euthyroid UCI, target: sick euthyroid 9.8:1 3,163 42
14+
7 spectrometer UCI, target: >=44 11:1 531 93
15+
8 car_eval_34 UCI, target: good, v good 12:1 1,728 21
16+
9 isolet UCI, target: A, B 12:1 7,797 617
17+
10 us_crime UCI, target: >0.65 12:1 1,994 100
18+
11 yeast_ml8 LIBSVM, target: 8 13:1 2,417 103
19+
12 scene LIBSVM, target: >one label 13:1 2,407 294
20+
13 libras_move UCI, target: 1 14:1 360 90
21+
14 thyroid_sick UCI, target: sick 15:1 3,772 52
22+
15 coil_2000 KDD, CoIL, target: minority 16:1 9,822 85
23+
16 arrhythmia UCI, target: 06 17:1 452 278
24+
17 solar_flare_m0 UCI, target: M->0 19:1 1,389 32
25+
18 oil UCI, target: minority 22:1 937 49
26+
19 car_eval_4 UCI, target: vgood 26:1 1,728 21
27+
20 wine_quality UCI, wine, target: <=4 26:1 4,898 11
28+
21 letter_img UCI, target: Z 26:1 20,000 16
29+
22 yeast_me2 UCI, target: ME2 28:1 1,484 8
30+
23 webpage LIBSVM, w7a, target: minority 33:1 34,780 300
31+
24 ozone_level UCI, ozone, data 34:1 2,536 72
32+
25 mammography UCI, target: minority 42:1 11,183 6
33+
26 protein_homo KDD CUP 2004, minority 111:1 145,751 74
34+
27 abalone_19 UCI, target: 19 130:1 4,177 10
35+
36+
References
37+
----------
38+
.. [1] Ding, Zejin, "Diversified Ensemble Classifiers for Highly
39+
Imbalanced Data Learning and their Application in Bioinformatics."
40+
Dissertation, Georgia State University, (2011).
41+
42+
"""
43+
44+
# Author: Guillaume Lemaitre
45+
# License: BSD 3 clause
46+
47+
from collections import OrderedDict
48+
import tarfile
49+
from io import BytesIO
50+
import logging
51+
from os.path import join, isfile
52+
try:
53+
from urllib2 import urlopen
54+
except ImportError:
55+
from urllib.request import urlopen
56+
57+
import numpy as np
58+
59+
from sklearn.datasets import get_data_home
60+
from sklearn.datasets.base import Bunch
61+
from sklearn.utils.fixes import makedirs
62+
from sklearn.externals import six
63+
from sklearn.utils import check_random_state
64+
65+
# Location of the archive downloaded by ``fetch_datasets``.
URL = ('https://zenodo.org/record/61452/files/'
       'benchmark-imbalanced-learn.tar.gz')
# A dataset file name is built as PRE_FILENAME + <dataset ID> + POST_FILENAME.
PRE_FILENAME = 'x'
POST_FILENAME = 'data.npz'

# Dataset names in ID order: the name at position ``i`` has the ID ``i + 1``.
MAP_NAME_ID_KEYS = ['ecoli',
                    'optical_digits',
                    'satimage',
                    'pen_digits',
                    'abalone',
                    'sick_euthyroid',
                    'spectrometer',
                    'car_eval_34',
                    'isolet',
                    'us_crime',
                    'yeast_ml8',
                    'scene',
                    'libras_move',
                    'thyroid_sick',
                    'coil_2000',
                    'arrhythmia',
                    'solar_flare_m0',
                    'oil',
                    'car_eval_4',
                    'wine_quality',
                    'letter_img',
                    'yeast_me2',
                    'webpage',
                    'ozone_level',
                    'mammography',
                    'protein_homo',
                    'abalone_19']

# Two-way lookup between dataset names and their 1-based IDs, both kept in
# ID order.
MAP_NAME_ID = OrderedDict(
    (name, dataset_id)
    for dataset_id, name in enumerate(MAP_NAME_ID_KEYS, start=1))
MAP_ID_NAME = OrderedDict(
    (dataset_id, name) for name, dataset_id in MAP_NAME_ID.items())

# Use a module-scoped logger rather than the root logger so that applications
# can filter or silence the messages of this module independently.
logger = logging.getLogger(__name__)
105+
106+
107+
def fetch_datasets(data_home=None,
                   filter_data=None,
                   download_if_missing=True,
                   random_state=None,
                   shuffle=False):
    """Load the benchmark datasets from Zenodo, downloading them if necessary.

    Parameters
    ----------
    data_home : string, optional (default=None)
        Specify another download and cache folder for the datasets. By default
        all scikit-learn data is stored in '~/scikit_learn_data' subfolders.

    filter_data : tuple of str/int or None, optional (default=None)
        A tuple containing the ID or the name of the datasets to be returned.
        Refer to the table below to get the ID and name of the datasets.
        If None, all the datasets are returned.

    download_if_missing : boolean, optional (default=True)
        If False, raise an IOError if the data is not locally available
        instead of trying to download the data from the source site.

    random_state : int, RandomState instance or None, optional (default=None)
        Random state for shuffling the dataset.
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    shuffle : bool, optional (default=False)
        Whether to shuffle dataset.

    Returns
    -------
    datasets : OrderedDict of Bunch object,
        The ordering is defined by ``filter_data``. Each Bunch object ---
        referred to as dataset --- has the following attributes:

    dataset.data : ndarray, shape (n_samples, n_features)

    dataset.target : ndarray, shape (n_samples, )

    dataset.DESCR : string
        Name of the dataset.

    Raises
    ------
    ValueError
        If an item of ``filter_data`` is an unknown dataset name, an unknown
        dataset ID, or is neither a str nor an int.

    IOError
        If a dataset file is not available locally and
        ``download_if_missing`` is False.

    Notes
    -----
    This collection of datasets has been proposed in [1]_. The
    characteristics of the available datasets are presented in the table
    below.

    +--+--------------+-------------------------------+-------+---------+-----+
    |ID|Name          | Repository & Target           | Ratio | #S      | #F  |
    +==+==============+===============================+=======+=========+=====+
    |1 |ecoli         | UCI, target: imU              | 8.6:1 | 336     | 7   |
    +--+--------------+-------------------------------+-------+---------+-----+
    |2 |optical_digits| UCI, target: 8                | 9.1:1 | 5,620   | 64  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |3 |satimage      | UCI, target: 4                | 9.3:1 | 6,435   | 36  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |4 |pen_digits    | UCI, target: 5                | 9.4:1 | 10,992  | 16  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |5 |abalone       | UCI, target: 7                | 9.7:1 | 4,177   | 10  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |6 |sick_euthyroid| UCI, target: sick euthyroid   | 9.8:1 | 3,163   | 42  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |7 |spectrometer  | UCI, target: >=44             | 11:1  | 531     | 93  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |8 |car_eval_34   | UCI, target: good, v good     | 12:1  | 1,728   | 21  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |9 |isolet        | UCI, target: A, B             | 12:1  | 7,797   | 617 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |10|us_crime      | UCI, target: >0.65            | 12:1  | 1,994   | 100 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |11|yeast_ml8     | LIBSVM, target: 8             | 13:1  | 2,417   | 103 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |12|scene         | LIBSVM, target: >one label    | 13:1  | 2,407   | 294 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |13|libras_move   | UCI, target: 1                | 14:1  | 360     | 90  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |14|thyroid_sick  | UCI, target: sick             | 15:1  | 3,772   | 52  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |15|coil_2000     | KDD, CoIL, target: minority   | 16:1  | 9,822   | 85  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |16|arrhythmia    | UCI, target: 06               | 17:1  | 452     | 278 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |17|solar_flare_m0| UCI, target: M->0             | 19:1  | 1,389   | 32  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |18|oil           | UCI, target: minority         | 22:1  | 937     | 49  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |19|car_eval_4    | UCI, target: vgood            | 26:1  | 1,728   | 21  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |20|wine_quality  | UCI, wine, target: <=4        | 26:1  | 4,898   | 11  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |21|letter_img    | UCI, target: Z                | 26:1  | 20,000  | 16  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |22|yeast_me2     | UCI, target: ME2              | 28:1  | 1,484   | 8   |
    +--+--------------+-------------------------------+-------+---------+-----+
    |23|webpage       | LIBSVM, w7a, target: minority | 33:1  | 34,780  | 300 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |24|ozone_level   | UCI, ozone, data              | 34:1  | 2,536   | 72  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |25|mammography   | UCI, target: minority         | 42:1  | 11,183  | 6   |
    +--+--------------+-------------------------------+-------+---------+-----+
    |26|protein_homo  | KDD CUP 2004, minority        | 111:1 | 145,751 | 74  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |27|abalone_19    | UCI, target: 19               | 130:1 | 4,177   | 10  |
    +--+--------------+-------------------------------+-------+---------+-----+

    References
    ----------
    .. [1] Ding, Zejin, "Diversified Ensemble Classifiers for Highly
       Imbalanced Data Learning and their Application in Bioinformatics."
       Dissertation, Georgia State University, (2011).
    """

    data_home = get_data_home(data_home=data_home)
    zenodo_dir = join(data_home, "zenodo")
    datasets = OrderedDict()

    # Normalise ``filter_data`` into a list of dataset names.
    if filter_data is None:
        filter_data_ = list(MAP_NAME_ID)
    else:
        filter_data_ = []
        for it in filter_data:
            if isinstance(it, six.string_types):
                if it not in MAP_NAME_ID:
                    raise ValueError('{} is not a dataset available. '
                                     'The available datasets are {}'.format(
                                         it, list(MAP_NAME_ID)))
                filter_data_.append(it)
            elif isinstance(it, int):
                # The valid IDs are exactly the keys of the ID -> name map,
                # so no bound has to be hard-coded here.
                if it not in MAP_ID_NAME:
                    raise ValueError('The dataset with the ID={} is not an '
                                     'available dataset. The IDs are '
                                     '{}'.format(
                                         it,
                                         range(1, len(MAP_ID_NAME) + 1)))
                filter_data_.append(MAP_ID_NAME[it])
            else:
                raise ValueError('The value in the tuple should be str or int.'
                                 ' Got {} instead.'.format(type(it)))

    # go through the list and check if the data are available
    for it in filter_data_:
        filename = PRE_FILENAME + str(MAP_NAME_ID[it]) + POST_FILENAME
        filename = join(zenodo_dir, filename)

        if not isfile(filename):
            if not download_if_missing:
                raise IOError("Data not found and `download_if_missing` "
                              "is False")
            makedirs(zenodo_dir, exist_ok=True)
            logger.warning("Downloading %s", URL)
            # Close the HTTP response and the archive explicitly instead of
            # relying on garbage collection to release them.
            response = urlopen(URL)
            try:
                archive = BytesIO(response.read())
            finally:
                response.close()
            # NOTE(review): ``extractall`` trusts the member paths stored in
            # the archive. URL is a fixed constant here; validate the members
            # first if the source ever becomes user-configurable.
            with tarfile.open(fileobj=archive) as tar:
                tar.extractall(path=zenodo_dir)

        # Read both arrays while the ``.npz`` archive is open so that its
        # file handle is released right away.
        with np.load(filename) as data:
            X, y = data['data'], data['label']

        if shuffle:
            ind = np.arange(X.shape[0])
            rng = check_random_state(random_state)
            rng.shuffle(ind)
            X = X[ind]
            y = y[ind]

        datasets[it] = Bunch(data=X, target=y, DESCR=it)

    return datasets

0 commit comments

Comments
 (0)