|
| 1 | +"""Collection of imbalanced datasets. |
| 2 | +
|
| 3 | +This collection of datasets has been proposed in [1]_. The |
| 4 | +characteristics of the available datasets are presented in the table |
| 5 | +below. |
| 6 | +
|
| 7 | + ID Name Repository & Target Ratio #S #F |
| 8 | + 1 ecoli UCI, target: imU 8.6:1 336 7 |
| 9 | + 2 optical_digits UCI, target: 8 9.1:1 5,620 64 |
| 10 | + 3 satimage UCI, target: 4 9.3:1 6,435 36 |
| 11 | + 4 pen_digits UCI, target: 5 9.4:1 10,992 16 |
| 12 | + 5 abalone UCI, target: 7 9.7:1 4,177 10 |
| 13 | + 6 sick_euthyroid UCI, target: sick euthyroid 9.8:1 3,163 42 |
| 14 | + 7 spectrometer UCI, target: >=44 11:1 531 93 |
| 15 | + 8 car_eval_34 UCI, target: good, v good 12:1 1,728 21 |
| 16 | + 9 isolet UCI, target: A, B 12:1 7,797 617 |
| 17 | + 10 us_crime UCI, target: >0.65 12:1 1,994 100 |
| 18 | + 11 yeast_ml8 LIBSVM, target: 8 13:1 2,417 103 |
| 19 | + 12 scene LIBSVM, target: >one label 13:1 2,407 294 |
| 20 | + 13 libras_move UCI, target: 1 14:1 360 90 |
| 21 | + 14 thyroid_sick UCI, target: sick 15:1 3,772 52 |
| 22 | + 15 coil_2000 KDD, CoIL, target: minority 16:1 9,822 85 |
| 23 | + 16 arrhythmia UCI, target: 06 17:1 452 278 |
| 24 | + 17 solar_flare_m0 UCI, target: M->0 19:1 1,389 32 |
| 25 | + 18 oil UCI, target: minority 22:1 937 49 |
| 26 | + 19 car_eval_4 UCI, target: vgood 26:1 1,728 21 |
| 27 | + 20 wine_quality UCI, wine, target: <=4 26:1 4,898 11 |
| 28 | + 21 letter_img UCI, target: Z 26:1 20,000 16 |
| 29 | + 22 yeast_me2 UCI, target: ME2 28:1 1,484 8 |
| 30 | + 23 webpage LIBSVM, w7a, target: minority 33:1 34,780 300 |
| 31 | + 24 ozone_level UCI, ozone, data 34:1 2,536 72 |
| 32 | + 25 mammography UCI, target: minority 42:1 11,183 6 |
| 33 | + 26 protein_homo KDD CUP 2004, minority 111:1 145,751 74 |
| 34 | + 27 abalone_19 UCI, target: 19 130:1 4,177 10 |
| 35 | +
|
| 36 | +References |
| 37 | +---------- |
| 38 | +.. [1] Ding, Zejin, "Diversified Ensemble Classifiers for Highly |
| 39 | + Imbalanced Data Learning and their Application in Bioinformatics." |
| 40 | + Dissertation, Georgia State University, (2011). |
| 41 | +
|
| 42 | +""" |
| 43 | + |
| 44 | +# Author: Guillaume Lemaitre |
| 45 | +# License: BSD 3 clause |
| 46 | + |
| 47 | +from collections import OrderedDict |
| 48 | +import tarfile |
| 49 | +from io import BytesIO |
| 50 | +import logging |
| 51 | +from os.path import join, isfile |
| 52 | +try: |
| 53 | + from urllib2 import urlopen |
| 54 | +except ImportError: |
| 55 | + from urllib.request import urlopen |
| 56 | + |
| 57 | +import numpy as np |
| 58 | + |
| 59 | +from sklearn.datasets import get_data_home |
| 60 | +from sklearn.datasets.base import Bunch |
| 61 | +from sklearn.utils.fixes import makedirs |
| 62 | +from sklearn.externals import six |
| 63 | +from sklearn.utils import check_random_state |
| 64 | + |
# Address of the Zenodo tarball that bundles all the benchmark datasets.
URL = 'https://zenodo.org/record/61452/files/' + \
      'benchmark-imbalanced-learn.tar.gz'

# Each dataset is looked up in the extracted archive under the name
# ``PRE_FILENAME + str(<dataset ID>) + POST_FILENAME``, e.g. ``x1data.npz``.
PRE_FILENAME, POST_FILENAME = 'x', 'data.npz'
| 69 | + |
# Names of the benchmark datasets, listed in ID order: the name stored at
# (zero-based) position ``i`` belongs to the dataset whose ID is ``i + 1``.
MAP_NAME_ID_KEYS = [
    'ecoli',
    'optical_digits',
    'satimage',
    'pen_digits',
    'abalone',
    'sick_euthyroid',
    'spectrometer',
    'car_eval_34',
    'isolet',
    'us_crime',
    'yeast_ml8',
    'scene',
    'libras_move',
    'thyroid_sick',
    'coil_2000',
    'arrhythmia',
    'solar_flare_m0',
    'oil',
    'car_eval_4',
    'wine_quality',
    'letter_img',
    'yeast_me2',
    'webpage',
    'ozone_level',
    'mammography',
    'protein_homo',
    'abalone_19',
]

# Two-way lookup between a dataset name and its one-based ID. Both mappings
# iterate in ID order. Generator expressions are used (rather than a
# module-level ``for`` loop) so that no throwaway loop variable is left
# behind in the module namespace.
MAP_NAME_ID = OrderedDict(
    (name, idx) for idx, name in enumerate(MAP_NAME_ID_KEYS, 1))
MAP_ID_NAME = OrderedDict(
    (idx, name) for idx, name in enumerate(MAP_NAME_ID_KEYS, 1))
| 103 | + |
# Module-level logger. It is named after the module (instead of being the
# root logger) so that applications can filter or silence its messages
# without touching the logging configuration of everything else; records
# still propagate to the root logger's handlers by default.
logger = logging.getLogger(__name__)
| 105 | + |
| 106 | + |
def _resolve_dataset_names(filter_data):
    """Translate ``filter_data`` into the ordered list of dataset names."""
    if filter_data is None:
        return list(MAP_NAME_ID)

    names = []
    for item in filter_data:
        if isinstance(item, six.string_types):
            if item not in MAP_NAME_ID:
                raise ValueError('{} is not a dataset available. '
                                 'The available datasets are {}'.format(
                                     item, list(MAP_NAME_ID)))
            names.append(item)
        elif isinstance(item, int):
            # Valid IDs are exactly the keys of MAP_ID_NAME, so the bounds
            # follow the registry instead of being hard-coded.
            if item not in MAP_ID_NAME:
                raise ValueError('The dataset with the ID={} is not an '
                                 'available dataset. The IDs are '
                                 '{}'.format(item, list(MAP_ID_NAME)))
            names.append(MAP_ID_NAME[item])
        else:
            raise ValueError('The value in the tuple should be str or int.'
                             ' Got {} instead.'.format(type(item)))
    return names


def _download_and_extract(target_dir):
    """Download the archive at ``URL`` and extract it into ``target_dir``."""
    from contextlib import closing

    makedirs(target_dir, exist_ok=True)
    logger.warning("Downloading %s", URL)
    # ``closing`` is used because the object returned by the Python 2
    # ``urllib2.urlopen`` is not a context manager.
    with closing(urlopen(URL)) as response:
        archive = BytesIO(response.read())
    # NOTE(review): ``extractall`` trusts the member paths stored in the
    # downloaded archive; a tampered archive could write outside
    # ``target_dir``. Consider validating the members before extraction.
    with tarfile.open(fileobj=archive) as tar:
        tar.extractall(path=target_dir)


def fetch_datasets(data_home=None,
                   filter_data=None,
                   download_if_missing=True,
                   random_state=None,
                   shuffle=False):
    """Load the benchmark datasets from Zenodo, downloading it if necessary.

    Parameters
    ----------
    data_home : string, optional (default=None)
        Specify another download and cache folder for the datasets. By default
        all scikit-learn data is stored in '~/scikit_learn_data' subfolders.
        The benchmark files are looked up in the 'zenodo' subfolder of this
        folder.

    filter_data : tuple of str/int or None, optional (default=None)
        A tuple containing the ID or the name of the datasets to be returned.
        Refer to the table below to get the ID and name of the datasets.
        If None, all the datasets are returned.

    download_if_missing : boolean, optional (default=True)
        If False, raise an IOError if the data is not locally available
        instead of trying to download the data from the source site.

    random_state : int, RandomState instance or None, optional (default=None)
        Random state for shuffling the dataset.
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    shuffle : bool, optional (default=False)
        Whether to shuffle dataset.

    Returns
    -------
    datasets : OrderedDict of Bunch object,
        The keys are the dataset names and the order is defined by
        ``filter_data``. Each Bunch object --- referred to as dataset ---
        has the following attributes:

    dataset.data : ndarray, shape (n_samples, n_features)

    dataset.target : ndarray, shape (n_samples, )

    dataset.DESCR : string
        Name of the dataset.

    Raises
    ------
    ValueError
        If an entry of ``filter_data`` is neither a known dataset name nor a
        known dataset ID, or is not a str or an int.

    IOError
        If a dataset file is missing and ``download_if_missing`` is False, or
        if it is still missing once the archive has been extracted.

    Notes
    -----
    This collection of datasets has been proposed in [1]_. The
    characteristics of the available datasets are presented in the table
    below.

    +--+--------------+-------------------------------+-------+---------+-----+
    |ID|Name          | Repository & Target           | Ratio | #S      | #F  |
    +==+==============+===============================+=======+=========+=====+
    |1 |ecoli         | UCI, target: imU              | 8.6:1 | 336     | 7   |
    +--+--------------+-------------------------------+-------+---------+-----+
    |2 |optical_digits| UCI, target: 8                | 9.1:1 | 5,620   | 64  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |3 |satimage      | UCI, target: 4                | 9.3:1 | 6,435   | 36  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |4 |pen_digits    | UCI, target: 5                | 9.4:1 | 10,992  | 16  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |5 |abalone       | UCI, target: 7                | 9.7:1 | 4,177   | 10  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |6 |sick_euthyroid| UCI, target: sick euthyroid   | 9.8:1 | 3,163   | 42  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |7 |spectrometer  | UCI, target: >=44             | 11:1  | 531     | 93  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |8 |car_eval_34   | UCI, target: good, v good     | 12:1  | 1,728   | 21  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |9 |isolet        | UCI, target: A, B             | 12:1  | 7,797   | 617 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |10|us_crime      | UCI, target: >0.65            | 12:1  | 1,994   | 100 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |11|yeast_ml8     | LIBSVM, target: 8             | 13:1  | 2,417   | 103 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |12|scene         | LIBSVM, target: >one label    | 13:1  | 2,407   | 294 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |13|libras_move   | UCI, target: 1                | 14:1  | 360     | 90  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |14|thyroid_sick  | UCI, target: sick             | 15:1  | 3,772   | 52  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |15|coil_2000     | KDD, CoIL, target: minority   | 16:1  | 9,822   | 85  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |16|arrhythmia    | UCI, target: 06               | 17:1  | 452     | 278 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |17|solar_flare_m0| UCI, target: M->0             | 19:1  | 1,389   | 32  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |18|oil           | UCI, target: minority         | 22:1  | 937     | 49  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |19|car_eval_4    | UCI, target: vgood            | 26:1  | 1,728   | 21  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |20|wine_quality  | UCI, wine, target: <=4        | 26:1  | 4,898   | 11  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |21|letter_img    | UCI, target: Z                | 26:1  | 20,000  | 16  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |22|yeast_me2     | UCI, target: ME2              | 28:1  | 1,484   | 8   |
    +--+--------------+-------------------------------+-------+---------+-----+
    |23|webpage       | LIBSVM, w7a, target: minority | 33:1  | 34,780  | 300 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |24|ozone_level   | UCI, ozone, data              | 34:1  | 2,536   | 72  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |25|mammography   | UCI, target: minority         | 42:1  | 11,183  | 6   |
    +--+--------------+-------------------------------+-------+---------+-----+
    |26|protein_homo  | KDD CUP 2004, minority        | 111:1 | 145,751 | 74  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |27|abalone_19    | UCI, target: 19               | 130:1 | 4,177   | 10  |
    +--+--------------+-------------------------------+-------+---------+-----+

    References
    ----------
    .. [1] Ding, Zejin, "Diversified Ensemble Classifiers for Highly
       Imbalanced Data Learning and their Application in Bioinformatics."
       Dissertation, Georgia State University, (2011).
    """

    data_home = get_data_home(data_home=data_home)
    zenodo_dir = join(data_home, "zenodo")
    datasets = OrderedDict()

    for name in _resolve_dataset_names(filter_data):
        filename = join(
            zenodo_dir,
            PRE_FILENAME + str(MAP_NAME_ID[name]) + POST_FILENAME)

        if not isfile(filename):
            if not download_if_missing:
                raise IOError(
                    "Data not found and `download_if_missing` is False")
            # The archive holds every dataset, so a single download makes
            # the files of the remaining iterations available as well.
            _download_and_extract(zenodo_dir)
            if not isfile(filename):
                raise IOError("Data not found after extracting the "
                              "archive: {}".format(filename))

        # The arrays are read while the archive is open; the ``with`` block
        # then releases the underlying file handle.
        with np.load(filename) as data:
            X, y = data['data'], data['label']

        if shuffle:
            ind = np.arange(X.shape[0])
            # The generator is derived from ``random_state`` for each
            # dataset: with an int seed, the permutation applied to a
            # dataset does not depend on which other datasets are requested.
            rng = check_random_state(random_state)
            rng.shuffle(ind)
            X = X[ind]
            y = y[ind]

        datasets[name] = Bunch(data=X, target=y, DESCR=name)

    return datasets
0 commit comments