Closed
Description
I know that there are NA issues for indices, but I have never encountered this behavior before (pandas master):
(Fully functional code, depending on the requests and xlrd libraries and a Berlin open data set.)
# Reproduction script: downloads a spreadsheet, splits several columns into
# an id part and a label part, and moves everything except 'betrag' into a
# MultiIndex. The groupby calls that fail are issued against the resulting df.
import pandas as pd
import requests
# Fetch the spreadsheet as a streamed response so the raw body can be handed
# to the Excel reader below.
the_data = requests.get('http://www.berlin.de/imperia/md/content/senatsverwaltungen/finanzen/haushalt/ansatzn2013.xls?download.html',
stream=True)
xl_file = pd.io.parsers.ExcelFile(the_data.raw) # only works with xlrd installed
df = xl_file.parse('Ansatz 2013')
# Keep an untouched copy of the parsed sheet; df is modified from here on.
original_data = df.copy()
# Replace the sheet's column headers with eight short names.
df.columns = ['bereich', 'einzelplan', 'kapitel', 'titelart', 'titel', 'titelbezeichnung', 'funktion', 'betrag_tausend']
# For each of the next four columns, a fixed-position slice is stored as a
# numeric '<name>_id' column and the column itself is overwritten with the
# remainder of the string.
# NOTE(review): the slice offsets suggest each cell starts with a short
# delimited code followed by a label - confirm against the spreadsheet.
df['bereich_id'] = df.bereich.str.slice(start=1, stop=3).astype(int)
df['bereich'] = df.bereich.str.slice(start=5)
df['einzelplan_id'] = df.einzelplan.str.slice(start=1, stop=3).astype(int)
df['einzelplan'] = df.einzelplan.str.slice(start=5)
df['kapitel_id'] = df.kapitel.str.slice(start=1, stop=5).astype(int)
df['kapitel'] = df.kapitel.str.slice(start=7)
# Unlike the other ids this one is cast to float; presumably because the
# missing value noted below cannot be represented as an int.
df['funktion_id'] = df.funktion.str.slice(start=1, stop=4).astype(float) # one missing value
df['funktion'] = df.funktion.str.slice(start=6)
# Derive 'betrag' by scaling 'betrag_tausend' by 1000, then drop the source
# column.
df['betrag'] = df.betrag_tausend * 1000
del df['betrag_tausend']
# Reorder the columns so each label sits next to its id and 'betrag' is last.
df = df[['bereich', 'bereich_id', 'einzelplan', 'einzelplan_id',
'kapitel', 'kapitel_id', 'funktion', 'funktion_id',
'titel', 'titelbezeichnung', 'titelart', 'betrag']]
# Move every column except 'betrag' into the index (in place), leaving
# 'betrag' as the only data column. Both 'funktion' and 'funktion_id' become
# index levels here.
df.set_index(['bereich', 'bereich_id', 'einzelplan', 'einzelplan_id',
'kapitel', 'kapitel_id', 'funktion', 'funktion_id',
'titel', 'titelbezeichnung', 'titelart'], inplace=True)
Now calling
df.groupby(level=['bereich']).sum()
works nicely whereas
df.groupby(level=['funktion_id']).sum()
results in
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-3-d8991eeaebb9> in <module>()
----> 1 df.groupby(level=['funktion_id']).sum()
/usr/local/lib/python2.7/dist-packages/pandas-0.10.1.dev_dcd9df7-py2.7-linux-x86_64.egg/pandas/core/generic.pyc in groupby(self, by, axis, level, as_index, sort, group_keys)
132 from pandas.core.groupby import groupby
133 return groupby(self, by, axis=axis, level=level, as_index=as_index,
--> 134 sort=sort, group_keys=group_keys)
135
136 def asfreq(self, freq, method=None, how=None, normalize=False):
/usr/local/lib/python2.7/dist-packages/pandas-0.10.1.dev_dcd9df7-py2.7-linux-x86_64.egg/pandas/core/groupby.pyc in groupby(obj, by, **kwds)
498 raise TypeError('invalid type: %s' % type(obj))
499
--> 500 return klass(obj, by, **kwds)
501
502
/usr/local/lib/python2.7/dist-packages/pandas-0.10.1.dev_dcd9df7-py2.7-linux-x86_64.egg/pandas/core/groupby.pyc in __init__(self, obj, keys, axis, level, grouper, exclusions, selection, as_index, sort, group_keys)
189 if grouper is None:
190 grouper, exclusions = _get_grouper(obj, keys, axis=axis,
--> 191 level=level, sort=sort)
192
193 self.grouper = grouper
/usr/local/lib/python2.7/dist-packages/pandas-0.10.1.dev_dcd9df7-py2.7-linux-x86_64.egg/pandas/core/groupby.pyc in _get_grouper(obj, key, axis, level, sort)
1244 name = gpr
1245 gpr = obj[gpr]
-> 1246 ping = Grouping(group_axis, gpr, name=name, level=level, sort=sort)
1247 groupings.append(ping)
1248
/usr/local/lib/python2.7/dist-packages/pandas-0.10.1.dev_dcd9df7-py2.7-linux-x86_64.egg/pandas/core/groupby.pyc in __init__(self, index, grouper, name, level, sort)
1115 self._labels = labels
1116 self._group_index = level_index
-> 1117 self.grouper = level_index.take(labels)
1118 else:
1119 if isinstance(self.grouper, (list, tuple)):
/usr/local/lib/python2.7/dist-packages/pandas-0.10.1.dev_dcd9df7-py2.7-linux-x86_64.egg/pandas/core/index.pyc in take(self, indexer, axis)
415 """
416 indexer = com._ensure_platform_int(indexer)
--> 417 taken = self.view(np.ndarray).take(indexer)
418 return self._constructor(taken, name=self.name)
419
IndexError: index 138 is out of bounds for axis 0 with size 138
and
df.groupby(level=['funktion']).sum()
segfaults