Skip to content

Weird IndexError/segfault for df.groupby(level="levelname") if NA in level #2616

Closed
@gerigk

Description

@gerigk

I know that there are NA issues with indices, but I have never encountered this behavior before (pandas master):
(fully functional code; it depends on the requests and xlrd libraries and a Berlin open data set)

# Reproduction script: download a public Excel sheet, split its combined
# columns into id/label pairs, and move every descriptive column into the
# index so that only 'betrag' remains as a data column.
import pandas as pd
import requests
# Fetch the spreadsheet as a stream so the raw response can be handed to pandas
# (requires network access).
the_data = requests.get('http://www.berlin.de/imperia/md/content/senatsverwaltungen/finanzen/haushalt/ansatzn2013.xls?download.html',
    stream=True)
xl_file = pd.io.parsers.ExcelFile(the_data.raw)  # only works with xlrd installed
df = xl_file.parse('Ansatz 2013')
# Keep an untouched copy of the parsed sheet; it is not used again below.
original_data = df.copy()
# Replace the sheet's own headers with short names (the sheet must have exactly
# these eight columns, in this order).
df.columns = ['bereich', 'einzelplan', 'kapitel', 'titelart', 'titel', 'titelbezeichnung', 'funktion', 'betrag_tausend']
# Each of the following column pairs takes a fixed-width slice as a numeric id
# and the rest of the string as the label.
# NOTE(review): the exact cell format is not shown here; presumably something
# like '<char><id><sep><label>' -- confirm against the data.
df['bereich_id'] = df.bereich.str.slice(start=1, stop=3).astype(int)
df['bereich'] = df.bereich.str.slice(start=5)
df['einzelplan_id'] = df.einzelplan.str.slice(start=1, stop=3).astype(int)
df['einzelplan'] = df.einzelplan.str.slice(start=5)
df['kapitel_id'] = df.kapitel.str.slice(start=1, stop=5).astype(int)
df['kapitel'] = df.kapitel.str.slice(start=7)
# Unlike the other ids this one is cast to float, not int; per the author's
# note the column has one missing value (presumably it ends up as NaN).
df['funktion_id'] = df.funktion.str.slice(start=1, stop=4).astype(float) # one missing value
df['funktion'] = df.funktion.str.slice(start=6)
# The source column name suggests amounts in thousands; scale by 1000 and drop
# the original column.
df['betrag'] = df.betrag_tausend * 1000
del df['betrag_tausend']
# Reorder the columns so each label sits next to its id, with 'betrag' last.
df = df[['bereich', 'bereich_id', 'einzelplan', 'einzelplan_id',
    'kapitel', 'kapitel_id', 'funktion', 'funktion_id',
    'titel', 'titelbezeichnung', 'titelart', 'betrag']]
# Move everything except 'betrag' into the index. The 'funktion' and
# 'funktion_id' levels are the ones the groupby calls below fail on.
df.set_index(['bereich', 'bereich_id', 'einzelplan', 'einzelplan_id',
    'kapitel', 'kapitel_id', 'funktion', 'funktion_id',
    'titel', 'titelbezeichnung', 'titelart'], inplace=True)

Now, calling

df.groupby(level=['bereich']).sum()

works nicely whereas

df.groupby(level=['funktion_id']).sum()

results in

---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-3-d8991eeaebb9> in <module>()
----> 1 df.groupby(level=['funktion_id']).sum()

/usr/local/lib/python2.7/dist-packages/pandas-0.10.1.dev_dcd9df7-py2.7-linux-x86_64.egg/pandas/core/generic.pyc in groupby(self, by, axis, level, as_index, sort, group_keys)
    132         from pandas.core.groupby import groupby
    133         return groupby(self, by, axis=axis, level=level, as_index=as_index,
--> 134                        sort=sort, group_keys=group_keys)
    135 
    136     def asfreq(self, freq, method=None, how=None, normalize=False):

/usr/local/lib/python2.7/dist-packages/pandas-0.10.1.dev_dcd9df7-py2.7-linux-x86_64.egg/pandas/core/groupby.pyc in groupby(obj, by, **kwds)
    498         raise TypeError('invalid type: %s' % type(obj))
    499 
--> 500     return klass(obj, by, **kwds)
    501 
    502 

/usr/local/lib/python2.7/dist-packages/pandas-0.10.1.dev_dcd9df7-py2.7-linux-x86_64.egg/pandas/core/groupby.pyc in __init__(self, obj, keys, axis, level, grouper, exclusions, selection, as_index, sort, group_keys)
    189         if grouper is None:
    190             grouper, exclusions = _get_grouper(obj, keys, axis=axis,
--> 191                                                level=level, sort=sort)
    192 
    193         self.grouper = grouper

/usr/local/lib/python2.7/dist-packages/pandas-0.10.1.dev_dcd9df7-py2.7-linux-x86_64.egg/pandas/core/groupby.pyc in _get_grouper(obj, key, axis, level, sort)
   1244             name = gpr
   1245             gpr = obj[gpr]
-> 1246         ping = Grouping(group_axis, gpr, name=name, level=level, sort=sort)
   1247         groupings.append(ping)
   1248 

/usr/local/lib/python2.7/dist-packages/pandas-0.10.1.dev_dcd9df7-py2.7-linux-x86_64.egg/pandas/core/groupby.pyc in __init__(self, index, grouper, name, level, sort)
   1115                 self._labels = labels
   1116                 self._group_index = level_index
-> 1117                 self.grouper = level_index.take(labels)
   1118         else:
   1119             if isinstance(self.grouper, (list, tuple)):

/usr/local/lib/python2.7/dist-packages/pandas-0.10.1.dev_dcd9df7-py2.7-linux-x86_64.egg/pandas/core/index.pyc in take(self, indexer, axis)
    415         """
    416         indexer = com._ensure_platform_int(indexer)
--> 417         taken = self.view(np.ndarray).take(indexer)
    418         return self._constructor(taken, name=self.name)
    419 

IndexError: index 138 is out of bounds for axis 0 with size 138

and

df.groupby(level=['funktion']).sum()

segfaults

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions