Skip to content

BUG: Series.groupby fails when grouping on MultiIndex with nulls in first level #47348

Closed
@charlesbluca

Description

@charlesbluca

Pandas version checks

  • I have checked that this issue has not already been reported.

  • I have confirmed this bug exists on the latest version of pandas.

  • I have confirmed this bug exists on the main branch of pandas.

Reproducible Example

import pandas as pd

arr = [list(range(4)) * 4, list(range(2)) * 8]

# insert nulls into first level of MultiIndex
arr[0][2] = None

ser = pd.Series(
    list(range(16)),
    index=pd.MultiIndex.from_arrays(arr, names=("a", "b"))
)

ser.groupby(level=[0, 1])

Issue Description

When calling `groupby` on all levels of a Series whose MultiIndex contains nulls in its first level, I get the following traceback:

TypeError                                 Traceback (most recent call last)
Input In [1], in <cell line: 13>()
      6 arr[1][4] = None
      8 ser = pd.Series(
      9     list(range(16)),
     10     index=pd.MultiIndex.from_arrays(arr, names=("a", "b"))
     11 )
---> 13 ser.groupby(level=[0, 1])

File /datasets/charlesb/dev/pandas/pandas/pandas/core/series.py:1956, in Series.groupby(self, by, axis, level, as_index, sort, group_keys, squeeze, observed, dropna)
   1953     raise TypeError("You have to supply one of 'by' and 'level'")
   1954 axis = self._get_axis_number(axis)
-> 1956 return SeriesGroupBy(
   1957     obj=self,
   1958     keys=by,
   1959     axis=axis,
   1960     level=level,
   1961     as_index=as_index,
   1962     sort=sort,
   1963     group_keys=group_keys,
   1964     squeeze=squeeze,
   1965     observed=observed,
   1966     dropna=dropna,
   1967 )

File /datasets/charlesb/dev/pandas/pandas/pandas/core/groupby/groupby.py:937, in GroupBy.__init__(self, obj, keys, axis, level, grouper, exclusions, selection, as_index, sort, group_keys, squeeze, observed, mutated, dropna)
    934 if grouper is None:
    935     from pandas.core.groupby.grouper import get_grouper
--> 937     grouper, exclusions, obj = get_grouper(
    938         obj,
    939         keys,
    940         axis=axis,
    941         level=level,
    942         sort=sort,
    943         observed=observed,
    944         mutated=self.mutated,
    945         dropna=self.dropna,
    946     )
    948 self.obj = obj
    949 self.axis = obj._get_axis_number(axis)

File /datasets/charlesb/dev/pandas/pandas/pandas/core/groupby/grouper.py:892, in get_grouper(obj, key, axis, level, sort, observed, mutated, validate, dropna)
    887         in_axis = False
    889     # create the Grouping
    890     # allow us to passing the actual Grouping as the gpr
    891     ping = (
--> 892         Grouping(
    893             group_axis,
    894             gpr,
    895             obj=obj,
    896             level=level,
    897             sort=sort,
    898             observed=observed,
    899             in_axis=in_axis,
    900             dropna=dropna,
    901         )
    902         if not isinstance(gpr, Grouping)
    903         else gpr
    904     )
    906     groupings.append(ping)
    908 if len(groupings) == 0 and len(obj):

File /datasets/charlesb/dev/pandas/pandas/pandas/core/groupby/grouper.py:504, in Grouping.__init__(self, index, grouper, obj, level, sort, observed, in_axis, dropna)
    496     mapper = self.grouping_vector
    497     # In extant tests, the new self.grouping_vector matches
    498     #  `index.get_level_values(ilevel)` whenever
    499     #  mapper is None and isinstance(index, MultiIndex)
    500     (
    501         self.grouping_vector,  # Index
    502         self._codes,
    503         self._group_index,
--> 504     ) = index._get_grouper_for_level(mapper, level=ilevel, dropna=dropna)
    506 # a passed Grouper like, directly get the grouper in the same way
    507 # as single grouper groupby, use the group_info to get codes
    508 elif isinstance(self.grouping_vector, Grouper):
    509     # get the new grouper; we already have disambiguated
    510     # what key/level refer to exactly, don't need to
    511     # check again as we have by this point converted these
    512     # to an actual value (rather than a pd.Grouper)

File /datasets/charlesb/dev/pandas/pandas/pandas/core/indexes/multi.py:1516, in MultiIndex._get_grouper_for_level(self, mapper, level, dropna)
   1514     # Handle group mapping function and return
   1515     level_values = self.levels[level].take(indexer)
-> 1516     grouper = level_values.map(mapper)
   1517     return grouper, None, None
   1519 values = self.get_level_values(level)

File /datasets/charlesb/dev/pandas/pandas/pandas/core/indexes/base.py:6369, in Index.map(self, mapper, na_action)
   6349 """
   6350 Map values using an input mapping or function.
   6351 
   (...)
   6365     a MultiIndex will be returned.
   6366 """
   6367 from pandas.core.indexes.multi import MultiIndex
-> 6369 new_values = self._map_values(mapper, na_action=na_action)
   6371 # we can return a MultiIndex
   6372 if new_values.size and isinstance(new_values[0], tuple):

File /datasets/charlesb/dev/pandas/pandas/pandas/core/base.py:882, in IndexOpsMixin._map_values(self, mapper, na_action)
    879         raise ValueError(msg)
    881 # mapper is a function
--> 882 new_values = map_f(values, mapper)
    884 return new_values

File /datasets/charlesb/dev/pandas/pandas/pandas/_libs/lib.pyx:2898, in pandas._libs.lib.map_infer()
   2896     result[i] = arr[i]
   2897     continue
-> 2898 val = f(arr[i])
   2899 
   2900 if cnp.PyArray_IsZeroDim(val):

TypeError: 'numpy.ndarray' object is not callable

Expected Behavior

I would expect to successfully get back a groupby object. That is what happens when the nulls are instead in the second level of the MultiIndex:

import pandas as pd

arr = [list(range(4)) * 4, list(range(2)) * 8]

# insert nulls into second level of MultiIndex
arr[1][2] = None

ser = pd.Series(
    list(range(16)),
    index=pd.MultiIndex.from_arrays(arr, names=("a", "b"))
)

ser.groupby(level=[0, 1])
<pandas.core.groupby.generic.SeriesGroupBy object at 0x7f88c4192730>

Installed Versions

INSTALLED VERSIONS

commit : 696e9bd
python : 3.8.13.final.0
python-bits : 64
OS : Linux
OS-release : 4.15.0-1083-oracle
Version : #91-Ubuntu SMP Mon Oct 25 06:45:22 UTC 2021
machine : x86_64
processor : x86_64
byteorder : little
LC_ALL : None
LANG : en_US.UTF-8
LOCALE : en_US.UTF-8

pandas : 1.5.0.dev0+953.g696e9bd04f
numpy : 1.22.4
pytz : 2022.1
dateutil : 2.8.2
setuptools : 62.3.4
pip : 22.1.2
Cython : 0.29.30
pytest : 7.1.2
hypothesis : 6.47.1
sphinx : 4.5.0
blosc : None
feather : None
xlsxwriter : 3.0.3
lxml.etree : 4.9.0
html5lib : 1.1
pymysql : None
psycopg2 : None
jinja2 : 3.0.3
IPython : 8.4.0
pandas_datareader: None
bs4 : 4.11.1
bottleneck : 1.3.4
brotli :
fastparquet : 0.8.1
fsspec : 2021.11.0
gcsfs : 2021.11.0
matplotlib : 3.5.2
numba : 0.53.1
numexpr : 2.8.0
odfpy : None
openpyxl : 3.0.9
pandas_gbq : None
pyarrow : 8.0.0
pyreadstat : 1.1.7
pyxlsb : None
s3fs : 2021.11.0
scipy : 1.8.1
snappy :
sqlalchemy : 1.4.37
tables : 3.7.0
tabulate : 0.8.9
xarray : 2022.3.0
xlrd : 2.0.1
xlwt : 1.3.0
zstandard : None

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions