Skip to content

BUG: Groupby.apply raises KeyError for Float64Index depending on order of index and column #34455

Closed
@fjetter

Description

@fjetter

When performing a groupby.apply on a DataFrame with a float index, I get a KeyError depending on whether the index has the same ordering as the column I am grouping on.


Code Sample

broken

import pandas as pd

df = pd.DataFrame({"col": [1, 2, 3,]}, index=[0.1, 0.3, 0.2,])
df.groupby("col").apply(lambda x: x)

working (mind the sorting of col vs index)

import pandas as pd

df = pd.DataFrame({"col": [1, 2, 3,]}, index=[0.1, 0.2, 0.3,])
df.groupby("col").apply(lambda x: x)

import pandas as pd

df = pd.DataFrame({"col": [1, 4, 3,]}, index=[0.1, 0.4, 0.2,])
df.groupby("col").apply(lambda x: x)

Example traceback

KeyError                                  Traceback (most recent call last)
<ipython-input-14-fac5566be46e> in <module>
----> 1 df.groupby("col").apply(lambda x: x.copy())

~/workspace/pandas/pandas/core/groupby/groupby.py in apply(self, func, *args, **kwargs)
    854         with option_context("mode.chained_assignment", None):
    855             try:
--> 856                 result = self._python_apply_general(f, self._selected_obj)
    857             except TypeError:
    858                 # gh-20949

~/workspace/pandas/pandas/core/groupby/groupby.py in _python_apply_general(self, f, data)
    887             data after applying f
    888         """
--> 889         keys, values, mutated = self.grouper.apply(f, data, self.axis)
    890
    891         return self._wrap_applied_output(

~/workspace/pandas/pandas/core/groupby/ops.py in apply(self, f, data, axis)
    176         ):
    177             try:
--> 178                 result_values, mutated = splitter.fast_apply(f, sdata, group_keys)
    179
    180             except libreduction.InvalidApply as err:

~/workspace/pandas/pandas/core/groupby/ops.py in fast_apply(self, f, sdata, names)
    962         # must return keys::list, values::list, mutated::bool
    963         starts, ends = lib.generate_slices(self.slabels, self.ngroups)
--> 964         return libreduction.apply_frame_axis0(sdata, f, names, starts, ends)
    965
    966     def _chop(self, sdata: DataFrame, slice_obj: slice) -> DataFrame:

~/workspace/pandas/pandas/_libs/reduction.pyx in pandas._libs.reduction.apply_frame_axis0()
    482     results = []
    483
--> 484     slider = BlockSlider(frame)
    485
    486     mutated = False

~/workspace/pandas/pandas/_libs/reduction.pyx in pandas._libs.reduction.BlockSlider.__init__()
    549
    550         self.frame = frame
--> 551         self.dummy = frame[:0]
    552         self.index = self.dummy.index
    553

~/workspace/pandas/pandas/core/frame.py in __getitem__(self, key)
   2810
   2811         # Do we have a slicer (on rows)?
-> 2812         indexer = convert_to_index_sliceable(self, key)
   2813         if indexer is not None:
   2814             # either we have a slice or we have a string that can be converted

~/workspace/pandas/pandas/core/indexing.py in convert_to_index_sliceable(obj, key)
   2113     idx = obj.index
   2114     if isinstance(key, slice):
-> 2115         return idx._convert_slice_indexer(key, kind="getitem")
   2116
   2117     elif isinstance(key, str):

~/workspace/pandas/pandas/core/indexes/numeric.py in _convert_slice_indexer(self, key, kind)
    382         # We always treat __getitem__ slicing as label-based
    383         # translate to locations
--> 384         return self.slice_indexer(key.start, key.stop, key.step, kind=kind)
    385
    386     # ----------------------------------------------------------------

~/workspace/pandas/pandas/core/indexes/base.py in slice_indexer(self, start, end, step, kind)
   4940         slice(1, 3, None)
   4941         """
-> 4942         start_slice, end_slice = self.slice_locs(start, end, step=step, kind=kind)
   4943
   4944         # return a slice

~/workspace/pandas/pandas/core/indexes/base.py in slice_locs(self, start, end, step, kind)
   5147         end_slice = None
   5148         if end is not None:
-> 5149             end_slice = self.get_slice_bound(end, "right", kind)
   5150         if end_slice is None:
   5151             end_slice = len(self)

~/workspace/pandas/pandas/core/indexes/base.py in get_slice_bound(self, label, side, kind)
   5063             except ValueError:
   5064                 # raise the original KeyError
-> 5065                 raise err
   5066
   5067         if isinstance(slc, np.ndarray):

~/workspace/pandas/pandas/core/indexes/base.py in get_slice_bound(self, label, side, kind)
   5057         # we need to look up the label
   5058         try:
-> 5059             slc = self.get_loc(label)
   5060         except KeyError as err:
   5061             try:

~/workspace/pandas/pandas/core/indexes/numeric.py in get_loc(self, key, method, tolerance)
    444             return nan_idxs
    445
--> 446         return super().get_loc(key, method=method, tolerance=tolerance)
    447
    448     @cache_readonly

~/workspace/pandas/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   2876                 return self._engine.get_loc(casted_key)
   2877             except KeyError as err:
-> 2878                 raise KeyError(key) from err
   2879
   2880         if tolerance is not None:

KeyError: 0

Output of pd.show_versions()

INSTALLED VERSIONS

commit : 043b609
python : 3.8.2.final.0
python-bits : 64
OS : Darwin
OS-release : 18.7.0
Version : Darwin Kernel Version 18.7.0: Mon Feb 10 21:08:45 PST 2020; root:xnu-4903.278.28~1/RELEASE_X86_64
machine : x86_64
processor : i386
byteorder : little
LC_ALL : None
LANG : None
LOCALE : None.UTF-8

pandas : 1.1.0.dev0+1708.g043b60920
numpy : 1.18.4
pytz : 2020.1
dateutil : 2.8.1
pip : 20.1.1
setuptools : 47.1.0.post20200528
Cython : 0.29.19
pytest : 5.4.2
hypothesis : None
sphinx : None
blosc : None
feather : None
xlsxwriter : None
lxml.etree : None
html5lib : None
pymysql : None
psycopg2 : None
jinja2 : 2.11.2
IPython : 7.14.0
pandas_datareader: None
bs4 : None
bottleneck : None
fastparquet : None
gcsfs : None
matplotlib : 3.2.1
numexpr : None
odfpy : None
openpyxl : None
pandas_gbq : None
pyarrow : None
pytables : None
pyxlsb : None
s3fs : None
scipy : 1.4.1
sqlalchemy : None
tables : None
tabulate : None
xarray : None
xlrd : None
xlwt : 1.3.0
numba : None

  • I have checked that this issue has not already been reported.

  • I have confirmed this bug exists on the latest version of pandas.

  • (optional) I have confirmed this bug exists on the master branch of pandas (043b609)

Metadata

Metadata

Assignees

Labels

Apply (Apply, Aggregate, Transform, Map), Bug, Groupby

Type

No type

Projects

No projects

Milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions