Description
When performing a groupby.apply on a DataFrame with a float index, I receive a KeyError, depending on whether or not the index has the same ordering as the column I am grouping on.
Code Sample
Broken (the index is not ordered the same way as col):
import pandas as pd
df = pd.DataFrame({"col": [1, 2, 3,]}, index=[0.1, 0.3, 0.2,])
df.groupby("col").apply(lambda x: x)
Working (mind the sorting of col vs. index; both of the following examples succeed):
import pandas as pd
df = pd.DataFrame({"col": [1, 2, 3,]}, index=[0.1, 0.2, 0.3,])
df.groupby("col").apply(lambda x: x)
import pandas as pd
df = pd.DataFrame({"col": [1, 4, 3,]}, index=[0.1, 0.4, 0.2,])
df.groupby("col").apply(lambda x: x)
Example traceback
KeyError Traceback (most recent call last)
<ipython-input-14-fac5566be46e> in <module>
----> 1 df.groupby("col").apply(lambda x: x.copy())
~/workspace/pandas/pandas/core/groupby/groupby.py in apply(self, func, *args, **kwargs)
854 with option_context("mode.chained_assignment", None):
855 try:
--> 856 result = self._python_apply_general(f, self._selected_obj)
857 except TypeError:
858 # gh-20949
~/workspace/pandas/pandas/core/groupby/groupby.py in _python_apply_general(self, f, data)
887 data after applying f
888 """
--> 889 keys, values, mutated = self.grouper.apply(f, data, self.axis)
890
891 return self._wrap_applied_output(
~/workspace/pandas/pandas/core/groupby/ops.py in apply(self, f, data, axis)
176 ):
177 try:
--> 178 result_values, mutated = splitter.fast_apply(f, sdata, group_keys)
179
180 except libreduction.InvalidApply as err:
~/workspace/pandas/pandas/core/groupby/ops.py in fast_apply(self, f, sdata, names)
962 # must return keys::list, values::list, mutated::bool
963 starts, ends = lib.generate_slices(self.slabels, self.ngroups)
--> 964 return libreduction.apply_frame_axis0(sdata, f, names, starts, ends)
965
966 def _chop(self, sdata: DataFrame, slice_obj: slice) -> DataFrame:
~/workspace/pandas/pandas/_libs/reduction.pyx in pandas._libs.reduction.apply_frame_axis0()
482 results = []
483
--> 484 slider = BlockSlider(frame)
485
486 mutated = False
~/workspace/pandas/pandas/_libs/reduction.pyx in pandas._libs.reduction.BlockSlider.__init__()
549
550 self.frame = frame
--> 551 self.dummy = frame[:0]
552 self.index = self.dummy.index
553
~/workspace/pandas/pandas/core/frame.py in __getitem__(self, key)
2810
2811 # Do we have a slicer (on rows)?
-> 2812 indexer = convert_to_index_sliceable(self, key)
2813 if indexer is not None:
2814 # either we have a slice or we have a string that can be converted
~/workspace/pandas/pandas/core/indexing.py in convert_to_index_sliceable(obj, key)
2113 idx = obj.index
2114 if isinstance(key, slice):
-> 2115 return idx._convert_slice_indexer(key, kind="getitem")
2116
2117 elif isinstance(key, str):
~/workspace/pandas/pandas/core/indexes/numeric.py in _convert_slice_indexer(self, key, kind)
382 # We always treat __getitem__ slicing as label-based
383 # translate to locations
--> 384 return self.slice_indexer(key.start, key.stop, key.step, kind=kind)
385
386 # ----------------------------------------------------------------
~/workspace/pandas/pandas/core/indexes/base.py in slice_indexer(self, start, end, step, kind)
4940 slice(1, 3, None)
4941 """
-> 4942 start_slice, end_slice = self.slice_locs(start, end, step=step, kind=kind)
4943
4944 # return a slice
~/workspace/pandas/pandas/core/indexes/base.py in slice_locs(self, start, end, step, kind)
5147 end_slice = None
5148 if end is not None:
-> 5149 end_slice = self.get_slice_bound(end, "right", kind)
5150 if end_slice is None:
5151 end_slice = len(self)
~/workspace/pandas/pandas/core/indexes/base.py in get_slice_bound(self, label, side, kind)
5063 except ValueError:
5064 # raise the original KeyError
-> 5065 raise err
5066
5067 if isinstance(slc, np.ndarray):
~/workspace/pandas/pandas/core/indexes/base.py in get_slice_bound(self, label, side, kind)
5057 # we need to look up the label
5058 try:
-> 5059 slc = self.get_loc(label)
5060 except KeyError as err:
5061 try:
~/workspace/pandas/pandas/core/indexes/numeric.py in get_loc(self, key, method, tolerance)
444 return nan_idxs
445
--> 446 return super().get_loc(key, method=method, tolerance=tolerance)
447
448 @cache_readonly
~/workspace/pandas/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
2876 return self._engine.get_loc(casted_key)
2877 except KeyError as err:
-> 2878 raise KeyError(key) from err
2879
2880 if tolerance is not None:
KeyError: 0
Output of pd.show_versions()
INSTALLED VERSIONS
commit : 043b609
python : 3.8.2.final.0
python-bits : 64
OS : Darwin
OS-release : 18.7.0
Version : Darwin Kernel Version 18.7.0: Mon Feb 10 21:08:45 PST 2020; root:xnu-4903.278.28~1/RELEASE_X86_64
machine : x86_64
processor : i386
byteorder : little
LC_ALL : None
LANG : None
LOCALE : None.UTF-8
pandas : 1.1.0.dev0+1708.g043b60920
numpy : 1.18.4
pytz : 2020.1
dateutil : 2.8.1
pip : 20.1.1
setuptools : 47.1.0.post20200528
Cython : 0.29.19
pytest : 5.4.2
hypothesis : None
sphinx : None
blosc : None
feather : None
xlsxwriter : None
lxml.etree : None
html5lib : None
pymysql : None
psycopg2 : None
jinja2 : 2.11.2
IPython : 7.14.0
pandas_datareader: None
bs4 : None
bottleneck : None
fastparquet : None
gcsfs : None
matplotlib : 3.2.1
numexpr : None
odfpy : None
openpyxl : None
pandas_gbq : None
pyarrow : None
pytables : None
pyxlsb : None
s3fs : None
scipy : 1.4.1
sqlalchemy : None
tables : None
tabulate : None
xarray : None
xlrd : None
xlwt : 1.3.0
numba : None
- I have checked that this issue has not already been reported.
- I have confirmed this bug exists on the latest version of pandas.
- (optional) I have confirmed this bug exists on the master branch of pandas (043b609).