Closed
Description
- I have checked that this issue has not already been reported.
- I have confirmed this bug exists on the latest version of pandas.
- (optional) I have confirmed this bug exists on the master branch of pandas.
`pandas.cut` fails for a `pandas.Series` with a non-unique index as input.
In [1]: import pandas as pd
...:
...: s = pd.Series([0, 1, 2, 3, 0], index=[0, 1, 2, 3, 0])
...:
...: cut = pd.cut(s, [0, 2, 4])
...: cut
Out[1]:
0 NaN
1 (0.0, 2.0]
2 (0.0, 2.0]
3 (2.0, 4.0]
0 NaN
dtype: category
Categories (2, interval[int64]): [(0, 2] < (2, 4]]
In [2]: cut = pd.cut(s, [0, 2, 4], include_lowest=True)
...: cut
/home/mlondschien/miniforge3/envs/quantcore.thek/lib/python3.9/site-packages/pandas/core/reshape/tile.py:410: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
ids[x == bins[0]] = 1
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
~/miniforge3/envs/quantcore.thek/lib/python3.9/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
3079 try:
-> 3080 return self._engine.get_loc(casted_key)
3081 except KeyError as err:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine._get_loc_duplicates()
pandas/_libs/index_class_helper.pxi in pandas._libs.index.Int64Engine._maybe_get_bool_indexer()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine._unpack_bool_indexer()
KeyError: 4
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
<ipython-input-5-00a66777dca1> in <module>
----> 1 cut = pd.cut(s, [0, 2, 4], include_lowest=True)
2 cut
~/miniforge3/envs/quantcore.thek/lib/python3.9/site-packages/pandas/core/reshape/tile.py in cut(x, bins, right, labels, retbins, precision, include_lowest, duplicates, ordered)
271 raise ValueError("bins must increase monotonically.")
272
--> 273 fac, bins = _bins_to_cuts(
274 x,
275 bins,
~/miniforge3/envs/quantcore.thek/lib/python3.9/site-packages/pandas/core/reshape/tile.py in _bins_to_cuts(x, bins, right, labels, precision, include_lowest, dtype, duplicates, ordered)
408
409 if include_lowest:
--> 410 ids[x == bins[0]] = 1
411
412 na_mask = isna(x) | (ids == len(bins)) | (ids == 0)
~/miniforge3/envs/quantcore.thek/lib/python3.9/site-packages/pandas/core/series.py in __getitem__(self, key)
851
852 elif key_is_scalar:
--> 853 return self._get_value(key)
854
855 if is_hashable(key):
~/miniforge3/envs/quantcore.thek/lib/python3.9/site-packages/pandas/core/series.py in _get_value(self, label, takeable)
959
960 # Similar to Index.get_value, but we do not fall back to positional
--> 961 loc = self.index.get_loc(label)
962 return self.index._get_values_for_loc(self, loc, label)
963
~/miniforge3/envs/quantcore.thek/lib/python3.9/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
3080 return self._engine.get_loc(casted_key)
3081 except KeyError as err:
-> 3082 raise KeyError(key) from err
3083
3084 if tolerance is not None:
KeyError: 4
In [3]: cut = pd.cut(s.to_numpy(), [0, 2, 4], include_lowest=True)
...: cut
Out[3]:
[(-0.001, 2.0], (-0.001, 2.0], (-0.001, 2.0], (2.0, 4.0], (-0.001, 2.0]]
Categories (2, interval[float64]): [(-0.001, 2.0] < (2.0, 4.0]]
import pandas as pd
s = pd.Series([0, 1, 2, 3, 0], index=[0, 1, 2, 3, 0])
cut = pd.cut(s, [0, 2, 4])
cut
cut = pd.cut(s, [0, 2, 4], include_lowest=True)
cut = pd.cut(s.to_numpy(), [0, 2, 4], include_lowest=True)
cut
In [8]: pd.show_versions()
INSTALLED VERSIONS
------------------
commit : 2cb96529396d93b46abab7bbc73a208e708c642e
python : 3.9.2.final.0
python-bits : 64
OS : Linux
OS-release : 5.4.0-74-generic
Version : #83-Ubuntu SMP Sat May 8 02:35:39 UTC 2021
machine : x86_64
processor : x86_64
byteorder : little
LC_ALL : None
LANG : en_US.UTF-8
LOCALE : en_US.UTF-8
pandas : 1.2.4
numpy : 1.19.5
pytz : 2021.1
dateutil : 2.8.1
pip : 21.1.1
setuptools : 49.6.0.post20210108
Cython : None
pytest : 6.2.3
hypothesis : None
sphinx : 3.5.4
blosc : None
feather : None
xlsxwriter : None
lxml.etree : None
html5lib : None
pymysql : None
psycopg2 : None
jinja2 : 2.11.3
IPython : 7.23.0
pandas_datareader: None
bs4 : None
bottleneck : None
fsspec : 2021.04.0
fastparquet : None
gcsfs : None
matplotlib : 3.4.2
numexpr : None
odfpy : None
openpyxl : None
pandas_gbq : None
pyarrow : 3.0.0
pyxlsb : None
s3fs : None
scipy : 1.6.3
sqlalchemy : None
tables : None
tabulate : 0.8.9
xarray : None
xlrd : None
xlwt : None
numba : None