Description
Code Sample
import pandas as pd
# Case 1: index label 2 is duplicated and the Y values are already in sorted order (p, p, q).
df=pd.DataFrame([['x','p'],['x','p'],['x','q']], columns=['X','Y'], index=[1,2,2])
print(df)
# Identity function applied per group of Y; this call is reported to work.
df=df.groupby(['Y']).apply(lambda x: x)
# Case 2: same duplicated index, but the Y values are not in sorted order (p, p, o).
df=pd.DataFrame([['x','p'],['x','p'],['x','o']], columns=['X','Y'], index=[1,2,2])
print(df)
# Same call as above; on pandas 0.25.3 this is reported to raise
# "ValueError: cannot reindex from a duplicate axis".
df=df.groupby(['Y']).apply(lambda x: x)
Problem description
On this dataframe:
X Y
1 x p
2 x p
2 x q
`groupby(['Y']).apply(lambda x: x)` on column Y works,
but it raises `ValueError: cannot reindex from a duplicate axis` with this dataframe:
X Y
1 x p
2 x p
2 x o
The exception does not occur if the dataframe is sorted on Y first.
Traceback (most recent call last):
File "/home/hm106930/.local/share/virtualenvs/ipv-work-62kMEXht/lib/python3.6/site-packages/pandas/core/groupby/groupby.py", line 725, in apply
result = self._python_apply_general(f)
File "/home/hm106930/.local/share/virtualenvs/ipv-work-62kMEXht/lib/python3.6/site-packages/pandas/core/groupby/groupby.py", line 745, in _python_apply_general
keys, values, not_indexed_same=mutated or self.mutated
File "/home/hm106930/.local/share/virtualenvs/ipv-work-62kMEXht/lib/python3.6/site-packages/pandas/core/groupby/generic.py", line 372, in _wrap_applied_output
return self._concat_objects(keys, values, not_indexed_same=not_indexed_same)
File "/home/hm106930/.local/share/virtualenvs/ipv-work-62kMEXht/lib/python3.6/site-packages/pandas/core/groupby/groupby.py", line 955, in _concat_objects
result = result.reindex(ax, axis=self.axis)
File "/home/hm106930/.local/share/virtualenvs/ipv-work-62kMEXht/lib/python3.6/site-packages/pandas/util/_decorators.py", line 221, in wrapper
return func(*args, **kwargs)
File "/home/hm106930/.local/share/virtualenvs/ipv-work-62kMEXht/lib/python3.6/site-packages/pandas/core/frame.py", line 3976, in reindex
return super().reindex(**kwargs)
File "/home/hm106930/.local/share/virtualenvs/ipv-work-62kMEXht/lib/python3.6/site-packages/pandas/core/generic.py", line 4514, in reindex
axes, level, limit, tolerance, method, fill_value, copy
File "/home/hm106930/.local/share/virtualenvs/ipv-work-62kMEXht/lib/python3.6/site-packages/pandas/core/frame.py", line 3864, in _reindex_axes
index, method, copy, level, fill_value, limit, tolerance
File "/home/hm106930/.local/share/virtualenvs/ipv-work-62kMEXht/lib/python3.6/site-packages/pandas/core/frame.py", line 3886, in _reindex_index
allow_dups=False,
File "/home/hm106930/.local/share/virtualenvs/ipv-work-62kMEXht/lib/python3.6/site-packages/pandas/core/generic.py", line 4577, in _reindex_with_indexers
copy=copy,
File "/home/hm106930/.local/share/virtualenvs/ipv-work-62kMEXht/lib/python3.6/site-packages/pandas/core/internals/managers.py", line 1251, in reindex_indexer
self.axes[axis]._can_reindex(indexer)
File "/home/hm106930/.local/share/virtualenvs/ipv-work-62kMEXht/lib/python3.6/site-packages/pandas/core/indexes/base.py", line 3362, in _can_reindex
raise ValueError("cannot reindex from a duplicate axis")
ValueError: cannot reindex from a duplicate axis
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/hm106930/ipv/web/groupby_bug.py", line 16, in <module>
df=df.groupby(['Y']).apply(lambda x: x)
File "/home/hm106930/.local/share/virtualenvs/ipv-work-62kMEXht/lib/python3.6/site-packages/pandas/core/groupby/groupby.py", line 737, in apply
return self._python_apply_general(f)
File "/home/hm106930/.local/share/virtualenvs/ipv-work-62kMEXht/lib/python3.6/site-packages/pandas/core/groupby/groupby.py", line 745, in _python_apply_general
keys, values, not_indexed_same=mutated or self.mutated
File "/home/hm106930/.local/share/virtualenvs/ipv-work-62kMEXht/lib/python3.6/site-packages/pandas/core/groupby/generic.py", line 372, in _wrap_applied_output
return self._concat_objects(keys, values, not_indexed_same=not_indexed_same)
File "/home/hm106930/.local/share/virtualenvs/ipv-work-62kMEXht/lib/python3.6/site-packages/pandas/core/groupby/groupby.py", line 955, in _concat_objects
result = result.reindex(ax, axis=self.axis)
File "/home/hm106930/.local/share/virtualenvs/ipv-work-62kMEXht/lib/python3.6/site-packages/pandas/util/_decorators.py", line 221, in wrapper
return func(*args, **kwargs)
File "/home/hm106930/.local/share/virtualenvs/ipv-work-62kMEXht/lib/python3.6/site-packages/pandas/core/frame.py", line 3976, in reindex
return super().reindex(**kwargs)
File "/home/hm106930/.local/share/virtualenvs/ipv-work-62kMEXht/lib/python3.6/site-packages/pandas/core/generic.py", line 4514, in reindex
axes, level, limit, tolerance, method, fill_value, copy
File "/home/hm106930/.local/share/virtualenvs/ipv-work-62kMEXht/lib/python3.6/site-packages/pandas/core/frame.py", line 3864, in _reindex_axes
index, method, copy, level, fill_value, limit, tolerance
File "/home/hm106930/.local/share/virtualenvs/ipv-work-62kMEXht/lib/python3.6/site-packages/pandas/core/frame.py", line 3886, in _reindex_index
allow_dups=False,
File "/home/hm106930/.local/share/virtualenvs/ipv-work-62kMEXht/lib/python3.6/site-packages/pandas/core/generic.py", line 4577, in _reindex_with_indexers
copy=copy,
File "/home/hm106930/.local/share/virtualenvs/ipv-work-62kMEXht/lib/python3.6/site-packages/pandas/core/internals/managers.py", line 1251, in reindex_indexer
self.axes[axis]._can_reindex(indexer)
File "/home/hm106930/.local/share/virtualenvs/ipv-work-62kMEXht/lib/python3.6/site-packages/pandas/core/indexes/base.py", line 3362, in _can_reindex
raise ValueError("cannot reindex from a duplicate axis")
ValueError: cannot reindex from a duplicate axis
Expected Output
X Y
1 x p
2 x p
2 x o
Output of pd.show_versions()
INSTALLED VERSIONS
commit : None
python : 3.6.3.final.0
python-bits : 64
OS : Linux
OS-release : 3.10.0-957.27.2.el7.x86_64
machine : x86_64
processor : x86_64
byteorder : little
LC_ALL : None
LANG : en_US.UTF-8
LOCALE : en_US.UTF-8
pandas : 0.25.3
numpy : 1.16.4
pytz : 2019.1
dateutil : 2.8.0
pip : 19.3.1
setuptools : 41.0.1
Cython : 0.29.13
pytest : None
hypothesis : None
sphinx : None
blosc : None
feather : None
xlsxwriter : None
lxml.etree : 4.3.3
html5lib : 1.0.1
pymysql : None
psycopg2 : 2.8.3 (dt dec pq3 ext lo64)
jinja2 : 2.10.1
IPython : 7.5.0
pandas_datareader: None
bs4 : 4.8.0
bottleneck : None
fastparquet : None
gcsfs : None
lxml.etree : 4.3.3
matplotlib : 3.1.1
numexpr : None
odfpy : None
openpyxl : None
pandas_gbq : None
pyarrow : 0.13.0
pytables : None
s3fs : None
scipy : None
sqlalchemy : 1.3.7
tables : None
xarray : None
xlrd : 1.2.0
xlwt : None
xlsxwriter : None