Description
Code Sample, a copy-pastable example if possible
import pandas as pd
from pandas.api.types import CategoricalDtype
cat_type1 = CategoricalDtype(categories=["a", "b", "c"], ordered=False)
cat_type2 = CategoricalDtype(categories=["c", "b", "a"], ordered=False)
print('Check dtypes are equivalent:', cat_type1 == cat_type2)
# Test Data
df1 = pd.DataFrame({
'foo': pd.Series(["a", "b", "c"]).astype(cat_type1),
'left': [1, 2, 3],
}).set_index("foo")
df2 = pd.DataFrame({
'foo': pd.Series(["c", "b", "a"]).astype(cat_type1),
'right': [3, 2, 1],
}).set_index("foo")
df3 = pd.DataFrame({
'foo': pd.Series(["a", "b", "c"]).astype(cat_type2),
'right': [1, 2, 3],
}).set_index("foo")
df4 = pd.DataFrame({
'foo': pd.Series(["c", "b", "a"]).astype(cat_type2),
'right': [3, 2, 1],
}).set_index("foo")
# this merge succeeds
df1.merge(df2, left_index=True, right_index=True)
# this merge also succeeds
df1.merge(df3, left_index=True, right_index=True)
# this merge fails
df1.merge(df4, left_index=True, right_index=True)
this is the stacktrace
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-1-8540ed93314b> in <module>()
35
36 # this merge fails
---> 37 df1.merge(df4, left_index=True, right_index=True)
/opt/conda/lib/python3.6/site-packages/pandas/core/frame.py in merge(self, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, copy, indicator, validate)
6387 right_on=right_on, left_index=left_index,
6388 right_index=right_index, sort=sort, suffixes=suffixes,
-> 6389 copy=copy, indicator=indicator, validate=validate)
6390
6391 def round(self, decimals=0, *args, **kwargs):
/opt/conda/lib/python3.6/site-packages/pandas/core/reshape/merge.py in merge(left, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, copy, indicator, validate)
60 copy=copy, indicator=indicator,
61 validate=validate)
---> 62 return op.get_result()
63
64
/opt/conda/lib/python3.6/site-packages/pandas/core/reshape/merge.py in get_result(self)
566 self.left, self.right)
567
--> 568 join_index, left_indexer, right_indexer = self._get_join_info()
569
570 ldata, rdata = self.left._data, self.right._data
/opt/conda/lib/python3.6/site-packages/pandas/core/reshape/merge.py in _get_join_info(self)
763 join_index, left_indexer, right_indexer = \
764 left_ax.join(right_ax, how=self.how, return_indexers=True,
--> 765 sort=self.sort)
766 elif self.right_index and self.how == 'left':
767 join_index, left_indexer, right_indexer = \
/opt/conda/lib/python3.6/site-packages/pandas/core/indexes/base.py in join(self, other, how, level, return_indexers, sort)
3787 try:
3788 return self._join_monotonic(other, how=how,
-> 3789 return_indexers=return_indexers)
3790 except TypeError:
3791 pass
/opt/conda/lib/python3.6/site-packages/pandas/core/indexes/base.py in _join_monotonic(self, other, how, return_indexers)
4033 ridx = None
4034 elif how == 'inner':
-> 4035 join_index, lidx, ridx = self._inner_indexer(sv, ov)
4036 join_index = self._wrap_joined_index(join_index, other)
4037 elif how == 'outer':
pandas/_libs/join_helper.pxi in pandas._libs.join.inner_join_indexer_object()
ValueError: Buffer dtype mismatch, expected 'Python object' but got 'signed char'
Problem description
Merging on two equal unordered categorical dtypes fails depending on the order of records in the dataframe.
One work around I found to make it work is to case index to object and then back to categorical
df4.index = df4.index.astype(object).astype(cat_type1)
df1.merge(df4, left_on="foo", right_on="foo")
Expected Output
left right
foo
a 1 1
b 2 2
c 3 3
Output of pd.show_versions()
INSTALLED VERSIONS
commit: None
python: 2.7.12.final.0
python-bits: 64
OS: Darwin
OS-release: 17.5.0
machine: x86_64
processor: i386
byteorder: little
LC_ALL: None
LANG: en_US.UTF-8
LOCALE: None.None
pandas: 0.23.4
pytest: 3.5.1
pip: 18.0
setuptools: 40.0.0
Cython: None
numpy: 1.15.0
scipy: 1.1.0
pyarrow: 0.10.0
xarray: None
IPython: 5.8.0
sphinx: 1.6.4
patsy: None
dateutil: 2.7.3
pytz: 2018.5
blosc: None
bottleneck: None
tables: None
numexpr: None
feather: None
matplotlib: 2.1.1
openpyxl: None
xlrd: None
xlwt: None
xlsxwriter: None
lxml: None
bs4: None
html5lib: 1.0.1
sqlalchemy: None
pymysql: None
psycopg2: None
jinja2: 2.10
s3fs: 0.1.5
fastparquet: None
pandas_gbq: None
pandas_datareader: None