Skip to content

Cannot merge on equal categorical dtypes #22501

Closed
@kmax12

Description

@kmax12

Code Sample, a copy-pastable example if possible

import pandas as pd
from pandas.api.types import CategoricalDtype

cat_type1 = CategoricalDtype(categories=["a", "b", "c"], ordered=False)
cat_type2 = CategoricalDtype(categories=["c", "b", "a"], ordered=False)

print('Check dtypes are equivalent:', cat_type1 == cat_type2)

# Test Data
df1 = pd.DataFrame({
    'foo': pd.Series(["a", "b", "c"]).astype(cat_type1),
    'left': [1, 2, 3],
}).set_index("foo")

df2 = pd.DataFrame({
    'foo': pd.Series(["c", "b", "a"]).astype(cat_type1),
    'right': [3, 2, 1],
}).set_index("foo")

df3 = pd.DataFrame({
    'foo': pd.Series(["a", "b", "c"]).astype(cat_type2),
    'right': [1, 2, 3],
}).set_index("foo")

df4 = pd.DataFrame({
    'foo': pd.Series(["c", "b", "a"]).astype(cat_type2),
    'right': [3, 2, 1],
}).set_index("foo")

# this merge succeeds
df1.merge(df2, left_index=True, right_index=True)

# this merge also succeeds
df1.merge(df3, left_index=True, right_index=True)

# this merge fails
df1.merge(df4, left_index=True, right_index=True)

this is the stacktrace

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-1-8540ed93314b> in <module>()
     35 
     36 # this merge fails
---> 37 df1.merge(df4, left_index=True, right_index=True)

/opt/conda/lib/python3.6/site-packages/pandas/core/frame.py in merge(self, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, copy, indicator, validate)
   6387                      right_on=right_on, left_index=left_index,
   6388                      right_index=right_index, sort=sort, suffixes=suffixes,
-> 6389                      copy=copy, indicator=indicator, validate=validate)
   6390 
   6391     def round(self, decimals=0, *args, **kwargs):

/opt/conda/lib/python3.6/site-packages/pandas/core/reshape/merge.py in merge(left, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, copy, indicator, validate)
     60                          copy=copy, indicator=indicator,
     61                          validate=validate)
---> 62     return op.get_result()
     63 
     64 

/opt/conda/lib/python3.6/site-packages/pandas/core/reshape/merge.py in get_result(self)
    566                 self.left, self.right)
    567 
--> 568         join_index, left_indexer, right_indexer = self._get_join_info()
    569 
    570         ldata, rdata = self.left._data, self.right._data

/opt/conda/lib/python3.6/site-packages/pandas/core/reshape/merge.py in _get_join_info(self)
    763             join_index, left_indexer, right_indexer = \
    764                 left_ax.join(right_ax, how=self.how, return_indexers=True,
--> 765                              sort=self.sort)
    766         elif self.right_index and self.how == 'left':
    767             join_index, left_indexer, right_indexer = \

/opt/conda/lib/python3.6/site-packages/pandas/core/indexes/base.py in join(self, other, how, level, return_indexers, sort)
   3787             try:
   3788                 return self._join_monotonic(other, how=how,
-> 3789                                             return_indexers=return_indexers)
   3790             except TypeError:
   3791                 pass

/opt/conda/lib/python3.6/site-packages/pandas/core/indexes/base.py in _join_monotonic(self, other, how, return_indexers)
   4033                 ridx = None
   4034             elif how == 'inner':
-> 4035                 join_index, lidx, ridx = self._inner_indexer(sv, ov)
   4036                 join_index = self._wrap_joined_index(join_index, other)
   4037             elif how == 'outer':

pandas/_libs/join_helper.pxi in pandas._libs.join.inner_join_indexer_object()

ValueError: Buffer dtype mismatch, expected 'Python object' but got 'signed char'

Problem description

Merging on two equal unordered categorical dtypes fails depending on the order of records in the dataframe.

One work around I found to make it work is to case index to object and then back to categorical

df4.index = df4.index.astype(object).astype(cat_type1)
df1.merge(df4, left_on="foo", right_on="foo")

Expected Output

     left  right
foo             
a       1      1
b       2      2
c       3      3

Output of pd.show_versions()

INSTALLED VERSIONS

commit: None
python: 2.7.12.final.0
python-bits: 64
OS: Darwin
OS-release: 17.5.0
machine: x86_64
processor: i386
byteorder: little
LC_ALL: None
LANG: en_US.UTF-8
LOCALE: None.None

pandas: 0.23.4
pytest: 3.5.1
pip: 18.0
setuptools: 40.0.0
Cython: None
numpy: 1.15.0
scipy: 1.1.0
pyarrow: 0.10.0
xarray: None
IPython: 5.8.0
sphinx: 1.6.4
patsy: None
dateutil: 2.7.3
pytz: 2018.5
blosc: None
bottleneck: None
tables: None
numexpr: None
feather: None
matplotlib: 2.1.1
openpyxl: None
xlrd: None
xlwt: None
xlsxwriter: None
lxml: None
bs4: None
html5lib: 1.0.1
sqlalchemy: None
pymysql: None
psycopg2: None
jinja2: 2.10
s3fs: 0.1.5
fastparquet: None
pandas_gbq: None
pandas_datareader: None

Metadata

Metadata

Assignees

No one assigned

    Labels

    CategoricalCategorical Data TypeNeeds TestsUnit test(s) needed to prevent regressionsReshapingConcat, Merge/Join, Stack/Unstack, Explodegood first issue

    Type

    No type

    Projects

    No projects

    Milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions