diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 0c4fb6d3d1164..4f0660886b3c6 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -671,6 +671,7 @@ Reshaping - Bug in :func:`merge_asof` raising ``TypeError`` when ``by`` dtype is not ``object``, ``int64``, or ``uint64`` (:issue:`22794`) - Bug in :func:`merge_asof` raising incorrect error for string dtype (:issue:`56444`) - Bug in :func:`merge_asof` when using a :class:`Timedelta` tolerance on a :class:`ArrowDtype` column (:issue:`56486`) +- Bug in :func:`merge` not raising when merging string columns with numeric columns (:issue:`56441`) - Bug in :func:`merge` returning columns in incorrect order when left and/or right is empty (:issue:`51929`) - Bug in :meth:`DataFrame.melt` where an exception was raised if ``var_name`` was not a string (:issue:`55948`) - Bug in :meth:`DataFrame.melt` where it would not preserve the datetime (:issue:`55254`) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index a0de9319194cd..32c1720ae5517 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1354,8 +1354,12 @@ def _maybe_coerce_merge_keys(self) -> None: lk_is_cat = isinstance(lk.dtype, CategoricalDtype) rk_is_cat = isinstance(rk.dtype, CategoricalDtype) - lk_is_object = is_object_dtype(lk.dtype) - rk_is_object = is_object_dtype(rk.dtype) + lk_is_object_or_string = is_object_dtype(lk.dtype) or is_string_dtype( + lk.dtype + ) + rk_is_object_or_string = is_object_dtype(rk.dtype) or is_string_dtype( + rk.dtype + ) # if either left or right is a categorical # then the must match exactly in categories & ordered @@ -1452,14 +1456,14 @@ def _maybe_coerce_merge_keys(self) -> None: # incompatible dtypes GH 9780, GH 15800 # bool values are coerced to object - elif (lk_is_object and is_bool_dtype(rk.dtype)) or ( - is_bool_dtype(lk.dtype) and rk_is_object + elif (lk_is_object_or_string and is_bool_dtype(rk.dtype)) or ( + is_bool_dtype(lk.dtype) and rk_is_object_or_string ): pass # object values are allowed to be merged - elif (lk_is_object and is_numeric_dtype(rk.dtype)) or ( - is_numeric_dtype(lk.dtype) and rk_is_object + elif (lk_is_object_or_string and is_numeric_dtype(rk.dtype)) or ( + is_numeric_dtype(lk.dtype) and rk_is_object_or_string ): inferred_left = lib.infer_dtype(lk, skipna=False) inferred_right = lib.infer_dtype(rk, skipna=False) @@ -1498,7 +1502,7 @@ def _maybe_coerce_merge_keys(self) -> None: # allows datetime with different resolutions continue - elif lk_is_object and rk_is_object: + elif is_object_dtype(lk.dtype) and is_object_dtype(rk.dtype): continue # Houston, we have a problem! diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 4cc887c32b585..2eca3556de954 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -3,6 +3,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( Categorical, @@ -118,7 +120,10 @@ def test_handle_overlap_arbitrary_key(self, df, df2): assert "key1.foo" in joined assert "key2.bar" in joined - def test_join_on(self, target_source): + @pytest.mark.parametrize( + "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))] + ) + def test_join_on(self, target_source, infer_string): target, source = target_source merged = target.join(source, on="C") @@ -150,8 +155,8 @@ def test_join_on(self, target_source): # overlap source_copy = source.copy() msg = ( - "You are trying to merge on float64 and object columns for key 'A'. " - "If you wish to proceed you should use pd.concat" + "You are trying to merge on float64 and object|string columns for key " + "'A'. If you wish to proceed you should use pd.concat" ) with pytest.raises(ValueError, match=msg): target.join(source_copy, on="A")