Skip to content

Commit be0b61c

Browse files
authored
Restrict Pandas merge suffixes param type to list/tuple to avoid interchange in right and left suffix order (#34208)
1 parent 9012e1e commit be0b61c

File tree

5 files changed

+41
-24
lines changed

5 files changed

+41
-24
lines changed

doc/source/user_guide/merging.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1273,7 +1273,7 @@ columns:
12731273
12741274
.. ipython:: python
12751275
1276-
result = pd.merge(left, right, on='k', suffixes=['_l', '_r'])
1276+
result = pd.merge(left, right, on='k', suffixes=('_l', '_r'))
12771277
12781278
.. ipython:: python
12791279
:suppress:

doc/source/whatsnew/v1.1.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -401,6 +401,7 @@ Backwards incompatible API changes
401401
- :func: `pandas.api.dtypes.is_string_dtype` no longer incorrectly identifies categorical series as string.
402402
- :func:`read_excel` no longer takes ``**kwds`` arguments. This means that passing in keyword ``chunksize`` now raises a ``TypeError``
403403
(previously raised a ``NotImplementedError``), while passing in keyword ``encoding`` now raises a ``TypeError`` (:issue:`34464`)
404+
- :func: `merge` now checks ``suffixes`` parameter type to be ``tuple`` and raises ``TypeError``, whereas before a ``list`` or ``set`` were accepted and that the ``set`` could produce unexpected results (:issue:`33740`)
404405

405406
``MultiIndex.get_indexer`` interprets `method` argument differently
406407
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

pandas/core/reshape/merge.py

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -667,10 +667,8 @@ def get_result(self):
667667

668668
join_index, left_indexer, right_indexer = self._get_join_info()
669669

670-
lsuf, rsuf = self.suffixes
671-
672670
llabels, rlabels = _items_overlap_with_suffix(
673-
self.left._info_axis, lsuf, self.right._info_axis, rsuf
671+
self.left._info_axis, self.right._info_axis, self.suffixes
674672
)
675673

676674
lindexers = {1: left_indexer} if left_indexer is not None else {}
@@ -1484,10 +1482,8 @@ def __init__(
14841482
def get_result(self):
14851483
join_index, left_indexer, right_indexer = self._get_join_info()
14861484

1487-
lsuf, rsuf = self.suffixes
1488-
14891485
llabels, rlabels = _items_overlap_with_suffix(
1490-
self.left._info_axis, lsuf, self.right._info_axis, rsuf
1486+
self.left._info_axis, self.right._info_axis, self.suffixes
14911487
)
14921488

14931489
if self.fill_method == "ffill":
@@ -2067,17 +2063,26 @@ def _validate_operand(obj: FrameOrSeries) -> "DataFrame":
20672063
)
20682064

20692065

2070-
def _items_overlap_with_suffix(left: Index, lsuffix, right: Index, rsuffix):
2066+
def _items_overlap_with_suffix(left: Index, right: Index, suffixes: Tuple[str, str]):
20712067
"""
2068+
Suffixes type validation.
2069+
20722070
If two indices overlap, add suffixes to overlapping entries.
20732071
20742072
If corresponding suffix is empty, the entry is simply converted to string.
20752073
20762074
"""
2075+
if not isinstance(suffixes, tuple):
2076+
raise TypeError(
2077+
f"suffixes should be tuple of (str, str). But got {type(suffixes).__name__}"
2078+
)
2079+
20772080
to_rename = left.intersection(right)
20782081
if len(to_rename) == 0:
20792082
return left, right
20802083

2084+
lsuffix, rsuffix = suffixes
2085+
20812086
if not lsuffix and not rsuffix:
20822087
raise ValueError(f"columns overlap but no suffix specified: {to_rename}")
20832088

pandas/tests/reshape/merge/test_join.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,7 @@ def test_inner_join(self):
162162
_check_join(self.df, self.df2, joined_both, ["key1", "key2"], how="inner")
163163

164164
def test_handle_overlap(self):
165-
joined = merge(self.df, self.df2, on="key2", suffixes=[".foo", ".bar"])
165+
joined = merge(self.df, self.df2, on="key2", suffixes=(".foo", ".bar"))
166166

167167
assert "key1.foo" in joined
168168
assert "key1.bar" in joined
@@ -173,7 +173,7 @@ def test_handle_overlap_arbitrary_key(self):
173173
self.df2,
174174
left_on="key2",
175175
right_on="key1",
176-
suffixes=[".foo", ".bar"],
176+
suffixes=(".foo", ".bar"),
177177
)
178178
assert "key1.foo" in joined
179179
assert "key2.bar" in joined

pandas/tests/reshape/merge/test_merge.py

Lines changed: 25 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2004,8 +2004,8 @@ def test_merge_series(on, left_on, right_on, left_index, right_index, nm):
20042004
("b", "b", dict(suffixes=(None, "_y")), ["b", "b_y"]),
20052005
("a", "a", dict(suffixes=("_x", None)), ["a_x", "a"]),
20062006
("a", "b", dict(suffixes=("_x", None)), ["a", "b"]),
2007-
("a", "a", dict(suffixes=[None, "_x"]), ["a", "a_x"]),
2008-
(0, 0, dict(suffixes=["_a", None]), ["0_a", 0]),
2007+
("a", "a", dict(suffixes=(None, "_x")), ["a", "a_x"]),
2008+
(0, 0, dict(suffixes=("_a", None)), ["0_a", 0]),
20092009
("a", "a", dict(), ["a_x", "a_y"]),
20102010
(0, 0, dict(), ["0_x", "0_y"]),
20112011
],
@@ -2056,13 +2056,7 @@ def test_merge_duplicate_suffix(how, expected):
20562056

20572057
@pytest.mark.parametrize(
20582058
"col1, col2, suffixes",
2059-
[
2060-
("a", "a", [None, None]),
2061-
("a", "a", (None, None)),
2062-
("a", "a", ("", None)),
2063-
(0, 0, [None, None]),
2064-
(0, 0, (None, "")),
2065-
],
2059+
[("a", "a", (None, None)), ("a", "a", ("", None)), (0, 0, (None, ""))],
20662060
)
20672061
def test_merge_suffix_error(col1, col2, suffixes):
20682062
# issue: 24782
@@ -2075,18 +2069,35 @@ def test_merge_suffix_error(col1, col2, suffixes):
20752069
pd.merge(a, b, left_index=True, right_index=True, suffixes=suffixes)
20762070

20772071

2078-
@pytest.mark.parametrize("col1, col2, suffixes", [("a", "a", None), (0, 0, None)])
2079-
def test_merge_suffix_none_error(col1, col2, suffixes):
2080-
# issue: 24782
2072+
@pytest.mark.parametrize(
2073+
"col1, col2, suffixes", [("a", "a", {"a", "b"}), ("a", "a", None), (0, 0, None)],
2074+
)
2075+
def test_merge_suffix_type_error(col1, col2, suffixes):
20812076
a = pd.DataFrame({col1: [1, 2, 3]})
20822077
b = pd.DataFrame({col2: [3, 4, 5]})
20832078

2084-
# TODO: might reconsider current raise behaviour, see GH24782
2085-
msg = "iterable"
2079+
msg = (
2080+
f"suffixes should be tuple of \\(str, str\\). But got {type(suffixes).__name__}"
2081+
)
20862082
with pytest.raises(TypeError, match=msg):
20872083
pd.merge(a, b, left_index=True, right_index=True, suffixes=suffixes)
20882084

20892085

2086+
@pytest.mark.parametrize(
2087+
"col1, col2, suffixes, msg",
2088+
[
2089+
("a", "a", ("a", "b", "c"), r"too many values to unpack \(expected 2\)"),
2090+
("a", "a", tuple("a"), r"not enough values to unpack \(expected 2, got 1\)"),
2091+
],
2092+
)
2093+
def test_merge_suffix_length_error(col1, col2, suffixes, msg):
2094+
a = pd.DataFrame({col1: [1, 2, 3]})
2095+
b = pd.DataFrame({col2: [3, 4, 5]})
2096+
2097+
with pytest.raises(ValueError, match=msg):
2098+
pd.merge(a, b, left_index=True, right_index=True, suffixes=suffixes)
2099+
2100+
20902101
@pytest.mark.parametrize("cat_dtype", ["one", "two"])
20912102
@pytest.mark.parametrize("reverse", [True, False])
20922103
def test_merge_equal_cat_dtypes(cat_dtype, reverse):

0 commit comments

Comments
 (0)