From f81c4ee66522d69739b51fc4d32265f56606f4b5 Mon Sep 17 00:00:00 2001 From: Nico Cernek Date: Fri, 2 Aug 2019 14:12:31 -0600 Subject: [PATCH 01/11] add failing test to check row order preservation correct the imports broken commit with a bunch of print statements and comments add test for left merge swap left and right keys when how == "right" correct old test: right-merge row order is now the same as the right df clean up spacing and delete temp code add whatsnew replace .from_records with default constructor add GH issue # to tests revert commit ed54bec7e change logic to swap left and right if how==right clean formatting rename vars and add comment for clarity combine tests into one update whatsnew Update doc/source/whatsnew/v1.0.0.rst Co-Authored-By: William Ayd add before and after examples linting cleanup changes requested by jreback update docs --- doc/source/whatsnew/v1.0.0.rst | 5 ++ pandas/core/reshape/merge.py | 20 +++++-- pandas/tests/reshape/merge/test_merge.py | 76 +++++++++++++++++++++--- 3 files changed, 87 insertions(+), 14 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 6597b764581a4..fa81edb4a3448 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -1244,8 +1244,13 @@ Reshaping - Bug in :func:`melt` where supplying mixed strings and numeric values for ``id_vars`` or ``value_vars`` would incorrectly raise a ``ValueError`` (:issue:`29718`) - Dtypes are now preserved when transposing a ``DataFrame`` where each column is the same extension dtype (:issue:`30091`) - Bug in :func:`merge_asof` merging on a tz-aware ``left_index`` and ``right_on`` a tz-aware column (:issue:`29864`) +<<<<<<< HEAD - Improved error message and docstring in :func:`cut` and :func:`qcut` when `labels=True` (:issue:`13318`) - Bug in missing `fill_na` parameter to :meth:`DataFrame.unstack` with list of levels (:issue:`30740`) +- :meth:`DataFrame.merge` now preserves right frame's row order when 
executing a right merge (:issue:`27453`) +======= +>>>>>>> 2b1b67592... changes requested by jreback +- Sparse ^^^^^^ diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 49ac1b6cfa52b..898fa77a889ee 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -567,10 +567,10 @@ def __init__( indicator: bool = False, validate=None, ): - _left = _validate_operand(left) - _right = _validate_operand(right) - self.left = self.orig_left = _left - self.right = self.orig_right = _right + left = validate_operand(left) + right = validate_operand(right) + self.left = self.orig_left = left + self.right = self.orig_right = right self.how = how self.axis = axis @@ -1292,6 +1292,9 @@ def _get_join_indexers( right_keys ), "left_key and right_keys must be the same length" + # bind `sort` arg. of _factorize_keys + fkeys = partial(_factorize_keys, sort=sort) + # get left & right join labels and num. of levels at each location mapped = ( _factorize_keys(left_keys[n], right_keys[n], sort=sort) @@ -1306,15 +1309,20 @@ def _get_join_indexers( # factorize keys to a dense i8 space # `count` is the num. 
of unique keys # set(lkey) | set(rkey) == range(count) - lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort) + # flip left and right keys if performing a right merge + # to preserve right merge row order (GH 27453) + if how == "right": + factorized_rkey, factorized_lkey, count = fkeys(rkey, lkey) + else: + factorized_lkey, factorized_rkey, count = fkeys(lkey, rkey) # preserve left frame order if how == 'left' and sort == False kwargs = copy.copy(kwargs) if how == "left": kwargs["sort"] = sort join_func = _join_functions[how] - return join_func(lkey, rkey, count, **kwargs) + return join_func(factorized_lkey, factorized_rkey, count, **kwargs) def _restore_dropped_levels_multijoin( diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index fd189c7435b29..53f18479d1729 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1286,17 +1286,17 @@ def test_merge_on_index_with_more_values(self, how, index, expected_index): # GH 24212 # pd.merge gets [0, 1, 2, -1, -1, -1] as left_indexer, ensure that # -1 is interpreted as a missing value instead of the last element - df1 = pd.DataFrame({"a": [1, 2, 3], "key": [0, 2, 2]}, index=index) - df2 = pd.DataFrame({"b": [1, 2, 3, 4, 5]}) + df1 = pd.DataFrame({"a": [0, 1, 2], "key": [0, 1, 2]}, index=index) + df2 = pd.DataFrame({"b": [0, 1, 2, 3, 4, 5]}) result = df1.merge(df2, left_on="key", right_index=True, how=how) expected = pd.DataFrame( [ - [1.0, 0, 1], - [2.0, 2, 3], - [3.0, 2, 3], - [np.nan, 1, 2], - [np.nan, 3, 4], - [np.nan, 4, 5], + [0, 0, 0], + [1, 1, 1], + [2, 2, 2], + [np.nan, 3, 3], + [np.nan, 4, 4], + [np.nan, 5, 5], ], columns=["a", "key", "b"], ) @@ -2167,3 +2167,63 @@ def test_merge_datetime_upcast_dtype(): } ) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("how", ["left", "right"]) +def test_merge_preserves_row_order(how): + # GH 27453 + population = [ + ("Jenn", "Jamaica", 3), + ("Beth", 
"Bulgaria", 7), + ("Carl", "Canada", 30), + ] + columns = ["name", "country", "population"] + population_df = DataFrame(population, columns=columns) + + people = [("Abe", "America"), ("Beth", "Bulgaria"), ("Carl", "Canada")] + columns = ["name", "country"] + people_df = DataFrame(people, columns=columns) + + expected_data = [ + ("Abe", "America", np.nan), + ("Beth", "Bulgaria", 7), + ("Carl", "Canada", 30), + ] + expected_cols = ["name", "country", "population"] + expected = DataFrame(expected_data, columns=expected_cols) + + result = pop.merge(ppl, on=("name", "country"), how="right") + + tm.assert_frame_equal(result, expected) + + +def test_left_merge_preserves_row_order(): + # GH 27453 + population = [ + ("Jenn", "Jamaica", 3), + ("Beth", "Bulgaria", 7), + ("Carl", "Canada", 30), + ] + columns = ["name", "country", "population"] + pop = DataFrame(population, columns=columns) + + people = [("Abe", "America"), ("Beth", "Bulgaria"), ("Carl", "Canada")] + columns = ["name", "country"] + ppl = DataFrame(people, columns=columns) + + expected_data = [ + ("Abe", "America", np.nan), + ("Beth", "Bulgaria", 7), + ("Carl", "Canada", 30), + ] + expected_cols = ["name", "country", "population"] + expected = DataFrame(expected_data, columns=expected_cols) + + result = ppl.merge(pop, on=("name", "country"), how="left") + if how == "right": + left_df, right_df = population_df, people_df + elif how == "left": + left_df, right_df = people_df, population_df + + result = left_df.merge(right_df, on=("name", "country"), how=how) + tm.assert_frame_equal(expected, result) From 25f7e034b613b26ee5059e03e97600be2ff4fab6 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Fri, 24 Jan 2020 11:22:26 +0000 Subject: [PATCH 02/11] :twisted_rightwards_arrows: fix conflicts, fix nameerror in tests --- doc/source/whatsnew/v1.0.0.rst | 4 ---- pandas/core/reshape/merge.py | 8 +++---- pandas/tests/reshape/merge/test_merge.py | 28 ------------------------ 3 files changed, 4 insertions(+), 36 
deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index fa81edb4a3448..2a59a78c04aa2 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -1244,13 +1244,9 @@ Reshaping - Bug in :func:`melt` where supplying mixed strings and numeric values for ``id_vars`` or ``value_vars`` would incorrectly raise a ``ValueError`` (:issue:`29718`) - Dtypes are now preserved when transposing a ``DataFrame`` where each column is the same extension dtype (:issue:`30091`) - Bug in :func:`merge_asof` merging on a tz-aware ``left_index`` and ``right_on`` a tz-aware column (:issue:`29864`) -<<<<<<< HEAD - Improved error message and docstring in :func:`cut` and :func:`qcut` when `labels=True` (:issue:`13318`) - Bug in missing `fill_na` parameter to :meth:`DataFrame.unstack` with list of levels (:issue:`30740`) - :meth:`DataFrame.merge` now preserves right frame's row order when executing a right merge (:issue:`27453`) -======= ->>>>>>> 2b1b67592... 
changes requested by jreback -- Sparse ^^^^^^ diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 898fa77a889ee..46c591f423559 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -567,10 +567,10 @@ def __init__( indicator: bool = False, validate=None, ): - left = validate_operand(left) - right = validate_operand(right) - self.left = self.orig_left = left - self.right = self.orig_right = right + _left = _validate_operand(left) + _right = _validate_operand(right) + self.left = self.orig_left = _left + self.right = self.orig_right = _right self.how = how self.axis = axis diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 53f18479d1729..964277e8a1ac9 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2192,34 +2192,6 @@ def test_merge_preserves_row_order(how): expected_cols = ["name", "country", "population"] expected = DataFrame(expected_data, columns=expected_cols) - result = pop.merge(ppl, on=("name", "country"), how="right") - - tm.assert_frame_equal(result, expected) - - -def test_left_merge_preserves_row_order(): - # GH 27453 - population = [ - ("Jenn", "Jamaica", 3), - ("Beth", "Bulgaria", 7), - ("Carl", "Canada", 30), - ] - columns = ["name", "country", "population"] - pop = DataFrame(population, columns=columns) - - people = [("Abe", "America"), ("Beth", "Bulgaria"), ("Carl", "Canada")] - columns = ["name", "country"] - ppl = DataFrame(people, columns=columns) - - expected_data = [ - ("Abe", "America", np.nan), - ("Beth", "Bulgaria", 7), - ("Carl", "Canada", 30), - ] - expected_cols = ["name", "country", "population"] - expected = DataFrame(expected_data, columns=expected_cols) - - result = ppl.merge(pop, on=("name", "country"), how="left") if how == "right": left_df, right_df = population_df, people_df elif how == "left": From a53cc2216d38b9ab5999c06930c8c20452136609 Mon Sep 17 00:00:00 2001 From: 
Marco Gorelli Date: Thu, 30 Jan 2020 12:55:52 +0000 Subject: [PATCH 03/11] :pencil: add whatsnew entry to v1.0.1 --- doc/source/whatsnew/v1.0.0.rst | 1 - pandas/tests/reshape/merge/test_merge.py | 34 ++++++++++++------------ 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 2a59a78c04aa2..6597b764581a4 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -1246,7 +1246,6 @@ Reshaping - Bug in :func:`merge_asof` merging on a tz-aware ``left_index`` and ``right_on`` a tz-aware column (:issue:`29864`) - Improved error message and docstring in :func:`cut` and :func:`qcut` when `labels=True` (:issue:`13318`) - Bug in missing `fill_na` parameter to :meth:`DataFrame.unstack` with list of levels (:issue:`30740`) -- :meth:`DataFrame.merge` now preserves right frame's row order when executing a right merge (:issue:`27453`) Sparse ^^^^^^ diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 964277e8a1ac9..dbcdea1d454bb 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2172,25 +2172,25 @@ def test_merge_datetime_upcast_dtype(): @pytest.mark.parametrize("how", ["left", "right"]) def test_merge_preserves_row_order(how): # GH 27453 - population = [ - ("Jenn", "Jamaica", 3), - ("Beth", "Bulgaria", 7), - ("Carl", "Canada", 30), - ] - columns = ["name", "country", "population"] - population_df = DataFrame(population, columns=columns) + population_df = DataFrame( + { + "name": ["Jenn", "Beth", "Carl"], + "country": ["Jamaica", "Bulgaria", "Canada"], + "population": [3, 7, 30], + } + ) - people = [("Abe", "America"), ("Beth", "Bulgaria"), ("Carl", "Canada")] - columns = ["name", "country"] - people_df = DataFrame(people, columns=columns) + people_df = DataFrame( + {"name": ["Abe", "Beth", "Carl"], "country": ["America", "Bulgaria", "Canada"]} + ) - expected_data = [ - 
("Abe", "America", np.nan), - ("Beth", "Bulgaria", 7), - ("Carl", "Canada", 30), - ] - expected_cols = ["name", "country", "population"] - expected = DataFrame(expected_data, columns=expected_cols) + expected = DataFrame( + { + "name": ["Abe", "Beth", "Carl"], + "country": ["America", "Bulgaria", "Canada"], + "population": [np.nan, 7, 30], + } + ) if how == "right": left_df, right_df = population_df, people_df From 2d77a5c492c1e7712f7f5cc3199389bed2dc3fcf Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Thu, 13 Feb 2020 13:31:08 +0000 Subject: [PATCH 04/11] pass to _factorize_keys --- pandas/core/reshape/merge.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 46c591f423559..924cb06011a46 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1292,12 +1292,9 @@ def _get_join_indexers( right_keys ), "left_key and right_keys must be the same length" - # bind `sort` arg. of _factorize_keys - fkeys = partial(_factorize_keys, sort=sort) - # get left & right join labels and num. of levels at each location mapped = ( - _factorize_keys(left_keys[n], right_keys[n], sort=sort) + _factorize_keys(left_keys[n], right_keys[n], sort=sort, how=how) for n in range(len(left_keys)) ) zipped = zip(*mapped) @@ -1310,19 +1307,14 @@ def _get_join_indexers( # `count` is the num. 
of unique keys # set(lkey) | set(rkey) == range(count) - # flip left and right keys if performing a right merge - # to preserve right merge row order (GH 27453) - if how == "right": - factorized_rkey, factorized_lkey, count = fkeys(rkey, lkey) - else: - factorized_lkey, factorized_rkey, count = fkeys(lkey, rkey) + lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort, how=how) # preserve left frame order if how == 'left' and sort == False kwargs = copy.copy(kwargs) if how == "left": kwargs["sort"] = sort join_func = _join_functions[how] - return join_func(factorized_lkey, factorized_rkey, count, **kwargs) + return join_func(lkey, rkey, count, **kwargs) def _restore_dropped_levels_multijoin( @@ -1858,7 +1850,7 @@ def _right_outer_join(x, y, max_groups): } -def _factorize_keys(lk, rk, sort=True): +def _factorize_keys(lk, rk, sort=True, how="inner"): # Some pre-processing for non-ndarray lk / rk if is_datetime64tz_dtype(lk) and is_datetime64tz_dtype(rk): lk = getattr(lk, "_values", lk)._data @@ -1927,6 +1919,8 @@ def _factorize_keys(lk, rk, sort=True): np.putmask(rlab, rmask, count) count += 1 + if how == "right": + return rlab, llab, count return llab, rlab, count From bab654ecc96c371ef7783395cc7fc1dea466288f Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Thu, 13 Feb 2020 13:58:50 +0000 Subject: [PATCH 05/11] Add tests with merging on index, using original OP's example as test --- doc/source/whatsnew/v1.1.0.rst | 21 +++++++ pandas/tests/reshape/merge/test_merge.py | 70 +++++++++++++----------- 2 files changed, 59 insertions(+), 32 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 13827e8fc4c33..c039ff10422c5 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -67,6 +67,27 @@ Backwards incompatible API changes - :meth:`DataFrame.at` and :meth:`Series.at` will raise a ``TypeError`` instead of a ``ValueError`` if an incompatible key is passed, and ``KeyError`` if a missing key is 
passed, matching the behavior of ``.loc[]`` (:issue:`31722`) - +:meth:`DataFrame.merge` preserves right frame's row order +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +:meth:`DataFrame.merge` now preserves right frame's row order when executing a right merge (:issue:`27453`) + +.. ipython:: python + left_df = pd.DataFrame({'animal': ['dog', 'pig'], 'max_speed': [40, 11]}) + right_df = pd.DataFrame({'animal': ['quetzal', 'pig'], 'max_speed': [80, 11]}) + left_df + right_df +*pandas 1.0.x* + +.. code-block:: python + >>> left_df.merge(right_df, on=['animal', 'max_speed'], how="right") + animal max_speed + 0 pig 11 + 1 quetzal 80 +*pandas 1.1.0* + +.. ipython:: python + left_df.merge(right_df, on=['animal', 'max_speed'], how="right") + .. --------------------------------------------------------------------------- .. _whatsnew_110.deprecations: diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index dbcdea1d454bb..cc9c450a6c1de 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1318,6 +1318,44 @@ def test_merge_right_index_right(self): result = left.merge(right, left_on="key", right_index=True, how="right") tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("how", ["left", "right"]) + def test_merge_preserves_row_order(self, how): + # GH 27453 + a = [2, 5, 3, 5] + df1 = pd.DataFrame({"A": a, "B": [8, 2, 4, 1]}) + df2 = pd.DataFrame({"A": a, "B": [7, 1, 3, 0]}) + + result = df1.merge(df2[["A", "B"]], on=["A", "B"], how=how) + expected = pd.DataFrame({"A": a}) + if how == "right": + expected["B"] = df2["B"] + else: + expected["B"] = df1["B"] + tm.assert_frame_equal(result, expected) + + left_df = pd.DataFrame({"colors": ["blue", "red"]}, index=pd.Index([0, 1])) + right_df = pd.DataFrame({"hats": ["small", "big"]}, index=pd.Index([1, 0])) + result = left_df.merge(right_df, left_index=True, right_index=True, how=how) + if how == "right": + 
expected = pd.DataFrame( + {"colors": ["red", "blue"], "hats": ["small", "big"]} + ) + else: + expected = pd.DataFrame( + {"colors": ["blue", "red"], "hats": ["big", "small"]} + ) + + left_df = pd.DataFrame({"animal": ["dog", "pig"], "max_speed": [40, 11]}) + right_df = pd.DataFrame({"animal": ["quetzal", "pig"], "max_speed": [80, 11]}) + result = left_df.merge(right_df, on=["animal", "max_speed"], how=how) + if how == "right": + expected = pd.DataFrame( + {"animal": ["quetzal", "pig"], "max_speed": [80, 11]} + ) + else: + expected = pd.DataFrame({"animal": ["dog", "pig"], "max_speed": [40, 11]}) + tm.assert_frame_equal(result, expected) + def test_merge_take_missing_values_from_index_of_other_dtype(self): # GH 24212 left = pd.DataFrame( @@ -2167,35 +2205,3 @@ def test_merge_datetime_upcast_dtype(): } ) tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("how", ["left", "right"]) -def test_merge_preserves_row_order(how): - # GH 27453 - population_df = DataFrame( - { - "name": ["Jenn", "Beth", "Carl"], - "country": ["Jamaica", "Bulgaria", "Canada"], - "population": [3, 7, 30], - } - ) - - people_df = DataFrame( - {"name": ["Abe", "Beth", "Carl"], "country": ["America", "Bulgaria", "Canada"]} - ) - - expected = DataFrame( - { - "name": ["Abe", "Beth", "Carl"], - "country": ["America", "Bulgaria", "Canada"], - "population": [np.nan, 7, 30], - } - ) - - if how == "right": - left_df, right_df = population_df, people_df - elif how == "left": - left_df, right_df = people_df, population_df - - result = left_df.merge(right_df, on=("name", "country"), how=how) - tm.assert_frame_equal(expected, result) From 714f5b4c8aec32868d299dfca3fe665d4ab789b5 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Thu, 13 Feb 2020 14:30:31 +0000 Subject: [PATCH 06/11] fix whatsnew ipython directive --- doc/source/whatsnew/v1.1.0.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 
c039ff10422c5..2b3644955a494 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -72,20 +72,25 @@ Backwards incompatible API changes :meth:`DataFrame.merge` now preserves right frame's row order when executing a right merge (:issue:`27453`) .. ipython:: python + left_df = pd.DataFrame({'animal': ['dog', 'pig'], 'max_speed': [40, 11]}) right_df = pd.DataFrame({'animal': ['quetzal', 'pig'], 'max_speed': [80, 11]}) left_df right_df + *pandas 1.0.x* .. code-block:: python + >>> left_df.merge(right_df, on=['animal', 'max_speed'], how="right") animal max_speed 0 pig 11 1 quetzal 80 + *pandas 1.1.0* .. ipython:: python + left_df.merge(right_df, on=['animal', 'max_speed'], how="right") .. --------------------------------------------------------------------------- From c83f46f585dad11295f998725e4dbf6fa0f9f663 Mon Sep 17 00:00:00 2001 From: MarcoGorelli Date: Sun, 15 Mar 2020 15:03:44 +0000 Subject: [PATCH 07/11] Add docstring and types --- pandas/core/reshape/merge.py | 52 +++++++++++++++++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index a9099eb81ec87..9418a55be41ee 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1822,7 +1822,57 @@ def _right_outer_join(x, y, max_groups): return left_indexer, right_indexer -def _factorize_keys(lk, rk, sort=True, how="inner"): +def _factorize_keys(lk, rk, sort=True, how="inner") -> Tuple[np.array, np.array, int]: + """ + Encode left and right keys as enumerated types. + + This is used to get the join indexers to be used when merging DataFrames. + + Parameters + ---------- + lk : array-like + Left key. + rk : array-like + Right key. + sort : bool, defaults to True + If True, the encoding is done such that the unique elements in the + keys are sorted. + how : {‘left’, ‘right’, ‘outer’, ‘inner’}, default ‘inner’ + Type of merge. + + Returns + ------- + array + Left (resp. 
right if called with `how='right'`) labels, as enumerated type. + array + Right (resp. left if called with `how='right'`) labels, as enumerated type. + int + Number of unique elements in union of left and right labels. + + See Also + -------- + merge : Merge DataFrame or named Series objects + with a database-style join. + algorithms.factorize : Encode the object as an enumerated type + or categorical variable. + + Examples + -------- + >>> lk = np.array(["a", "c", "b"]) + >>> rk = np.array(["a", "c"]) + + Here, the unique values are `'a', 'b', 'c'`. With the default + `sort=True`, the encoding will be `{0: 'a', 1: 'b', 2: 'c'}`: + + >>> pd.core.reshape.merge._factorize_keys(lk, rk) + (array([0, 2, 1]), array([0, 2]), 3) + + With the `sort=False`, the encoding will correspond to the order + in which the unique elements first appear: `{0: 'a', 1: 'c', 2: 'b'}`: + + >>> pd.core.reshape.merge._factorize_keys(lk, rk, sort=False) + (array([0, 1, 2]), array([0, 1]), 3) + """ # Some pre-processing for non-ndarray lk / rk lk = extract_array(lk, extract_numpy=True) rk = extract_array(rk, extract_numpy=True) From 65c32261945d1ef2936588fef5b2ab8dbe915ce7 Mon Sep 17 00:00:00 2001 From: MarcoGorelli Date: Sun, 15 Mar 2020 15:19:31 +0000 Subject: [PATCH 08/11] remove unnecessary test, remove test that doesn't fail on master --- pandas/tests/reshape/merge/test_merge.py | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 8ca63819ecb31..a6a76a1078667 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1321,30 +1321,6 @@ def test_merge_right_index_right(self): @pytest.mark.parametrize("how", ["left", "right"]) def test_merge_preserves_row_order(self, how): # GH 27453 - a = [2, 5, 3, 5] - df1 = pd.DataFrame({"A": a, "B": [8, 2, 4, 1]}) - df2 = pd.DataFrame({"A": a, "B": [7, 1, 3, 0]}) - - result = df1.merge(df2[["A", 
"B"]], on=["A", "B"], how=how) - expected = pd.DataFrame({"A": a}) - if how == "right": - expected["B"] = df2["B"] - else: - expected["B"] = df1["B"] - tm.assert_frame_equal(result, expected) - - left_df = pd.DataFrame({"colors": ["blue", "red"]}, index=pd.Index([0, 1])) - right_df = pd.DataFrame({"hats": ["small", "big"]}, index=pd.Index([1, 0])) - result = left_df.merge(right_df, left_index=True, right_index=True, how=how) - if how == "right": - expected = pd.DataFrame( - {"colors": ["red", "blue"], "hats": ["small", "big"]} - ) - else: - expected = pd.DataFrame( - {"colors": ["blue", "red"], "hats": ["big", "small"]} - ) - left_df = pd.DataFrame({"animal": ["dog", "pig"], "max_speed": [40, 11]}) right_df = pd.DataFrame({"animal": ["quetzal", "pig"], "max_speed": [80, 11]}) result = left_df.merge(right_df, on=["animal", "max_speed"], how=how) From 0e2c529730503783aeeaaefc3c634f09ea8e5e74 Mon Sep 17 00:00:00 2001 From: MarcoGorelli Date: Sun, 15 Mar 2020 18:01:45 +0000 Subject: [PATCH 09/11] Add docstring --- pandas/core/reshape/merge.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 9418a55be41ee..df3e70ea2316d 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1822,7 +1822,9 @@ def _right_outer_join(x, y, max_groups): return left_indexer, right_indexer -def _factorize_keys(lk, rk, sort=True, how="inner") -> Tuple[np.array, np.array, int]: +def _factorize_keys( + lk, rk, sort: bool = True, how: str = "inner" +) -> Tuple[np.array, np.array, int]: """ Encode left and right keys as enumerated types. 
@@ -1884,7 +1886,7 @@ def _factorize_keys(lk, rk, sort=True, how="inner") -> Tuple[np.array, np.array, rk, _ = rk._values_for_factorize() elif ( - is_categorical_dtype(lk) and is_categorical_dtype(rk) and lk.is_dtype_equal(rk) + is_categorical_dtype(lk) and is_categorical_dtype(rk) and is_dtype_equal(lk, rk) ): if lk.categories.equals(rk.categories): # if we exactly match in categories, allow us to factorize on codes From 711d37c5fba8d5b0f9fc25a3ffce300e2e60f23d Mon Sep 17 00:00:00 2001 From: MarcoGorelli Date: Wed, 25 Mar 2020 23:54:13 +0000 Subject: [PATCH 10/11] use cast in categorical case --- pandas/core/reshape/merge.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 4400de1c945bd..6e024560ea2e4 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -6,14 +6,14 @@ import datetime from functools import partial import string -from typing import TYPE_CHECKING, Optional, Tuple, Union +from typing import TYPE_CHECKING, Optional, Tuple, Union, cast import warnings import numpy as np from pandas._libs import Timedelta, hashtable as libhashtable, lib import pandas._libs.join as libjoin -from pandas._typing import FrameOrSeries +from pandas._typing import ArrayLike, FrameOrSeries from pandas.errors import MergeError from pandas.util._decorators import Appender, Substitution @@ -24,6 +24,7 @@ is_array_like, is_bool, is_bool_dtype, + is_categorical, is_categorical_dtype, is_datetime64tz_dtype, is_dtype_equal, @@ -1823,7 +1824,7 @@ def _right_outer_join(x, y, max_groups): def _factorize_keys( - lk, rk, sort: bool = True, how: str = "inner" + lk: ArrayLike, rk: ArrayLike, sort: bool = True, how: str = "inner" ) -> Tuple[np.array, np.array, int]: """ Encode left and right keys as enumerated types. 
@@ -1888,6 +1889,9 @@ def _factorize_keys( elif ( is_categorical_dtype(lk) and is_categorical_dtype(rk) and is_dtype_equal(lk, rk) ): + assert is_categorical(lk) and is_categorical(rk) + lk = cast(Categorical, lk) + rk = cast(Categorical, rk) if lk.categories.equals(rk.categories): # if we exactly match in categories, allow us to factorize on codes rk = rk.codes From aaf542eba45ea4091f4fe93cab3a7aaadf0b8f5a Mon Sep 17 00:00:00 2001 From: MarcoGorelli Date: Thu, 26 Mar 2020 12:40:42 +0000 Subject: [PATCH 11/11] reword titles in whatsnew (v1.0.x -> previous behavior) --- doc/source/whatsnew/v1.1.0.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index a0119d15b3715..e0d60e56796dd 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -179,7 +179,7 @@ key and type of :class:`Index`. These now consistently raise ``KeyError`` (:iss left_df right_df -*pandas 1.0.x* +*Previous behavior*: .. code-block:: python @@ -188,7 +188,7 @@ key and type of :class:`Index`. These now consistently raise ``KeyError`` (:iss 0 pig 11 1 quetzal 80 -*pandas 1.1.0* +*New behavior*: .. ipython:: python