From bf467a16c772b83ab10a130210cdfc51a421a58a Mon Sep 17 00:00:00 2001 From: ssche Date: Mon, 16 Aug 2021 16:58:10 +1000 Subject: [PATCH 1/4] Added test case (#39881) --- .../tests/frame/methods/test_combine_first.py | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index dd91b32c8eb8c..0aab2f0a38f72 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -492,3 +492,23 @@ def test_combine_preserve_dtypes(): ) combined = df1.combine_first(df2) tm.assert_frame_equal(combined, expected) + + +def test_combine_first_duplicates_rows_for_nan_index_values(): + # GH39881 + df1 = pd.DataFrame({"a": [1, 2, 3], "b": [np.nan, 5, 6], "x": [9, 10, 11]}) + df1 = df1.set_index(["a", "b"]) + + df2 = pd.DataFrame({"a": [1, 2, 4], "b": [np.nan, 5, 7], "y": [12, 13, 14]}) + df2 = df2.set_index(["a", "b"]) + + expected = DataFrame( + { + "a": [1, 2, 3, 4], + "b": [np.nan, 5, 6, 7], + "x": [9.0, 10., 11.0, np.nan], + "y": [12.0, 13.0, np.nan, 14.0], + } + ).set_index(["a", "b"]) + combined = df1.combine_first(df2) + tm.assert_frame_equal(combined, expected) From 7d0485b520e82620902673d58a66dffd439d41aa Mon Sep 17 00:00:00 2001 From: ssche Date: Mon, 16 Aug 2021 23:28:14 +1000 Subject: [PATCH 2/4] Fixed build error * "found both 'pd.DataFrame' and 'DataFrame' in the same file" --- pandas/tests/frame/methods/test_combine_first.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index 0aab2f0a38f72..fa9074355a19a 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -496,10 +496,10 @@ def test_combine_preserve_dtypes(): def test_combine_first_duplicates_rows_for_nan_index_values(): # GH39881 - df1 = pd.DataFrame({"a": [1, 2, 3], "b": [np.nan, 5, 6], "x": [9, 10, 11]}) + df1 = DataFrame({"a": [1, 2, 3], "b": [np.nan, 5, 6], "x": [9, 10, 11]}) df1 = df1.set_index(["a", "b"]) - df2 = pd.DataFrame({"a": [1, 2, 4], "b": [np.nan, 5, 7], "y": [12, 13, 14]}) + df2 = DataFrame({"a": [1, 2, 4], "b": [np.nan, 5, 7], "y": [12, 13, 14]}) df2 = df2.set_index(["a", "b"]) expected = DataFrame( From b2478bf9120181255dab2da61fa774cafd755766 Mon Sep 17 00:00:00 2001 From: ssche Date: Mon, 16 Aug 2021 23:33:03 +1000 Subject: [PATCH 3/4] Fixed formatting --- pandas/tests/frame/methods/test_combine_first.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index fa9074355a19a..46ab41170d452 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -506,7 +506,7 @@ def test_combine_first_duplicates_rows_for_nan_index_values(): { "a": [1, 2, 3, 4], "b": [np.nan, 5, 6, 7], - "x": [9.0, 10., 11.0, np.nan], + "x": [9.0, 10.0, 11.0, np.nan], "y": [12.0, 13.0, np.nan, 14.0], } ).set_index(["a", "b"]) From 8a55f8564a420969828a2cdc8835d242fceab10e Mon Sep 17 00:00:00 2001 From: ssche Date: Tue, 17 Aug 2021 09:57:22 +1000 Subject: [PATCH 4/4] Addressed review comments * Explicitly define index in constructor --- .../tests/frame/methods/test_combine_first.py | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index 46ab41170d452..382c11f23a517 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -496,19 +496,24 @@ def test_combine_preserve_dtypes(): def test_combine_first_duplicates_rows_for_nan_index_values(): # GH39881 - df1 = DataFrame({"a": [1, 2, 3], "b": [np.nan, 5, 6], "x": [9, 10, 11]}) - df1 = df1.set_index(["a", "b"]) + df1 = DataFrame( + {"x": [9, 10, 11]}, + index=MultiIndex.from_arrays([[1, 2, 3], [np.nan, 5, 6]], names=["a", "b"]), + ) - df2 = DataFrame({"a": [1, 2, 4], "b": [np.nan, 5, 7], "y": [12, 13, 14]}) - df2 = df2.set_index(["a", "b"]) + df2 = DataFrame( + {"y": [12, 13, 14]}, + index=MultiIndex.from_arrays([[1, 2, 4], [np.nan, 5, 7]], names=["a", "b"]), + ) expected = DataFrame( { - "a": [1, 2, 3, 4], - "b": [np.nan, 5, 6, 7], "x": [9.0, 10.0, 11.0, np.nan], "y": [12.0, 13.0, np.nan, 14.0], - } - ).set_index(["a", "b"]) + }, + index=MultiIndex.from_arrays( + [[1, 2, 3, 4], [np.nan, 5.0, 6.0, 7.0]], names=["a", "b"] + ), + ) combined = df1.combine_first(df2) tm.assert_frame_equal(combined, expected)