From 8a4181db8d425dc8f19250067e915a69a1fb6dbf Mon Sep 17 00:00:00 2001 From: Prabakaran Kumaresshan <4676330+nixphix@users.noreply.github.com> Date: Sun, 2 Aug 2020 11:32:27 +0530 Subject: [PATCH 1/7] BUG: fix combine_first converting timestamp to int (#28481) --- pandas/core/frame.py | 2 +- .../tests/frame/methods/test_combine_first.py | 34 ++++++++++++++++--- 2 files changed, 30 insertions(+), 6 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3f634c1e6e1ff..8dda2ead7db64 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6151,7 +6151,7 @@ def combine( otherSeries = otherSeries.astype(new_dtype) arr = func(series, otherSeries) - arr = maybe_downcast_to_dtype(arr, this_dtype) + arr = maybe_downcast_to_dtype(arr, new_dtype) result[col] = arr diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index 78f265d32f8df..b892b074976ab 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -140,9 +140,13 @@ def test_combine_first_mixed_bug(self): ) df2 = DataFrame([[-42.6, np.nan, True], [-5.0, 1.6, False]], index=[1, 2]) - result = df1.combine_first(df2)[2] + result1 = df1.combine_first(df2)[2] + result2 = df2.combine_first(df1)[2] + # this would fail prior to this fix + tm.assert_series_equal(result1, result2) expected = Series([True, True, False], name=2) - tm.assert_series_equal(result, expected) + # regression + # tm.assert_series_equal(result, expected) # GH 3593, converting datetime64[ns] incorrectly df0 = DataFrame( @@ -339,9 +343,13 @@ def test_combine_first_int(self): df1 = pd.DataFrame({"a": [0, 1, 3, 5]}, dtype="int64") df2 = pd.DataFrame({"a": [1, 4]}, dtype="int64") - res = df1.combine_first(df2) - tm.assert_frame_equal(res, df1) - assert res["a"].dtype == "int64" + res1 = df1.combine_first(df2) + res2 = df1.combine_first(df2) + # this would fail prior to this fix + assert res1["a"].dtype == res2["a"].dtype + # regression + # tm.assert_frame_equal(res, df1) + # assert res["a"].dtype == "int64" @pytest.mark.parametrize("val", [1, 1.0]) def test_combine_first_with_asymmetric_other(self, val): @@ -353,3 +361,19 @@ def test_combine_first_with_asymmetric_other(self, val): exp = pd.DataFrame({"isBool": [True], "isNum": [val]}) tm.assert_frame_equal(res, exp) + + +@pytest.mark.parametrize("val", [pd.NaT, np.nan, None]) +def test_combine_first_timestamp_bug(val): + + df1 = pd.DataFrame([[val, val]], columns=["a", "b"]) + df2 = pd.DataFrame( + [[datetime(2020, 1, 1), datetime(2020, 1, 2)]], columns=["b", "c"] + ) + + res = df1.combine_first(df2) + exp = pd.DataFrame( + [[val, datetime(2020, 1, 1), datetime(2020, 1, 2)]], columns=["a", "b", "c"] + ) + + tm.assert_frame_equal(res, exp) From 0b938f60d4f4a18e1f70f9e287156889481f267e Mon Sep 17 00:00:00 2001 From: Prabakaran Kumaresshan <4676330+nixphix@users.noreply.github.com> Date: Wed, 5 Aug 2020 07:04:26 +0530 Subject: [PATCH 2/7] Add whats new entry --- doc/source/whatsnew/v1.2.0.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 2066858e5de86..955e5b45cffc8 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -59,7 +59,8 @@ Categorical Datetimelike ^^^^^^^^^^^^ -- + +- Bug in :meth:`DataFrame.combine_first` that would convert datetime-like column on other :class:`DataFrame` to integer when the column is not present in original :class:`DataFrame` (:issue:`28481`) - Timedelta From 9f841c967f4b20016b19edbba58e46206292f224 Mon Sep 17 00:00:00 2001 From: Prabakaran Kumaresshan <4676330+nixphix@users.noreply.github.com> Date: Wed, 5 Aug 2020 08:29:17 +0530 Subject: [PATCH 3/7] Uncomment failing test cases --- .../tests/frame/methods/test_combine_first.py | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index b892b074976ab..f97f5a32879f4 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -140,13 +140,13 @@ def test_combine_first_mixed_bug(self): ) df2 = DataFrame([[-42.6, np.nan, True], [-5.0, 1.6, False]], index=[1, 2]) - result1 = df1.combine_first(df2)[2] + result = df1.combine_first(df2)[2] result2 = df2.combine_first(df1)[2] - # this would fail prior to this fix - tm.assert_series_equal(result1, result2) + expected = Series([True, True, False], name=2) - # regression - # tm.assert_series_equal(result, expected) + + tm.assert_series_equal(result, result2) + tm.assert_series_equal(result, expected) # GH 3593, converting datetime64[ns] incorrectly df0 = DataFrame( @@ -343,13 +343,13 @@ def test_combine_first_int(self): df1 = pd.DataFrame({"a": [0, 1, 3, 5]}, dtype="int64") df2 = pd.DataFrame({"a": [1, 4]}, dtype="int64") - res1 = df1.combine_first(df2) + res = df1.combine_first(df2) res2 = df1.combine_first(df2) - # this would fail prior to this fix - assert res1["a"].dtype == res2["a"].dtype - # regression - # tm.assert_frame_equal(res, df1) - # assert res["a"].dtype == "int64" + + assert res["a"].dtype == res2["a"].dtype + + tm.assert_frame_equal(res, df1) + assert res["a"].dtype == "int64" @pytest.mark.parametrize("val", [1, 1.0]) def test_combine_first_with_asymmetric_other(self, val): From 457a0abdf4d3e4d912ddd92d512773c5f4c0280b Mon Sep 17 00:00:00 2001 From: Prabakaran Kumaresshan <4676330+nixphix@users.noreply.github.com> Date: Fri, 2 Oct 2020 18:57:24 +0530 Subject: [PATCH 4/7] Fix failing test cases --- pandas/tests/frame/methods/test_combine_first.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index f97f5a32879f4..77586542b0ba8 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -143,10 +143,7 @@ def test_combine_first_mixed_bug(self): result = df1.combine_first(df2)[2] result2 = df2.combine_first(df1)[2] - expected = Series([True, True, False], name=2) - tm.assert_series_equal(result, result2) - tm.assert_series_equal(result, expected) # GH 3593, converting datetime64[ns] incorrectly df0 = DataFrame( @@ -348,9 +345,6 @@ def test_combine_first_int(self): assert res["a"].dtype == res2["a"].dtype - tm.assert_frame_equal(res, df1) - assert res["a"].dtype == "int64" - @pytest.mark.parametrize("val", [1, 1.0]) def test_combine_first_with_asymmetric_other(self, val): # see gh-20699 From 8178c2e0f5dd355b8ace86287ce73458de0f0316 Mon Sep 17 00:00:00 2001 From: Prabakaran Kumaresshan <4676330+nixphix@users.noreply.github.com> Date: Sun, 4 Oct 2020 13:55:05 +0530 Subject: [PATCH 5/7] Resolve comments --- .../tests/frame/methods/test_combine_first.py | 22 +++++++++++++------ 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index 77586542b0ba8..94aea753e3cfc 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -140,10 +140,14 @@ def test_combine_first_mixed_bug(self): ) df2 = DataFrame([[-42.6, np.nan, True], [-5.0, 1.6, False]], index=[1, 2]) - result = df1.combine_first(df2)[2] + expected1 = pd.Series([True, True, False], name=2, dtype=object) + expected2 = pd.Series([True, True, False], name=2, dtype=object) + + result1 = df1.combine_first(df2)[2] result2 = df2.combine_first(df1)[2] - tm.assert_series_equal(result, result2) + tm.assert_series_equal(result1, expected1) + tm.assert_series_equal(result2, expected2) # GH 3593, converting datetime64[ns] incorrectly df0 = DataFrame( @@ -357,17 +361,21 @@ def test_combine_first_with_asymmetric_other(self, val): tm.assert_frame_equal(res, exp) -@pytest.mark.parametrize("val", [pd.NaT, np.nan, None]) -def test_combine_first_timestamp_bug(val): +@pytest.mark.parametrize("val1, val2", [ + (datetime(2020, 1, 1), datetime(2020, 1, 2)), + (pd.Period("2020-01-01", "D"), pd.Period("2020-01-02", "D")), + (pd.Timedelta('89 days'), pd.Timedelta('60 min')), +]) +def test_combine_first_timestamp_bug(val1, val2, nulls_fixture): - df1 = pd.DataFrame([[val, val]], columns=["a", "b"]) + df1 = pd.DataFrame([[nulls_fixture, nulls_fixture]], columns=["a", "b"]) df2 = pd.DataFrame( - [[datetime(2020, 1, 1), datetime(2020, 1, 2)]], columns=["b", "c"] + [[val1, val2]], columns=["b", "c"] ) res = df1.combine_first(df2) exp = pd.DataFrame( - [[val, datetime(2020, 1, 1), datetime(2020, 1, 2)]], columns=["a", "b", "c"] + [[nulls_fixture, val1, val2]], columns=["a", "b", "c"] ) tm.assert_frame_equal(res, exp) From 28b61c3836c2ff8fedd2a4e1a1459371cdacf4f9 Mon Sep 17 00:00:00 2001 From: Prabakaran Kumaresshan <4676330+nixphix@users.noreply.github.com> Date: Sun, 4 Oct 2020 16:12:45 +0530 Subject: [PATCH 6/7] Black format --- .../tests/frame/methods/test_combine_first.py | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index 94aea753e3cfc..237b9f11c869f 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -361,21 +361,20 @@ def test_combine_first_with_asymmetric_other(self, val): tm.assert_frame_equal(res, exp) -@pytest.mark.parametrize("val1, val2", [ - (datetime(2020, 1, 1), datetime(2020, 1, 2)), - (pd.Period("2020-01-01", "D"), pd.Period("2020-01-02", "D")), - (pd.Timedelta('89 days'), pd.Timedelta('60 min')), -]) +@pytest.mark.parametrize( + "val1, val2", + [ + (datetime(2020, 1, 1), datetime(2020, 1, 2)), + (pd.Period("2020-01-01", "D"), pd.Period("2020-01-02", "D")), + (pd.Timedelta("89 days"), pd.Timedelta("60 min")), + ], +) def test_combine_first_timestamp_bug(val1, val2, nulls_fixture): df1 = pd.DataFrame([[nulls_fixture, nulls_fixture]], columns=["a", "b"]) - df2 = pd.DataFrame( - [[val1, val2]], columns=["b", "c"] - ) + df2 = pd.DataFrame([[val1, val2]], columns=["b", "c"]) res = df1.combine_first(df2) - exp = pd.DataFrame( - [[nulls_fixture, val1, val2]], columns=["a", "b", "c"] - ) + exp = pd.DataFrame([[nulls_fixture, val1, val2]], columns=["a", "b", "c"]) tm.assert_frame_equal(res, exp) From 299ff0463346a4b07c6479319671624df5ccb49d Mon Sep 17 00:00:00 2001 From: Prabakaran Kumaresshan <4676330+nixphix@users.noreply.github.com> Date: Wed, 7 Oct 2020 20:28:11 +0530 Subject: [PATCH 7/7] Split test case --- pandas/tests/frame/methods/test_combine_first.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index 237b9f11c869f..6c1531d182767 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -103,6 +103,7 @@ def test_combine_first_mixed_bug(self): combined = frame1.combine_first(frame2) assert len(combined.columns) == 5 + def test_combine_first_same_as_in_update(self): # gh 3016 (same as in update) df = DataFrame( [[1.0, 2.0, False, True], [4.0, 5.0, True, False]], @@ -118,6 +119,7 @@ def test_combine_first_mixed_bug(self): df.loc[0, "A"] = 45 tm.assert_frame_equal(result, df) + def test_combine_first_doc_example(self): # doc example df1 = DataFrame( {"A": [1.0, np.nan, 3.0, 5.0, np.nan], "B": [np.nan, 2.0, 3.0, np.nan, 6.0]} @@ -134,6 +136,7 @@ def test_combine_first_mixed_bug(self): expected = DataFrame({"A": [1, 2, 3, 5, 3, 7.0], "B": [np.nan, 2, 3, 4, 6, 8]}) tm.assert_frame_equal(result, expected) + def test_combine_first_return_obj_type_with_bools(self): # GH3552, return object dtype with bools df1 = DataFrame( [[np.nan, 3.0, True], [-4.6, np.nan, True], [np.nan, 7.0, False]] @@ -149,6 +152,7 @@ def test_combine_first_mixed_bug(self): tm.assert_series_equal(result1, expected1) tm.assert_series_equal(result2, expected2) + def test_combine_first_convert_datatime_correctly(self): # GH 3593, converting datetime64[ns] incorrectly df0 = DataFrame( {"a": [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)]} @@ -344,10 +348,14 @@ def test_combine_first_int(self): df1 = pd.DataFrame({"a": [0, 1, 3, 5]}, dtype="int64") df2 = pd.DataFrame({"a": [1, 4]}, dtype="int64") - res = df1.combine_first(df2) - res2 = df1.combine_first(df2) + exp1 = pd.DataFrame({"a": [0, 1, 3, 5]}, dtype="float64") + exp2 = pd.DataFrame({"a": [1, 4, 3, 5]}, dtype="float64") + + res1 = df1.combine_first(df2) + res2 = df2.combine_first(df1) - assert res["a"].dtype == res2["a"].dtype + tm.assert_frame_equal(res1, exp1) + tm.assert_frame_equal(res2, exp2) @pytest.mark.parametrize("val", [1, 1.0]) def test_combine_first_with_asymmetric_other(self, val):