From de555f847d2878ff529a81543f5be800c1dc3ba1 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 13 May 2023 16:45:25 +0200 Subject: [PATCH 1/6] FIX do not convert to object dtype with datetime of different resolution in merge --- doc/source/whatsnew/v2.0.2.rst | 1 + pandas/core/reshape/merge.py | 10 +++++++++- pandas/tests/reshape/merge/test_merge.py | 20 ++++++++++++++++++++ 3 files changed, 30 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.2.rst b/doc/source/whatsnew/v2.0.2.rst index c8159be4d7b6b..cd3698e67e26d 100644 --- a/doc/source/whatsnew/v2.0.2.rst +++ b/doc/source/whatsnew/v2.0.2.rst @@ -33,6 +33,7 @@ Bug fixes - Bug in :meth:`DataFrame.convert_dtypes` ignores ``convert_*`` keywords when set to False ``dtype_backend="pyarrow"`` (:issue:`52872`) - Bug in :meth:`Series.describe` treating pyarrow-backed timestamps and timedeltas as categorical data (:issue:`53001`) - Bug in :meth:`pd.array` raising for ``NumPy`` array and ``pa.large_string`` or ``pa.large_binary`` (:issue:`52590`) +- Bug in :func:`merge` when merging on datetime columns on different resolutions (:issue:`53200`) .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index a96a08f18e81f..690cca68b67b9 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1395,6 +1395,14 @@ def _maybe_coerce_merge_keys(self) -> None: rk.dtype, DatetimeTZDtype ): raise ValueError(msg) + elif ( + isinstance(lk.dtype, DatetimeTZDtype) + and isinstance(rk.dtype, DatetimeTZDtype) + ) or ( + lk.dtype.kind == "M" and rk.dtype.kind == "M" + ): + # allows datetime with different resolutions + continue elif lk_is_object and rk_is_object: continue @@ -2352,7 +2360,7 @@ def _factorize_keys( if isinstance(lk.dtype, DatetimeTZDtype) and isinstance(rk.dtype, DatetimeTZDtype): # Extract the ndarray (UTC-localized) values # Note: we dont need the dtypes to match, as these can still be compared - # TODO(non-nano): need to make sure resolutions match + lk, rk = lk._ensure_matching_resos(rk) lk = cast("DatetimeArray", lk)._ndarray rk = cast("DatetimeArray", rk)._ndarray diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 017bf1c917e37..47751c26fec09 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -7,6 +7,7 @@ import numpy as np import pytest +import pytz from pandas.core.dtypes.common import is_object_dtype from pandas.core.dtypes.dtypes import CategoricalDtype @@ -2773,3 +2774,22 @@ def test_merge_arrow_and_numpy_dtypes(dtype): result = df2.merge(df) expected = df2.copy() tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("tzinfo", [None, pytz.timezone("America/Chicago")]) +def test_merge_datetime_different_resolution(tzinfo): + # https://github.com/pandas-dev/pandas/issues/53200 + df1 = pd.DataFrame({ + 't': [pd.Timestamp(2023, 5, 12, tzinfo=tzinfo)], + 'a': [1], + }) + df2 = df1.copy() + df2['t'] = df2['t'].dt.as_unit('s') + + expected = pd.DataFrame({ + 't': 
[pd.Timestamp(2023, 5, 12, tzinfo=tzinfo)], + 'a_x': [1], + 'a_y': [1], + }) + result = df1.merge(df2, on='t') + tm.assert_frame_equal(result, expected) From f49d58bb3bd1e0583b9c00c33a43c862923f41a7 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 13 May 2023 17:01:00 +0200 Subject: [PATCH 2/6] precommit --- pandas/core/reshape/merge.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 690cca68b67b9..1ebbe68d7e7e1 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1398,9 +1398,7 @@ def _maybe_coerce_merge_keys(self) -> None: elif ( isinstance(lk.dtype, DatetimeTZDtype) and isinstance(rk.dtype, DatetimeTZDtype) - ) or ( - lk.dtype.kind == "M" and rk.dtype.kind == "M" - ): + ) or (lk.dtype.kind == "M" and rk.dtype.kind == "M"): # allows datetime with different resolutions continue From 255f968df6ccd1355d8ce09ceab74e19aa84cadd Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 13 May 2023 17:16:05 +0200 Subject: [PATCH 3/6] another file --- pandas/tests/reshape/merge/test_merge.py | 28 ++++++++++++++---------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 47751c26fec09..a01903c2e785d 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2779,17 +2779,21 @@ def test_merge_arrow_and_numpy_dtypes(dtype): @pytest.mark.parametrize("tzinfo", [None, pytz.timezone("America/Chicago")]) def test_merge_datetime_different_resolution(tzinfo): # https://github.com/pandas-dev/pandas/issues/53200 - df1 = pd.DataFrame({ - 't': [pd.Timestamp(2023, 5, 12, tzinfo=tzinfo)], - 'a': [1], - }) + df1 = DataFrame( + { + "t": [pd.Timestamp(2023, 5, 12, tzinfo=tzinfo)], + "a": [1], + } + ) df2 = df1.copy() - df2['t'] = df2['t'].dt.as_unit('s') - - expected = pd.DataFrame({ - 't': [pd.Timestamp(2023, 5, 12, 
tzinfo=tzinfo)], - 'a_x': [1], - 'a_y': [1], - }) - result = df1.merge(df2, on='t') + df2["t"] = df2["t"].dt.as_unit("s") + + expected = DataFrame( + { + "t": [pd.Timestamp(2023, 5, 12, tzinfo=tzinfo)], + "a_x": [1], + "a_y": [1], + } + ) + result = df1.merge(df2, on="t") tm.assert_frame_equal(result, expected) From 83bc00cf6e3d3fbdd7667efa93ff665afa4bf6c8 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 13 May 2023 17:22:28 +0200 Subject: [PATCH 4/6] another file --- doc/source/whatsnew/v2.0.2.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.2.rst b/doc/source/whatsnew/v2.0.2.rst index cd3698e67e26d..92847bef31778 100644 --- a/doc/source/whatsnew/v2.0.2.rst +++ b/doc/source/whatsnew/v2.0.2.rst @@ -28,12 +28,12 @@ Bug fixes - Bug in :func:`api.interchange.from_dataframe` was raising ``IndexError`` on empty categorical data (:issue:`53077`) - Bug in :func:`api.interchange.from_dataframe` was returning :class:`DataFrame`'s of incorrect sizes when called on slices (:issue:`52824`) - Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on bitmasks (:issue:`49888`) +- Bug in :func:`merge` when merging on datetime columns on different resolutions (:issue:`53200`) - Bug in :func:`to_timedelta` was raising ``ValueError`` with ``pandas.NA`` (:issue:`52909`) - Bug in :meth:`DataFrame.__getitem__` not preserving dtypes for :class:`MultiIndex` partial keys (:issue:`51895`) - Bug in :meth:`DataFrame.convert_dtypes` ignores ``convert_*`` keywords when set to False ``dtype_backend="pyarrow"`` (:issue:`52872`) - Bug in :meth:`Series.describe` treating pyarrow-backed timestamps and timedeltas as categorical data (:issue:`53001`) - Bug in :meth:`pd.array` raising for ``NumPy`` array and ``pa.large_string`` or ``pa.large_binary`` (:issue:`52590`) -- Bug in :func:`merge` when merging on datetime columns on different resolutions (:issue:`53200`) .. 
--------------------------------------------------------------------------- From 046793332bd72811100179e5abf9e599bea1cf78 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 13 May 2023 22:33:47 +0200 Subject: [PATCH 5/6] cast to make mypy happy --- pandas/core/reshape/merge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 1ebbe68d7e7e1..7c046bddab4e8 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2358,7 +2358,7 @@ def _factorize_keys( if isinstance(lk.dtype, DatetimeTZDtype) and isinstance(rk.dtype, DatetimeTZDtype): # Extract the ndarray (UTC-localized) values # Note: we dont need the dtypes to match, as these can still be compared - lk, rk = lk._ensure_matching_resos(rk) + lk, rk = cast("DatetimeArray", lk)._ensure_matching_resos(rk) lk = cast("DatetimeArray", lk)._ndarray rk = cast("DatetimeArray", rk)._ndarray From a51d23af4956b53fb6221dddd28d7bf674b04e78 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 14 May 2023 14:41:07 +0200 Subject: [PATCH 6/6] TST set explicitly the unit to nanoseconds --- pandas/tests/reshape/merge/test_merge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index a01903c2e785d..088ebf6c88e6a 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2781,7 +2781,7 @@ def test_merge_datetime_different_resolution(tzinfo): # https://github.com/pandas-dev/pandas/issues/53200 df1 = DataFrame( { - "t": [pd.Timestamp(2023, 5, 12, tzinfo=tzinfo)], + "t": [pd.Timestamp(2023, 5, 12, tzinfo=tzinfo, unit="ns")], "a": [1], } )