From 6a06a31606b7925f5ec8f47d4651706eead0ab0f Mon Sep 17 00:00:00 2001 From: Petroncini Date: Wed, 11 Sep 2024 17:04:59 -0300 Subject: [PATCH 01/11] solved groupbyany bug --- pandas/core/groupby/ops.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index da80969b613cd..15db407e11fc2 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -374,6 +374,13 @@ def _call_cython_op( if is_datetimelike: values = values.view("int64") is_numeric = True + + # Fix for NaT handling: ensure NaT is treated as False in any() and all() + if self.how in ["any", "all"]: + # Set NaT (which is represented as the smallest int64) to False (0) + nat_mask = values == np.iinfo(np.int64).min + values[nat_mask] = 0 # Treat NaT as False + elif dtype.kind == "b": values = values.view("uint8") if values.dtype == "float16": From 28b5450829be3bdb1ede6130b465a0f5e4ad16c3 Mon Sep 17 00:00:00 2001 From: Petroncini Date: Wed, 11 Sep 2024 17:10:34 -0300 Subject: [PATCH 02/11] added test for groupby --- pandas/tests/groupby/test_grouping.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index fc2a8a970010a..b8ef07b1fdeb9 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -1180,3 +1180,19 @@ def test_grouping_by_key_is_in_axis(): result = gb.sum() expected = DataFrame({"a": [1, 2], "b": [1, 2], "c": [7, 5]}) tm.assert_frame_equal(result, expected) + + +def test_groupby_any_with_timedelta(self): + # Create a DataFrame with Timedelta and NaT values + df = DataFrame({ + "A": ["foo", "foo", "bar", "bar"], + "B": [pd.Timedelta(1, unit='D'), pd.NaT, pd.Timedelta(2, unit='D'), pd.NaT] + }) + + # Group by column A and check if any Timedelta exists (i.e., non-NaT) + result = df.groupby("A")["B"].any() + + # Expected result: groups with only NaT should return False, others should return 
True + expected = Series([True, False], index=["foo", "bar"], name="B") + + tm.assert_series_equal(result, expected) From ca406527560bfb9a31d29ab8a84fe5bff006e216 Mon Sep 17 00:00:00 2001 From: Petroncini Date: Tue, 24 Sep 2024 22:05:52 -0300 Subject: [PATCH 03/11] changed ops.py to correct mypy error --- pandas/core/groupby/ops.py | 35 +++++++++++++++-------------------- 1 file changed, 15 insertions(+), 20 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 15db407e11fc2..1e2021dec200a 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -371,34 +371,29 @@ def _call_cython_op( is_datetimelike = dtype.kind in "mM" - if is_datetimelike: - values = values.view("int64") + if self.how in ["any", "all"]: + if mask is None: + mask = isna(values) + values = values.astype(bool, copy=False).view(np.int8) is_numeric = True - # Fix for NaT handling: ensure NaT is treated as False in any() and all() - if self.how in ["any", "all"]: - # Set NaT (which is represented as the smallest int64) to False (0) - nat_mask = values == np.iinfo(np.int64).min - values[nat_mask] = 0 # Treat NaT as False + if is_datetimelike: + # Handle NaT values correctly + if self.how == "any" and mask is not None: + # For "any", we want True only if there's at least one non-NaT value + values = (~mask).astype(np.int8) # Convert mask to int8 + elif self.how == "all" and mask is not None: + # For "all", we want True only if all values are non-NaT + values = (~mask).all(axis=1, keepdims=True).astype(np.int8) + is_numeric = True + else: + values = values.view("int64") # Handle other cases appropriately elif dtype.kind == "b": values = values.view("uint8") if values.dtype == "float16": values = values.astype(np.float32) - if self.how in ["any", "all"]: - if mask is None: - mask = isna(values) - if dtype == object: - if kwargs["skipna"]: - # GH#37501: don't raise on pd.NA when skipna=True - if mask.any(): - # mask on original values computed separately 
- values = values.copy() - values[mask] = True - values = values.astype(bool, copy=False).view(np.int8) - is_numeric = True - values = values.T if mask is not None: mask = mask.T From 5a3e722d683afd818286043a07a6e0912efa74d1 Mon Sep 17 00:00:00 2001 From: Petroncini Date: Tue, 24 Sep 2024 22:07:40 -0300 Subject: [PATCH 04/11] changed comments --- pandas/tests/groupby/test_grouping.py | 29 ++++++++++++++++++--------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index b8ef07b1fdeb9..94cd73d7041fd 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -1182,17 +1182,26 @@ def test_grouping_by_key_is_in_axis(): tm.assert_frame_equal(result, expected) -def test_groupby_any_with_timedelta(self): - # Create a DataFrame with Timedelta and NaT values - df = DataFrame({ +def test_groupby_any_with_timedelta(): + # Create a DataFrame with Timedelta and NaT values + df = DataFrame( + { "A": ["foo", "foo", "bar", "bar"], - "B": [pd.Timedelta(1, unit='D'), pd.NaT, pd.Timedelta(2, unit='D'), pd.NaT] - }) + "B": [pd.Timedelta(1, unit="D"), pd.NaT, pd.Timedelta(2, unit="D"), pd.NaT], + } + ) - # Group by column A and check if any Timedelta exists (i.e., non-NaT) - result = df.groupby("A")["B"].any() + # Group by column A with sorting enabled and check if any Timedelta exists + result = df.groupby("A", sort=True)["B"].any() - # Expected result: groups with only NaT should return False, others should return True - expected = Series([True, False], index=["foo", "bar"], name="B") + # Corrected expected result: groups with only NaT should return False, else True + expected = Series([True, True], index=["foo", "bar"], name="B") - tm.assert_series_equal(result, expected) + # Set the expected index name to match the result + expected.index.name = "A" + + # Sort the expected result to match the order of result + expected = expected.sort_index() + + # 
Assert that the result matches the expected output + tm.assert_series_equal(result, expected) From ab6bf953148c3a44ca85e839ca71dbcc3bcf459d Mon Sep 17 00:00:00 2001 From: Petroncini Date: Fri, 27 Sep 2024 16:20:36 -0300 Subject: [PATCH 05/11] simpler solution --- pandas/core/groupby/ops.py | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 1e2021dec200a..703c270132a02 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -371,24 +371,9 @@ def _call_cython_op( is_datetimelike = dtype.kind in "mM" - if self.how in ["any", "all"]: - if mask is None: - mask = isna(values) - values = values.astype(bool, copy=False).view(np.int8) - is_numeric = True - if is_datetimelike: - # Handle NaT values correctly - if self.how == "any" and mask is not None: - # For "any", we want True only if there's at least one non-NaT value - values = (~mask).astype(np.int8) # Convert mask to int8 - elif self.how == "all" and mask is not None: - # For "all", we want True only if all values are non-NaT - values = (~mask).all(axis=1, keepdims=True).astype(np.int8) - is_numeric = True - else: - values = values.view("int64") # Handle other cases appropriately - + values = values.view("int64") + is_numeric = True elif dtype.kind == "b": values = values.view("uint8") if values.dtype == "float16": @@ -400,6 +385,19 @@ def _call_cython_op( if result_mask is not None: result_mask = result_mask.T + if self.how in ["any", "all"]: + if mask is None: + mask = isna(values) + if dtype == object: + if kwargs["skipna"]: + # GH#37501: don't raise on pd.NA when skipna=True + if mask.any(): + # mask on original values computed separately + values = values.copy() + values[mask] = True + values = values.astype(bool, copy=False).view(np.int8) + is_numeric = True + out_shape = self._get_output_shape(ngroups, values) func = self._get_cython_function(self.kind, self.how, values.dtype, 
is_numeric) values = self._get_cython_vals(values) From 2cb1ef258856c5324deb8b5ea726a95583dc9dd1 Mon Sep 17 00:00:00 2001 From: Petroncini Date: Mon, 30 Sep 2024 16:06:21 -0300 Subject: [PATCH 06/11] changed order of mask=isna(values) --- pandas/core/groupby/ops.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 703c270132a02..0e99178642715 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -371,6 +371,10 @@ def _call_cython_op( is_datetimelike = dtype.kind in "mM" + if self.how in ["any", "all"]: + if mask is None: + mask = isna(values) + if is_datetimelike: values = values.view("int64") is_numeric = True @@ -379,25 +383,23 @@ def _call_cython_op( if values.dtype == "float16": values = values.astype(np.float32) - values = values.T - if mask is not None: - mask = mask.T - if result_mask is not None: - result_mask = result_mask.T - if self.how in ["any", "all"]: - if mask is None: - mask = isna(values) if dtype == object: if kwargs["skipna"]: # GH#37501: don't raise on pd.NA when skipna=True - if mask.any(): + if mask is not None and mask.any(): # mask on original values computed separately values = values.copy() values[mask] = True values = values.astype(bool, copy=False).view(np.int8) is_numeric = True + values = values.T + if mask is not None: + mask = mask.T + if result_mask is not None: + result_mask = result_mask.T + out_shape = self._get_output_shape(ngroups, values) func = self._get_cython_function(self.kind, self.how, values.dtype, is_numeric) values = self._get_cython_vals(values) From b233e12b0785a09a60fd380c493e9ac0b589b2c0 Mon Sep 17 00:00:00 2001 From: Petroncini Date: Mon, 30 Sep 2024 16:07:11 -0300 Subject: [PATCH 07/11] changed test --- pandas/tests/groupby/test_grouping.py | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/pandas/tests/groupby/test_grouping.py 
b/pandas/tests/groupby/test_grouping.py index 94cd73d7041fd..37c0ec3c8b4b5 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -1183,25 +1183,14 @@ def test_grouping_by_key_is_in_axis(): def test_groupby_any_with_timedelta(): - # Create a DataFrame with Timedelta and NaT values - df = DataFrame( - { - "A": ["foo", "foo", "bar", "bar"], - "B": [pd.Timedelta(1, unit="D"), pd.NaT, pd.Timedelta(2, unit="D"), pd.NaT], - } - ) - - # Group by column A with sorting enabled and check if any Timedelta exists - result = df.groupby("A", sort=True)["B"].any() - - # Corrected expected result: groups with only NaT should return False, else True - expected = Series([True, True], index=["foo", "bar"], name="B") + # Create a DataFrame with a single column containing a Timedelta and NaT + df = DataFrame({"value": [pd.Timedelta(1), pd.NaT]}) - # Set the expected index name to match the result - expected.index.name = "A" + # Perform groupby().any() operation + result = df.groupby(np.array([0, 1]))["value"].any() - # Sort the expected result to match the order of result - expected = expected.sort_index() + # Expected result: group with NaT should return False + expected = Series({0: True, 1: False}, name="value") - # Assert that the result matches the expected output + # Check if the result matches the expected output tm.assert_series_equal(result, expected) From 80e62251965e068b15d0020b891efaf58f4220e1 Mon Sep 17 00:00:00 2001 From: Petroncini Date: Mon, 30 Sep 2024 20:27:59 -0300 Subject: [PATCH 08/11] changed test --- pandas/tests/groupby/test_grouping.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 37c0ec3c8b4b5..458f2b1b56334 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -1187,7 +1187,7 @@ def test_groupby_any_with_timedelta(): df = DataFrame({"value": [pd.Timedelta(1), pd.NaT]}) # 
Perform groupby().any() operation - result = df.groupby(np.array([0, 1]))["value"].any() + result = df.groupby([0, 1])["value"].any() # Expected result: group with NaT should return False expected = Series({0: True, 1: False}, name="value") From 8405f657a492867697c9c54e70e823ba916f778b Mon Sep 17 00:00:00 2001 From: Petroncini Date: Mon, 30 Sep 2024 20:57:44 -0300 Subject: [PATCH 09/11] handling dtype in test --- pandas/tests/groupby/test_grouping.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 458f2b1b56334..8410155137c87 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -1187,10 +1187,11 @@ def test_groupby_any_with_timedelta(): df = DataFrame({"value": [pd.Timedelta(1), pd.NaT]}) # Perform groupby().any() operation - result = df.groupby([0, 1])["value"].any() + result = df.groupby(np.array([0, 1], dtype=np.int64))["value"].any() # Expected result: group with NaT should return False - expected = Series({0: True, 1: False}, name="value") + expected = Series({0: True, 1: False}, name="value", dtype=bool) + expected.index = expected.index.astype(np.int64) # Check if the result matches the expected output tm.assert_series_equal(result, expected) From 034a0e9712d022c411958413a49b3c497f5795a4 Mon Sep 17 00:00:00 2001 From: Petroncini Date: Mon, 30 Sep 2024 22:21:33 -0300 Subject: [PATCH 10/11] removed redundant comment, added issue number, updated doc --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/tests/groupby/test_grouping.py | 5 +---- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 41ba80989a0ce..a6f12d98054f7 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -652,6 +652,7 @@ Plotting Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Bug in :meth:`.DataFrameGroupBy.__len__` and 
:meth:`.SeriesGroupBy.__len__` would raise when the grouping contained NA values and ``dropna=False`` (:issue:`58644`) +- Bug in :meth:`.DataFrameGroupBy.any` that returned True for groups where all Timedelta values are NaT. (:issue:`59782`) - Bug in :meth:`.DataFrameGroupBy.groups` and :meth:`.SeriesGroupby.groups` that would not respect groupby argument ``dropna`` (:issue:`55919`) - Bug in :meth:`.DataFrameGroupBy.median` where nat values gave an incorrect result. (:issue:`57926`) - Bug in :meth:`.DataFrameGroupBy.quantile` when ``interpolation="nearest"`` is inconsistent with :meth:`DataFrame.quantile` (:issue:`47942`) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 8410155137c87..6bb2eaf89b5d7 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -1183,15 +1183,12 @@ def test_grouping_by_key_is_in_axis(): def test_groupby_any_with_timedelta(): - # Create a DataFrame with a single column containing a Timedelta and NaT + # GH#59712 df = DataFrame({"value": [pd.Timedelta(1), pd.NaT]}) - # Perform groupby().any() operation result = df.groupby(np.array([0, 1], dtype=np.int64))["value"].any() - # Expected result: group with NaT should return False expected = Series({0: True, 1: False}, name="value", dtype=bool) expected.index = expected.index.astype(np.int64) - # Check if the result matches the expected output tm.assert_series_equal(result, expected) From f5c69e9c100664a0de63be13cc9f46dbfb0eb024 Mon Sep 17 00:00:00 2001 From: Petroncini <59212480+Petroncini@users.noreply.github.com> Date: Tue, 1 Oct 2024 11:28:39 -0300 Subject: [PATCH 11/11] Update v3.0.0.rst --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index a6f12d98054f7..6ebb51cd3ef89 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -652,7 +652,7 @@ Plotting 
Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Bug in :meth:`.DataFrameGroupBy.__len__` and :meth:`.SeriesGroupBy.__len__` would raise when the grouping contained NA values and ``dropna=False`` (:issue:`58644`) -- Bug in :meth:`.DataFrameGroupBy.any` that returned True for groups where all Timedelta values are NaT. (:issue:`59782`) +- Bug in :meth:`.DataFrameGroupBy.any` and :meth:`.SeriesGroupBy.any` that returned ``True`` for groups where all Timedelta values are ``NaT``. (:issue:`59712`) - Bug in :meth:`.DataFrameGroupBy.groups` and :meth:`.SeriesGroupby.groups` that would not respect groupby argument ``dropna`` (:issue:`55919`) - Bug in :meth:`.DataFrameGroupBy.median` where nat values gave an incorrect result. (:issue:`57926`) - Bug in :meth:`.DataFrameGroupBy.quantile` when ``interpolation="nearest"`` is inconsistent with :meth:`DataFrame.quantile` (:issue:`47942`)