From 095425ab6aff576105f048d4306154f728b14a66 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Tue, 6 Jun 2023 11:03:52 -0700 Subject: [PATCH 1/6] BUG: Fix metadata propagation in reductions --- doc/source/whatsnew/v2.1.0.rst | 1 + pandas/core/frame.py | 29 ++++++++++++++++++--------- pandas/core/generic.py | 10 ++++----- pandas/core/series.py | 2 +- pandas/tests/generic/test_finalize.py | 29 ++++++++++++++++----------- 5 files changed, 43 insertions(+), 28 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 6bb972c21d927..64d89d452749a 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -482,6 +482,7 @@ Styler Metadata ^^^^^^^^ +- Fixed metadata propagation in :meth:`DataFrame.max`,:meth:`DataFrame.min`,:meth:`DataFrame.prod`,:meth:`DataFrame.mean`, :meth:`Series.mode`, :meth:`DataFrame.median`, :meth:`DataFrame.sem`, :meth:`DataFrame.skew`, :meth:`DataFrame.kurt` (:issue:`28283`) - Fixed metadata propagation in :meth:`DataFrame.std` (:issue:`28283`) Other diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1524197938a81..07dbbedaf426b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10970,7 +10970,8 @@ def min( numeric_only: bool = False, **kwargs, ): - return super().min(axis, skipna, numeric_only, **kwargs) + result = super().min(axis, skipna, numeric_only, **kwargs) + return result.__finalize__(self, method="min") @doc(make_doc("max", ndim=2)) def max( @@ -10980,7 +10981,8 @@ def max( numeric_only: bool = False, **kwargs, ): - return super().max(axis, skipna, numeric_only, **kwargs) + result = super().max(axis, skipna, numeric_only, **kwargs) + return result.__finalize__(self, method="max") @doc(make_doc("sum", ndim=2)) def sum( @@ -11003,7 +11005,8 @@ def prod( min_count: int = 0, **kwargs, ): - return super().prod(axis, skipna, numeric_only, min_count, **kwargs) + result = super().prod(axis, skipna, numeric_only, min_count, **kwargs) + return result.__finalize__(self, method="prod") @doc(make_doc("mean", ndim=2)) def mean( @@ -11013,7 +11016,8 @@ def mean( numeric_only: bool = False, **kwargs, ): - return super().mean(axis, skipna, numeric_only, **kwargs) + result = super().mean(axis, skipna, numeric_only, **kwargs) + return result.__finalize__(self, method="mean") @doc(make_doc("median", ndim=2)) def median( @@ -11023,7 +11027,8 @@ def median( numeric_only: bool = False, **kwargs, ): - return super().median(axis, skipna, numeric_only, **kwargs) + result = super().median(axis, skipna, numeric_only, **kwargs) + return result.__finalize__(self, method="median") @doc(make_doc("sem", ndim=2)) def sem( @@ -11034,7 +11039,8 @@ def sem( numeric_only: bool = False, **kwargs, ): - return super().sem(axis, skipna, ddof, numeric_only, **kwargs) + result = super().sem(axis, skipna, ddof, numeric_only, **kwargs) + return result.__finalize__(self, method="sem") @doc(make_doc("var", ndim=2)) def var( @@ -11045,7 +11051,8 @@ def var( numeric_only: bool = False, **kwargs, ): - return super().var(axis, skipna, ddof, numeric_only, **kwargs) + result = super().var(axis, skipna, ddof, numeric_only, **kwargs) + return result.__finalize__(self, method="var") @doc(make_doc("std", ndim=2)) def std( @@ -11056,7 +11063,7 @@ def std( numeric_only: bool = False, **kwargs, ): - result = cast(Series, super().std(axis, skipna, ddof, numeric_only, **kwargs)) + result = super().std(axis, skipna, ddof, numeric_only, **kwargs) return result.__finalize__(self, method="std") @doc(make_doc("skew", ndim=2)) @@ -11067,7 +11074,8 @@ def skew( numeric_only: bool = False, **kwargs, ): - return super().skew(axis, skipna, numeric_only, **kwargs) + result = super().skew(axis, skipna, numeric_only, **kwargs) + return result.__finalize__(self, method="skew") @doc(make_doc("kurt", ndim=2)) def kurt( @@ -11077,7 +11085,8 @@ def kurt( numeric_only: bool = False, **kwargs, ): - return super().kurt(axis, skipna, numeric_only, **kwargs) + result = super().kurt(axis, skipna, numeric_only, **kwargs) + return result.__finalize__(self, method="kurt") kurtosis = kurt product = prod diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 90a0444872ec7..47b6baf114727 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -11330,7 +11330,7 @@ def _logical_func( skipna=skipna, numeric_only=bool_only, filter_type="bool", - ) + ).__finalize__(self) def any( self, @@ -11449,7 +11449,7 @@ def sem( ) -> Series | float: return self._stat_function_ddof( "sem", nanops.nansem, axis, skipna, ddof, numeric_only, **kwargs - ) + ).__finalize__(self, method="max") def var( self, @@ -11461,7 +11461,7 @@ def var( ) -> Series | float: return self._stat_function_ddof( "var", nanops.nanvar, axis, skipna, ddof, numeric_only, **kwargs - ) + ).__finalize__(self, method="var") def std( self, @@ -11473,7 +11473,7 @@ def std( ) -> Series | float: return self._stat_function_ddof( "std", nanops.nanstd, axis, skipna, ddof, numeric_only, **kwargs - ) + ).__finalize__(self, method="std") @final def _stat_function( @@ -11639,7 +11639,7 @@ def prod( numeric_only, min_count, **kwargs, - ) + ).__finalize__(self, method="prod") product = prod diff --git a/pandas/core/series.py b/pandas/core/series.py index 41a32cb60c39f..b3bdd33ae3b68 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2164,7 +2164,7 @@ def mode(self, dropna: bool = True) -> Series: # Ensure index is type stable (should always use int index) return self._constructor( res_values, index=range(len(res_values)), name=self.name, copy=False - ) + ).__finalize__(self, method="mode") def unique(self) -> ArrayLike: # pylint: disable=useless-parent-delegation """ diff --git a/pandas/tests/generic/test_finalize.py b/pandas/tests/generic/test_finalize.py index 9dfa2c8a5a90a..b5773f39efcd5 100644 --- a/pandas/tests/generic/test_finalize.py +++ b/pandas/tests/generic/test_finalize.py @@ -188,10 +188,8 @@ (pd.DataFrame, frame_data, operator.methodcaller("idxmin")), (pd.DataFrame, frame_data, operator.methodcaller("idxmax")), (pd.DataFrame, frame_data, operator.methodcaller("mode")), - pytest.param( - (pd.Series, [0], operator.methodcaller("mode")), - marks=not_implemented_mark, - ), + (pd.Series, [0], operator.methodcaller("mode")), + (pd.DataFrame, frame_data, operator.methodcaller("median")), pytest.param( ( pd.DataFrame, @@ -397,21 +395,28 @@ # Cumulative reductions (pd.Series, ([1],), operator.methodcaller("cumsum")), (pd.DataFrame, frame_data, operator.methodcaller("cumsum")), + (pd.Series, ([1],), operator.methodcaller("cummin")), + (pd.DataFrame, frame_data, operator.methodcaller("cummin")), + (pd.Series, ([1],), operator.methodcaller("cummax")), + (pd.DataFrame, frame_data, operator.methodcaller("cummax")), + (pd.Series, ([1],), operator.methodcaller("cumprod")), + (pd.DataFrame, frame_data, operator.methodcaller("cumprod")), # Reductions - pytest.param( - (pd.DataFrame, frame_data, operator.methodcaller("any")), - marks=not_implemented_mark, - ), + (pd.DataFrame, frame_data, operator.methodcaller("any")), + (pd.DataFrame, frame_data, operator.methodcaller("all")), + (pd.DataFrame, frame_data, operator.methodcaller("min")), + (pd.DataFrame, frame_data, operator.methodcaller("max")), pytest.param( (pd.DataFrame, frame_data, operator.methodcaller("sum")), ), pytest.param( (pd.DataFrame, frame_data, operator.methodcaller("std")), ), - pytest.param( - (pd.DataFrame, frame_data, operator.methodcaller("mean")), - marks=not_implemented_mark, - ), + (pd.DataFrame, frame_data, operator.methodcaller("mean")), + (pd.DataFrame, frame_data, operator.methodcaller("prod")), + (pd.DataFrame, frame_data, operator.methodcaller("sem")), + (pd.DataFrame, frame_data, operator.methodcaller("skew")), + (pd.DataFrame, frame_data, operator.methodcaller("kurt")), ] From 8bc0b4558bcdad331649f91b451840e76cde20ca Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Tue, 6 Jun 2023 12:12:54 -0700 Subject: [PATCH 2/6] fix tests --- pandas/core/frame.py | 6 ++++-- pandas/core/generic.py | 10 +++++----- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 07dbbedaf426b..70025fea7b9ed 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10944,9 +10944,10 @@ def any( # type: ignore[override] ) -> Series: # error: Incompatible return value type (got "Union[Series, bool]", # expected "Series") - return self._logical_func( # type: ignore[return-value] + result = self._logical_func( # type: ignore[return-value] "any", nanops.nanany, axis, bool_only, skipna, **kwargs ) + return result.__finalize__(self, method="any") @doc(make_doc("all", ndim=2)) def all( @@ -10958,9 +10959,10 @@ def all( ) -> Series: # error: Incompatible return value type (got "Union[Series, bool]", # expected "Series") - return self._logical_func( # type: ignore[return-value] + result = self._logical_func( # type: ignore[return-value] "all", nanops.nanall, axis, bool_only, skipna, **kwargs ) + return result.__finalize__(self, method="all") @doc(make_doc("min", ndim=2)) def min( diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 47b6baf114727..90a0444872ec7 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -11330,7 +11330,7 @@ def _logical_func( skipna=skipna, numeric_only=bool_only, filter_type="bool", - ).__finalize__(self) + ) def any( self, @@ -11449,7 +11449,7 @@ def sem( ) -> Series | float: return self._stat_function_ddof( "sem", nanops.nansem, axis, skipna, ddof, numeric_only, **kwargs - ).__finalize__(self, method="max") + ) def var( self, @@ -11461,7 +11461,7 @@ def var( ) -> Series | float: return self._stat_function_ddof( "var", nanops.nanvar, axis, skipna, ddof, numeric_only, **kwargs - ).__finalize__(self, method="var") + ) def std( self, @@ -11473,7 +11473,7 @@ def std( ) -> Series | float: return self._stat_function_ddof( "std", nanops.nanstd, axis, skipna, ddof, numeric_only, **kwargs - ).__finalize__(self, method="std") + ) @final def _stat_function( @@ -11639,7 +11639,7 @@ def prod( numeric_only, min_count, **kwargs, - ).__finalize__(self, method="prod") + ) product = prod From ad14f9a7d004d27f2185554826ed749e7b60c7e9 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Tue, 6 Jun 2023 13:01:03 -0700 Subject: [PATCH 3/6] actually fix tests --- doc/source/whatsnew/v2.1.0.rst | 2 +- pandas/core/frame.py | 45 +++++++++++++++++++++------------- 2 files changed, 29 insertions(+), 18 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 64d89d452749a..b139b2e50e2de 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -482,7 +482,7 @@ Styler Metadata ^^^^^^^^ -- Fixed metadata propagation in :meth:`DataFrame.max`,:meth:`DataFrame.min`,:meth:`DataFrame.prod`,:meth:`DataFrame.mean`, :meth:`Series.mode`, :meth:`DataFrame.median`, :meth:`DataFrame.sem`, :meth:`DataFrame.skew`, :meth:`DataFrame.kurt` (:issue:`28283`) +- Fixed metadata propagation in :meth:`DataFrame.max`, :meth:`DataFrame.min`, :meth:`DataFrame.prod`, :meth:`DataFrame.mean`, :meth:`Series.mode`, :meth:`DataFrame.median`, :meth:`DataFrame.sem`, :meth:`DataFrame.skew`, :meth:`DataFrame.kurt` (:issue:`28283`) - Fixed metadata propagation in :meth:`DataFrame.std` (:issue:`28283`) Other diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 70025fea7b9ed..83370796a4573 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10933,21 +10933,20 @@ def _reduce_axis1(self, name: str, func, skipna: bool) -> Series: return res_ser @doc(make_doc("any", ndim=2)) - # error: Signature of "any" incompatible with supertype "NDFrame" - def any( # type: ignore[override] + def any( self, *, axis: Axis = 0, bool_only: bool = False, skipna: bool = True, **kwargs, - ) -> Series: - # error: Incompatible return value type (got "Union[Series, bool]", - # expected "Series") + ) -> Series | bool: result = self._logical_func( # type: ignore[return-value] "any", nanops.nanany, axis, bool_only, skipna, **kwargs ) - return result.__finalize__(self, method="any") + if isinstance(result, Series): + result = result.__finalize__(self, method="any") + return result @doc(make_doc("all", ndim=2)) def all( @@ -10956,13 +10955,13 @@ def all( bool_only: bool = False, skipna: bool = True, **kwargs, - ) -> Series: - # error: Incompatible return value type (got "Union[Series, bool]", - # expected "Series") - result = self._logical_func( # type: ignore[return-value] + ) -> Series | bool: + result = self._logical_func( "all", nanops.nanall, axis, bool_only, skipna, **kwargs ) - return result.__finalize__(self, method="all") + if isinstance(result, Series): + result = result.__finalize__(self, method="all") + return result @doc(make_doc("min", ndim=2)) def min( @@ -10973,7 +10972,9 @@ def min( **kwargs, ): result = super().min(axis, skipna, numeric_only, **kwargs) - return result.__finalize__(self, method="min") + if isinstance(result, Series): + result = result.__finalize__(self, method="min") + return result @doc(make_doc("max", ndim=2)) def max( @@ -10984,7 +10985,9 @@ def max( **kwargs, ): result = super().max(axis, skipna, numeric_only, **kwargs) - return result.__finalize__(self, method="max") + if isinstance(result, Series): + result = result.__finalize__(self, method="max") + return result @doc(make_doc("sum", ndim=2)) def sum( @@ -11019,7 +11022,9 @@ def mean( **kwargs, ): result = super().mean(axis, skipna, numeric_only, **kwargs) - return result.__finalize__(self, method="mean") + if isinstance(result, Series): + result = result.__finalize__(self, method="mean") + return result @doc(make_doc("median", ndim=2)) def median( @@ -11030,7 +11035,9 @@ def median( **kwargs, ): result = super().median(axis, skipna, numeric_only, **kwargs) - return result.__finalize__(self, method="median") + if isinstance(result, Series): + result = result.__finalize__(self, method="median") + return result @doc(make_doc("sem", ndim=2)) def sem( @@ -11077,7 +11084,9 @@ def skew( **kwargs, ): result = super().skew(axis, skipna, numeric_only, **kwargs) - return result.__finalize__(self, method="skew") + if isinstance(result, Series): + result = result.__finalize__(self, method="skew") + return result @doc(make_doc("kurt", ndim=2)) def kurt( @@ -11088,7 +11097,9 @@ def kurt( **kwargs, ): result = super().kurt(axis, skipna, numeric_only, **kwargs) - return result.__finalize__(self, method="kurt") + if isinstance(result, Series): + result = result.__finalize__(self, method="kurt") + return result kurtosis = kurt product = prod From f51e7f429eba763db39cf5e1296eac8a41308307 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Wed, 7 Jun 2023 17:19:18 -0700 Subject: [PATCH 4/6] fix typing --- pandas/core/frame.py | 17 ++++++++++++----- pandas/core/reshape/encoding.py | 7 +++++-- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f969c06d58010..71b2b948a7ab8 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10932,7 +10932,8 @@ def _reduce_axis1(self, name: str, func, skipna: bool) -> Series: return res_ser @doc(make_doc("any", ndim=2)) - def any( + # error: Signature of "any" incompatible with supertype "NDFrame" + def any( # type: ignore[override] self, *, axis: Axis = 0, @@ -10940,7 +10941,7 @@ def any( skipna: bool = True, **kwargs, ) -> Series | bool: - result = self._logical_func( # type: ignore[return-value] + result = self._logical_func( "any", nanops.nanany, axis, bool_only, skipna, **kwargs ) if isinstance(result, Series): @@ -11048,7 +11049,9 @@ def sem( **kwargs, ): result = super().sem(axis, skipna, ddof, numeric_only, **kwargs) - return result.__finalize__(self, method="sem") + if isinstance(result, Series): + result = result.__finalize__(self, method="sem") + return result @doc(make_doc("var", ndim=2)) def var( @@ -11060,7 +11063,9 @@ def var( **kwargs, ): result = super().var(axis, skipna, ddof, numeric_only, **kwargs) - return result.__finalize__(self, method="var") + if isinstance(result, Series): + result = result.__finalize__(self, method="var") + return result @doc(make_doc("std", ndim=2)) def std( @@ -11072,7 +11077,9 @@ def std( **kwargs, ): result = super().std(axis, skipna, ddof, numeric_only, **kwargs) - return result.__finalize__(self, method="std") + if isinstance(result, Series): + result = result.__finalize__(self, method="std") + return result @doc(make_doc("skew", ndim=2)) def skew( diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index 58209c357b65d..98eb8c05761a0 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -6,6 +6,7 @@ TYPE_CHECKING, Hashable, Iterable, + cast, ) import numpy as np @@ -455,10 +456,12 @@ def from_dummies( f"Received 'data' of type: {type(data).__name__}" ) - if data.isna().any().any(): + col_isna_mask = cast(Series, data.isna().any()) + + if col_isna_mask.any(): raise ValueError( "Dummy DataFrame contains NA value in column: " - f"'{data.isna().any().idxmax()}'" + f"'{col_isna_mask.idxmax()}'" ) # index data with a list of all columns that are dummies From 179694ac07dee1758f47f163e8651ffecfbc1414 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 31 Jul 2023 16:19:31 -0700 Subject: [PATCH 5/6] Update pandas/core/reshape/encoding.py --- pandas/core/reshape/encoding.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index d5590a6696a60..7208c82173ab6 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -4,10 +4,13 @@ from collections.abc import ( Hashable, Iterable, - cast, + ) import itertools -from typing import TYPE_CHECKING +from typing import ( + TYPE_CHECKING, + cast, +) import numpy as np From cfe17bbd6f4cda792e1c6c77e7f8a15e3a1c8141 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 1 Aug 2023 00:51:31 +0000 Subject: [PATCH 6/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pandas/core/reshape/encoding.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index 7208c82173ab6..e30881e1a79c6 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -4,7 +4,6 @@ from collections.abc import ( Hashable, Iterable, - ) import itertools from typing import (