From aa2115a2825955637653827a778f013aa22289f0 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 11 Nov 2022 14:03:16 -0800 Subject: [PATCH 1/5] CLN: Fixture reduction --- pandas/conftest.py | 4 ++++ pandas/tests/frame/conftest.py | 22 ---------------------- pandas/tests/frame/test_reductions.py | 6 +++--- 3 files changed, 7 insertions(+), 25 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 308f63a4ebe5c..50a674242e95a 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1036,7 +1036,10 @@ def all_arithmetic_functions(request): return request.param +# TODO: Dedeupe with reduction_functions in frame/conftest +# adds counts, sem _all_numeric_reductions = [ + "count", "sum", "max", "min", @@ -1047,6 +1050,7 @@ def all_arithmetic_functions(request): "median", "kurt", "skew", + "sem", ] diff --git a/pandas/tests/frame/conftest.py b/pandas/tests/frame/conftest.py index 2cfa295d939a8..97cf75acbd629 100644 --- a/pandas/tests/frame/conftest.py +++ b/pandas/tests/frame/conftest.py @@ -259,25 +259,3 @@ def frame_of_index_cols(): } ) return df - - -@pytest.fixture( - params=[ - "any", - "all", - "count", - "sum", - "prod", - "max", - "min", - "mean", - "median", - "skew", - "kurt", - "sem", - "var", - "std", - ] -) -def reduction_functions(request): - return request.param diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index b7474060a7e8a..5e034eaa187d9 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -1429,16 +1429,16 @@ def test_frame_any_with_timedelta(self): tm.assert_series_equal(result, expected) def test_reductions_skipna_none_raises( - self, request, frame_or_series, reduction_functions + self, request, frame_or_series, all_reductions ): - if reduction_functions == "count": + if all_reductions == "count": request.node.add_marker( pytest.mark.xfail(reason="Count does not accept skipna") ) obj = 
frame_or_series([1, 2, 3]) msg = 'For argument "skipna" expected type bool, received type NoneType.' with pytest.raises(ValueError, match=msg): - getattr(obj, reduction_functions)(skipna=None) + getattr(obj, all_reductions)(skipna=None) class TestNuisanceColumns: From d013fa8316e3b7fd5ee9f9f21025462ad11938fd Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 17 Nov 2022 15:21:48 -0800 Subject: [PATCH 2/5] BUG/TST: Include sem & count in all_numeric_reductions --- doc/source/whatsnew/v2.0.0.rst | 2 +- pandas/conftest.py | 2 - pandas/core/arrays/arrow/array.py | 10 ++--- pandas/tests/arrays/boolean/test_reduction.py | 4 +- pandas/tests/extension/base/reduce.py | 10 ++++- .../tests/extension/decimal/test_decimal.py | 7 +++- pandas/tests/extension/test_arrow.py | 38 +++++++++++++------ pandas/tests/extension/test_boolean.py | 8 +++- pandas/tests/extension/test_floating.py | 9 +++-- pandas/tests/extension/test_integer.py | 9 +++-- 10 files changed, 62 insertions(+), 37 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index efb4a572486e3..4274e0957902f 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -636,7 +636,7 @@ Numeric ^^^^^^^ - Bug in :meth:`DataFrame.add` cannot apply ufunc when inputs contain mixed DataFrame type and Series type (:issue:`39853`) - Bug in DataFrame reduction methods (e.g. 
:meth:`DataFrame.sum`) with object dtype, ``axis=1`` and ``numeric_only=False`` would not be coerced to float (:issue:`49551`) -- +- Bug in :meth:`DataFrame.sem` and :meth:`Series.sem` where an erroneous ``TypeError` would always raise when using data backed by an :class:`ArrowDtype` (:issue:`?`) Conversion ^^^^^^^^^^ diff --git a/pandas/conftest.py b/pandas/conftest.py index 85f6872f3eaa6..81a377e498277 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1036,8 +1036,6 @@ def all_arithmetic_functions(request): return request.param -# TODO: Dedeupe with reduction_functions in frame/conftest -# adds counts, sem _all_numeric_reductions = [ "count", "sum", diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 06d91730804ab..fc7899f97cf83 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -765,13 +765,9 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs): """ if name == "sem": - def pyarrow_meth(data, skipna, **kwargs): - numerator = pc.stddev(data, skip_nulls=skipna, **kwargs) - denominator = pc.sqrt_checked( - pc.subtract_checked( - pc.count(self._data, skip_nulls=skipna), kwargs["ddof"] - ) - ) + def pyarrow_meth(data, skip_nulls, **kwargs): + numerator = pc.stddev(data, skip_nulls=skip_nulls, **kwargs) + denominator = pc.sqrt_checked(pc.count(self._data)) return pc.divide_checked(numerator, denominator) else: diff --git a/pandas/tests/arrays/boolean/test_reduction.py b/pandas/tests/arrays/boolean/test_reduction.py index f3807df929f9a..c65a287b4f215 100644 --- a/pandas/tests/arrays/boolean/test_reduction.py +++ b/pandas/tests/arrays/boolean/test_reduction.py @@ -50,9 +50,7 @@ def test_reductions_return_types(dropna, data, all_numeric_reductions): if dropna: s = s.dropna() - if op == "sum": - assert isinstance(getattr(s, op)(), np.int_) - elif op == "prod": + if op in ("sum", "prod", "count"): assert isinstance(getattr(s, op)(), np.int_) elif op in ("min", "max"): assert 
isinstance(getattr(s, op)(), np.bool_) diff --git a/pandas/tests/extension/base/reduce.py b/pandas/tests/extension/base/reduce.py index e363fda650d52..cf161a7f4b906 100644 --- a/pandas/tests/extension/base/reduce.py +++ b/pandas/tests/extension/base/reduce.py @@ -14,8 +14,14 @@ class BaseReduceTests(BaseExtensionTests): """ def check_reduce(self, s, op_name, skipna): - result = getattr(s, op_name)(skipna=skipna) - expected = getattr(s.astype("float64"), op_name)(skipna=skipna) + res_op = getattr(s, op_name) + exp_op = getattr(s.astype("float64"), op_name) + if op_name == "count": + result = res_op() + expected = exp_op() + else: + result = res_op(skipna=skipna) + expected = exp_op(skipna=skipna) tm.assert_almost_equal(result, expected) diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index acba1bd557351..a49f723ea7a92 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -112,11 +112,14 @@ class TestMissing(base.BaseMissingTests): class Reduce: def check_reduce(self, s, op_name, skipna): - if op_name in ["median", "skew", "kurt"]: + if op_name in ["median", "skew", "kurt", "sem"]: msg = r"decimal does not support the .* operation" with pytest.raises(NotImplementedError, match=msg): getattr(s, op_name)(skipna=skipna) - + elif op_name == "count": + result = getattr(s, op_name)() + expected = len(s) - s.isna().sum() + tm.assert_almost_equal(result, expected) else: result = getattr(s, op_name)(skipna=skipna) expected = getattr(np.asarray(s), op_name)() diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index d094a7731c417..8d854fb570f9f 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -346,7 +346,10 @@ def test_getitem_scalar(self, data): class TestBaseNumericReduce(base.BaseNumericReduceTests): def check_reduce(self, ser, op_name, skipna): pa_dtype = 
ser.dtype.pyarrow_dtype - result = getattr(ser, op_name)(skipna=skipna) + if op_name == "count": + result = getattr(ser, op_name)() + else: + result = getattr(ser, op_name)(skipna=skipna) if pa.types.is_boolean(pa_dtype): # Can't convert if ser contains NA pytest.skip( @@ -354,7 +357,10 @@ def check_reduce(self, ser, op_name, skipna): ) elif pa.types.is_integer(pa_dtype) or pa.types.is_floating(pa_dtype): ser = ser.astype("Float64") - expected = getattr(ser, op_name)(skipna=skipna) + if op_name == "count": + expected = getattr(ser, op_name)() + else: + expected = getattr(ser, op_name)(skipna=skipna) tm.assert_almost_equal(result, expected) @pytest.mark.parametrize("skipna", [True, False]) @@ -389,20 +395,28 @@ def test_reduce_series(self, data, all_numeric_reductions, skipna, request): ), ) ) - elif not ( - pa.types.is_integer(pa_dtype) - or pa.types.is_floating(pa_dtype) - or pa.types.is_boolean(pa_dtype) - ) and not ( - all_numeric_reductions in {"min", "max"} - and ( - (pa.types.is_temporal(pa_dtype) and not pa.types.is_duration(pa_dtype)) - or pa.types.is_string(pa_dtype) - or pa.types.is_binary(pa_dtype) + elif ( + not ( + pa.types.is_integer(pa_dtype) + or pa.types.is_floating(pa_dtype) + or pa.types.is_boolean(pa_dtype) + ) + and not ( + all_numeric_reductions in {"min", "max"} + and ( + ( + pa.types.is_temporal(pa_dtype) + and not pa.types.is_duration(pa_dtype) + ) + or pa.types.is_string(pa_dtype) + or pa.types.is_binary(pa_dtype) + ) ) + and not all_numeric_reductions == "count" ): request.node.add_marker(xfail_mark) elif pa.types.is_boolean(pa_dtype) and all_numeric_reductions in { + "sem", "std", "var", "median", diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index b846028dab947..9646ade43e1d7 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -367,8 +367,12 @@ def test_groupby_sum_mincount(self, data_for_grouping, min_count): class 
TestNumericReduce(base.BaseNumericReduceTests): def check_reduce(self, s, op_name, skipna): - result = getattr(s, op_name)(skipna=skipna) - expected = getattr(s.astype("float64"), op_name)(skipna=skipna) + if op_name == "count": + result = getattr(s, op_name)() + expected = getattr(s.astype("float64"), op_name)() + else: + result = getattr(s, op_name)(skipna=skipna) + expected = getattr(s.astype("float64"), op_name)(skipna=skipna) # override parent function to cast to bool for min/max if np.isnan(expected): expected = pd.NA diff --git a/pandas/tests/extension/test_floating.py b/pandas/tests/extension/test_floating.py index 0d88822009a90..580ab743a9d93 100644 --- a/pandas/tests/extension/test_floating.py +++ b/pandas/tests/extension/test_floating.py @@ -188,13 +188,16 @@ class TestNumericReduce(base.BaseNumericReduceTests): def check_reduce(self, s, op_name, skipna): # overwrite to ensure pd.NA is tested instead of np.nan # https://github.com/pandas-dev/pandas/issues/30958 - result = getattr(s, op_name)(skipna=skipna) - if not skipna and s.isna().any(): - expected = pd.NA + if op_name == "count": + result = getattr(s, op_name)() + expected = getattr(s.dropna().astype(s.dtype.numpy_dtype), op_name)() else: + result = getattr(s, op_name)(skipna=skipna) expected = getattr(s.dropna().astype(s.dtype.numpy_dtype), op_name)( skipna=skipna ) + if not skipna and s.isna().any(): + expected = pd.NA tm.assert_almost_equal(result, expected) diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index a6cf820dc7609..ba6daf4f2e189 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -211,11 +211,14 @@ class TestNumericReduce(base.BaseNumericReduceTests): def check_reduce(self, s, op_name, skipna): # overwrite to ensure pd.NA is tested instead of np.nan # https://github.com/pandas-dev/pandas/issues/30958 - result = getattr(s, op_name)(skipna=skipna) - if not skipna and s.isna().any(): - expected = 
pd.NA + if op_name == "count": + result = getattr(s, op_name)() + expected = getattr(s.dropna().astype("int64"), op_name)() else: + result = getattr(s, op_name)(skipna=skipna) expected = getattr(s.dropna().astype("int64"), op_name)(skipna=skipna) + if not skipna and s.isna().any(): + expected = pd.NA tm.assert_almost_equal(result, expected) From 080cb194eac49e8f1f5f9be27f0e1537c2b4c944 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 17 Nov 2022 18:22:55 -0800 Subject: [PATCH 3/5] Add xfails --- pandas/tests/arrays/boolean/test_reduction.py | 4 +++- pandas/tests/extension/test_arrow.py | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/tests/arrays/boolean/test_reduction.py b/pandas/tests/arrays/boolean/test_reduction.py index c65a287b4f215..b43e5b68b411b 100644 --- a/pandas/tests/arrays/boolean/test_reduction.py +++ b/pandas/tests/arrays/boolean/test_reduction.py @@ -50,8 +50,10 @@ def test_reductions_return_types(dropna, data, all_numeric_reductions): if dropna: s = s.dropna() - if op in ("sum", "prod", "count"): + if op in ("sum", "prod"): assert isinstance(getattr(s, op)(), np.int_) + elif op == "count": + assert isinstance(getattr(s, op)(), np.intp) elif op in ("min", "max"): assert isinstance(getattr(s, op)(), np.bool_) else: diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 8d854fb570f9f..0a23a267b7bbf 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -380,6 +380,8 @@ def test_reduce_series(self, data, all_numeric_reductions, skipna, request): and pa_version_under6p0 ): request.node.add_marker(xfail_mark) + elif all_numeric_reductions == "sem" and pa_version_under8p0: + request.node.add_marker(xfail_mark) elif ( all_numeric_reductions in {"sum", "mean"} and skipna is False From 157c35c83272051daa1374518ea20b6de19f1d8f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke 
<10647082+mroeschke@users.noreply.github.com> Date: Fri, 18 Nov 2022 11:27:00 -0800 Subject: [PATCH 4/5] Make more generic, and fix whatsnew --- doc/source/whatsnew/v2.0.0.rst | 2 +- pandas/tests/arrays/boolean/test_reduction.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 1c5750da08125..ce77e5b8359b7 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -638,7 +638,7 @@ Numeric ^^^^^^^ - Bug in :meth:`DataFrame.add` cannot apply ufunc when inputs contain mixed DataFrame type and Series type (:issue:`39853`) - Bug in DataFrame reduction methods (e.g. :meth:`DataFrame.sum`) with object dtype, ``axis=1`` and ``numeric_only=False`` would not be coerced to float (:issue:`49551`) -- Bug in :meth:`DataFrame.sem` and :meth:`Series.sem` where an erroneous ``TypeError` would always raise when using data backed by an :class:`ArrowDtype` (:issue:`?`) +- Bug in :meth:`DataFrame.sem` and :meth:`Series.sem` where an erroneous ``TypeError`` would always raise when using data backed by an :class:`ArrowDtype` (:issue:`49759`) Conversion ^^^^^^^^^^ diff --git a/pandas/tests/arrays/boolean/test_reduction.py b/pandas/tests/arrays/boolean/test_reduction.py index b43e5b68b411b..52b409ef8bc9a 100644 --- a/pandas/tests/arrays/boolean/test_reduction.py +++ b/pandas/tests/arrays/boolean/test_reduction.py @@ -53,7 +53,8 @@ def test_reductions_return_types(dropna, data, all_numeric_reductions): if op in ("sum", "prod"): assert isinstance(getattr(s, op)(), np.int_) elif op == "count": - assert isinstance(getattr(s, op)(), np.intp) + # Oddly on the 32 bit build (but now Windows), this is intc (!= intp) + assert isinstance(getattr(s, op)(), np.integer) elif op in ("min", "max"): assert isinstance(getattr(s, op)(), np.bool_) else: From 623ad0ef60959fc344da596d96e2d80963918e53 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: 
Mon, 21 Nov 2022 16:48:24 -0800 Subject: [PATCH 5/5] Fix comment typo --- pandas/tests/arrays/boolean/test_reduction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/arrays/boolean/test_reduction.py b/pandas/tests/arrays/boolean/test_reduction.py index 52b409ef8bc9a..dd8c3eda9ed05 100644 --- a/pandas/tests/arrays/boolean/test_reduction.py +++ b/pandas/tests/arrays/boolean/test_reduction.py @@ -53,7 +53,7 @@ def test_reductions_return_types(dropna, data, all_numeric_reductions): if op in ("sum", "prod"): assert isinstance(getattr(s, op)(), np.int_) elif op == "count": - # Oddly on the 32 bit build (but now Windows), this is intc (!= intp) + # Oddly on the 32 bit build (but not Windows), this is intc (!= intp) assert isinstance(getattr(s, op)(), np.integer) elif op in ("min", "max"): assert isinstance(getattr(s, op)(), np.bool_)