From dc3c34ceb76ab6a6ff6e1ba3c3dbd80b4bda67db Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 29 Mar 2023 08:52:34 -0700 Subject: [PATCH 1/5] BUG: converting string to numeric in median, mean --- pandas/core/nanops.py | 15 +++++++++- pandas/tests/apply/test_invalid_arg.py | 3 ++ pandas/tests/frame/test_reductions.py | 32 ++++++++++++++++------ pandas/tests/groupby/test_function.py | 6 ++++ pandas/tests/groupby/test_groupby.py | 5 +++- pandas/tests/groupby/test_raises.py | 22 +++++++++++++-- pandas/tests/resample/test_resample_api.py | 4 +-- pandas/tests/series/test_reductions.py | 31 +++++++++++++++++++++ pandas/tests/test_nanops.py | 19 +++++++++---- 9 files changed, 116 insertions(+), 21 deletions(-) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 43e44c7882cca..64d45b2cdb8ef 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -724,7 +724,8 @@ def nanmean( dtype_count = dtype count = _get_counts(values.shape, mask, axis, dtype=dtype_count) - the_sum = _ensure_numeric(values.sum(axis, dtype=dtype_sum)) + the_sum = values.sum(axis, dtype=dtype_sum) + the_sum = _ensure_numeric(the_sum) if axis is not None and getattr(the_sum, "ndim", False): count = cast(np.ndarray, count) @@ -782,6 +783,11 @@ def get_median(x, _mask=None): values, mask, dtype, _, _ = _get_values(values, skipna, mask=mask, fill_value=0) if not is_float_dtype(values.dtype): + if values.dtype == object: + # GH#34671 avoid casting strings to numeric + inferred = lib.infer_dtype(values) + if inferred in ["string", "mixed"]: + raise TypeError(f"Cannot convert {values} to numeric") try: values = values.astype("f8") except ValueError as err: @@ -1682,6 +1688,10 @@ def _ensure_numeric(x): if is_integer_dtype(x) or is_bool_dtype(x): x = x.astype(np.float64) elif is_object_dtype(x): + inferred = lib.infer_dtype(x) + if inferred in ["string", "mixed"]: + # GH#44008, GH#36703 avoid casting e.g. strings to numeric + raise TypeError(f"Could not convert {x} to numeric") try: x = x.astype(np.complex128) except (TypeError, ValueError): @@ -1694,6 +1704,9 @@ def _ensure_numeric(x): if not np.any(np.imag(x)): x = x.real elif not (is_float(x) or is_integer(x) or is_complex(x)): + if isinstance(x, str): + # GH#44008, GH#36703 avoid casting e.g. strings to numeric + raise TypeError(f"Could not convert string '{x}' to numeric") try: x = float(x) except (TypeError, ValueError): diff --git a/pandas/tests/apply/test_invalid_arg.py b/pandas/tests/apply/test_invalid_arg.py index 0207391c3070a..616cb76ca4e75 100644 --- a/pandas/tests/apply/test_invalid_arg.py +++ b/pandas/tests/apply/test_invalid_arg.py @@ -258,6 +258,9 @@ def test_agg_cython_table_raises_frame(df, func, expected, axis): def test_agg_cython_table_raises_series(series, func, expected): # GH21224 msg = r"[Cc]ould not convert|can't multiply sequence by non-int of type" + if func == "median" or func is np.nanmedian or func is np.median: + msg = r"Cannot convert \['a' 'b' 'c'\] to numeric" + with pytest.raises(expected, match=msg): # e.g. Series('a b'.split()).cumprod() will raise series.agg(func) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 28809e2ecb788..d89965294d395 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -170,15 +170,24 @@ def test_stat_op_api_float_string_frame(self, float_string_frame, axis, opname): ): getattr(float_string_frame, opname)(axis=axis) else: - msg = "|".join( - [ - "Could not convert", - "could not convert", - "can't multiply sequence by non-int", - "unsupported operand type", - "not supported between instances of", - ] - ) + if opname in ["var", "std", "sem", "skew", "kurt"]: + msg = "could not convert string to float: 'bar'" + elif opname == "product": + if axis == 1: + msg = "can't multiply sequence by non-int of type 'float'" + else: + msg = "can't multiply sequence by non-int of type 'str'" + elif opname == "sum": + msg = r"unsupported operand type\(s\) for \+: 'float' and 'str'" + elif opname == "mean": + if axis == 0: + msg = r"Could not convert \['.*'\] to numeric" + else: + msg = r"unsupported operand type\(s\) for \+: 'float' and 'str'" + elif opname in ["min", "max"]: + msg = "'[><]=' not supported between instances of 'float' and 'str'" + elif opname == "median": + msg = re.compile(r"Cannot convert \[.*\] to numeric", flags=re.S) with pytest.raises(TypeError, match=msg): getattr(float_string_frame, opname)(axis=axis) if opname != "nunique": @@ -1724,5 +1733,10 @@ def test_fails_on_non_numeric(kernel): "argument must be a string or a real number", ] ) + if kernel == "median": + msg = ( + r"Cannot convert \[\[ " + r"\]\] to numeric" + ) with pytest.raises(TypeError, match=msg): getattr(df, kernel)(*args) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 69f68a8e95810..197e25c68ae8f 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -260,6 +260,8 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): "can't multiply sequence by non-int of type 'str'", ] ) + if method == "median": + msg = r"Cannot convert \['a' 'b'\] to numeric" with pytest.raises(exception, match=msg): getattr(gb, method)() else: @@ -277,6 +279,8 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): f"Cannot perform {method} with non-ordered Categorical", ] ) + if method == "median": + msg = r"Cannot convert \['a' 'b'\] to numeric" with pytest.raises(exception, match=msg): getattr(gb, method)(numeric_only=False) else: @@ -1465,6 +1469,8 @@ def test_numeric_only(kernel, has_arg, numeric_only, keys): "function is not implemented for this dtype", ] ) + if kernel == "median": + msg = r"Cannot convert \[ \] to numeric" with pytest.raises(exception, match=msg): method(*args, **kwargs) elif not has_arg and numeric_only is not lib.no_default: diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index c4c7bee2970d0..90548ab1c4407 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -628,7 +628,8 @@ def test_frame_multi_key_function_list_partial_failure(): grouped = data.groupby(["A", "B"]) funcs = [np.mean, np.std] - with pytest.raises(TypeError, match="Could not convert dullshinyshiny to numeric"): + msg = "Could not convert string 'dullshinyshiny' to numeric" + with pytest.raises(TypeError, match=msg): grouped.agg(funcs) @@ -945,6 +946,8 @@ def test_omit_nuisance_agg(df, agg_function, numeric_only): # columns when numeric_only is False klass = ValueError if agg_function in ("std", "sem") else TypeError msg = "|".join(["[C|c]ould not convert", "can't multiply sequence"]) + if agg_function == "median": + msg = r"Cannot convert \['one' 'three' 'two'\] to numeric" with pytest.raises(klass, match=msg): getattr(grouped, agg_function)(numeric_only=numeric_only) else: diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index c1403fc68c25c..a5ea683465e8e 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -147,8 +147,21 @@ def test_groupby_raises_string( "idxmin": (TypeError, "'argmin' not allowed for this dtype"), "last": (None, ""), "max": (None, ""), - "mean": (TypeError, "Could not convert xy?z?w?t?y?u?i?o? to numeric"), - "median": (TypeError, "could not convert string to float"), + "mean": ( + TypeError, + "Could not convert string '(xy|xyzwt|xyz|xztuo)' to numeric", + ), + "median": ( + TypeError, + "|".join( + [ + r"Cannot convert \['x' 'y' 'z'\] to numeric", + r"Cannot convert \['x' 'y'\] to numeric", + r"Cannot convert \['x' 'y' 'z' 'w' 't'\] to numeric", + r"Cannot convert \['x' 'z' 't' 'u' 'o'\] to numeric", + ] + ), + ), "min": (None, ""), "ngroup": (None, ""), "nunique": (None, ""), @@ -197,7 +210,10 @@ def test_groupby_raises_string_np( klass, msg = { np.sum: (None, ""), - np.mean: (TypeError, "Could not convert xy?z?w?t?y?u?i?o? to numeric"), + np.mean: ( + TypeError, + "Could not convert string '(xyzwt|xy|xyz|xztuo)' to numeric", + ), }[groupby_func_np] _call_and_check(klass, msg, how, gb, groupby_func_np, tuple()) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 1a0a5dfe213bd..41a52726d5caa 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -855,8 +855,8 @@ def test_end_and_end_day_origin( ("mean", False, "Could not convert"), ("mean", lib.no_default, "Could not convert"), ("median", True, {"num": [12.5]}), - ("median", False, "could not convert"), - ("median", lib.no_default, "could not convert"), + ("median", False, r"Cannot convert \['cat_1' 'cat_2'\] to numeric"), + ("median", lib.no_default, r"Cannot convert \['cat_1' 'cat_2'\] to numeric"), ("std", True, {"num": [10.606601717798213]}), ("std", False, "could not convert string to float"), ("std", lib.no_default, "could not convert string to float"), diff --git a/pandas/tests/series/test_reductions.py b/pandas/tests/series/test_reductions.py index eb11b62a651cc..a055184edcfbb 100644 --- a/pandas/tests/series/test_reductions.py +++ b/pandas/tests/series/test_reductions.py @@ -129,3 +129,34 @@ def test_validate_stat_keepdims(): ) with pytest.raises(ValueError, match=msg): np.sum(ser, keepdims=True) + + +def test_mean_with_convertible_string_raises(): + # GH#44008 + ser = Series(["1", "2"]) + assert ser.sum() == "12" + msg = "Could not convert string '12' to numeric" + with pytest.raises(TypeError, match=msg): + ser.mean() + + df = ser.to_frame() + msg = r"Could not convert \['12'\] to numeric" + with pytest.raises(TypeError, match=msg): + df.mean() + + +def test_mean_dont_convert_j_to_complex(): + # GH#36703 + df = pd.DataFrame([{"db": "J", "numeric": 123}]) + msg = r"Could not convert \['J'\] to numeric" + with pytest.raises(TypeError, match=msg): + df.mean() + + with pytest.raises(TypeError, match=msg): + df.agg("mean") + + msg = "Could not convert string 'J' to numeric" + with pytest.raises(TypeError, match=msg): + df["db"].mean() + with pytest.raises(TypeError, match=msg): + np.mean(df["db"].astype("string").array) diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index ba21ea4e7db95..0b13d62d81440 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -898,7 +898,9 @@ def test_ndarray(self): # Test convertible string ndarray s_values = np.array(["1", "2", "3"], dtype=object) - assert np.allclose(nanops._ensure_numeric(s_values), values) + msg = r"Could not convert \['1' '2' '3'\] to numeric" + with pytest.raises(TypeError, match=msg): + nanops._ensure_numeric(s_values) # Test non-convertible string ndarray s_values = np.array(["foo", "bar", "baz"], dtype=object) @@ -907,12 +909,19 @@ def test_ndarray(self): nanops._ensure_numeric(s_values) def test_convertable_values(self): - assert np.allclose(nanops._ensure_numeric("1"), 1.0) - assert np.allclose(nanops._ensure_numeric("1.1"), 1.1) - assert np.allclose(nanops._ensure_numeric("1+1j"), 1 + 1j) + with pytest.raises(TypeError, match="Could not convert string '1' to numeric"): + nanops._ensure_numeric("1") + with pytest.raises( + TypeError, match="Could not convert string '1.1' to numeric" + ): + nanops._ensure_numeric("1.1") + with pytest.raises( + TypeError, match=r"Could not convert string '1\+1j' to numeric" + ): + nanops._ensure_numeric("1+1j") def test_non_convertable_values(self): - msg = "Could not convert foo to numeric" + msg = "Could not convert string 'foo' to numeric" with pytest.raises(TypeError, match=msg): nanops._ensure_numeric("foo") From 76611fe42050a87a8ab3e38f6e444de51921762f Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 29 Mar 2023 09:00:04 -0700 Subject: [PATCH 2/5] whatsnew, median test --- doc/source/whatsnew/v2.1.0.rst | 3 ++- pandas/tests/series/test_reductions.py | 13 +++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index bac567b537edc..f96d2e2701a5a 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -172,7 +172,8 @@ Timezones Numeric ^^^^^^^ - Bug in :meth:`Series.corr` and :meth:`Series.cov` raising ``AttributeError`` for masked dtypes (:issue:`51422`) -- +- Bug in :meth:`Series.mean`, :meth:`DataFrame.mean` with object-dtype values containing strings that can be converted to numbers (e.g. "2") returning incorrect numeric results (:issue:`36703`, :issue:`44008`) +- Bug in :meth:`Series.median` and :meth:`DataFrame.median` with object-dtype values containing strings that can be converted to numbers (e.g. "2") returning incorrect numeric results (:issue:`34671`) Conversion ^^^^^^^^^^ diff --git a/pandas/tests/series/test_reductions.py b/pandas/tests/series/test_reductions.py index a055184edcfbb..b7329d843c654 100644 --- a/pandas/tests/series/test_reductions.py +++ b/pandas/tests/series/test_reductions.py @@ -160,3 +160,16 @@ def test_mean_dont_convert_j_to_complex(): df["db"].mean() with pytest.raises(TypeError, match=msg): np.mean(df["db"].astype("string").array) + + +def test_median_with_convertible_string_raises(): + # GH#34671 this _could_ return a string "2", but definitely not float 2.0 + msg = r"Cannot convert \['1' '2' '3'\] to numeric" + ser = Series(["1", "2", "3"]) + with pytest.raises(TypeError, match=msg): + ser.median() + + msg = r"Cannot convert \[\['1' '2' '3'\]\] to numeric" + df = ser.to_frame() + with pytest.raises(TypeError, match=msg): + df.median() From 6e0ab8ad3cdbb023fe175c3fe650c5a38c71a353 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 29 Mar 2023 18:18:11 -0700 Subject: [PATCH 3/5] troubleshoot builds --- pandas/tests/frame/test_reductions.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index d89965294d395..f14c4a45201fb 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -181,7 +181,13 @@ def test_stat_op_api_float_string_frame(self, float_string_frame, axis, opname): msg = r"unsupported operand type\(s\) for \+: 'float' and 'str'" elif opname == "mean": if axis == 0: - msg = r"Could not convert \['.*'\] to numeric" + # different message on different builds + msg = "|".join( + [ + r"Could not convert \['.*'\] to numeric", + "Could not convert string '(bar){30}' to numeric", + ] + ) else: msg = r"unsupported operand type\(s\) for \+: 'float' and 'str'" elif opname in ["min", "max"]: @@ -1734,9 +1740,15 @@ def test_fails_on_non_numeric(kernel): ] ) if kernel == "median": - msg = ( + # slightly different message on different builds + msg1 = ( r"Cannot convert \[\[ " r"\]\] to numeric" ) + msg2 = ( + r"Cannot convert \[ " + r"\] to numeric" + ) + msg = "|".join([msg1, msg2]) with pytest.raises(TypeError, match=msg): getattr(df, kernel)(*args) From 670375f965d176d987a69697e8da4c32e1b3b13f Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 30 Mar 2023 08:26:01 -0700 Subject: [PATCH 4/5] fix arraymanager build --- pandas/tests/series/test_reductions.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/pandas/tests/series/test_reductions.py b/pandas/tests/series/test_reductions.py index b7329d843c654..0152303a7269a 100644 --- a/pandas/tests/series/test_reductions.py +++ b/pandas/tests/series/test_reductions.py @@ -131,7 +131,7 @@ def test_validate_stat_keepdims(): np.sum(ser, keepdims=True) -def test_mean_with_convertible_string_raises(): +def test_mean_with_convertible_string_raises(using_array_manager): # GH#44008 ser = Series(["1", "2"]) assert ser.sum() == "12" @@ -140,15 +140,19 @@ def test_mean_with_convertible_string_raises(): ser.mean() df = ser.to_frame() - msg = r"Could not convert \['12'\] to numeric" + if not using_array_manager: + msg = r"Could not convert \['12'\] to numeric" with pytest.raises(TypeError, match=msg): df.mean() -def test_mean_dont_convert_j_to_complex(): +def test_mean_dont_convert_j_to_complex(using_array_manager): # GH#36703 df = pd.DataFrame([{"db": "J", "numeric": 123}]) - msg = r"Could not convert \['J'\] to numeric" + if using_array_manager: + msg = "Could not convert string 'J' to numeric" + else: + msg = r"Could not convert \['J'\] to numeric" with pytest.raises(TypeError, match=msg): df.mean() @@ -162,14 +166,15 @@ def test_mean_dont_convert_j_to_complex(): np.mean(df["db"].astype("string").array) -def test_median_with_convertible_string_raises(): +def test_median_with_convertible_string_raises(using_array_manager): # GH#34671 this _could_ return a string "2", but definitely not float 2.0 msg = r"Cannot convert \['1' '2' '3'\] to numeric" ser = Series(["1", "2", "3"]) with pytest.raises(TypeError, match=msg): ser.median() - msg = r"Cannot convert \[\['1' '2' '3'\]\] to numeric" + if not using_array_manager: + msg = r"Cannot convert \[\['1' '2' '3'\]\] to numeric" df = ser.to_frame() with pytest.raises(TypeError, match=msg): df.median() From a763cb42ad9d38d3e54bdea12f26c0fed7c48313 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 21 Apr 2023 14:53:21 -0700 Subject: [PATCH 5/5] say in whatsnew we raise TypeError --- doc/source/whatsnew/v2.1.0.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index db2d91d93f60b..af1e86021ef19 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -312,10 +312,10 @@ Timezones Numeric ^^^^^^^ - Bug in :meth:`Series.corr` and :meth:`Series.cov` raising ``AttributeError`` for masked dtypes (:issue:`51422`) -- Bug in :meth:`Series.mean`, :meth:`DataFrame.mean` with object-dtype values containing strings that can be converted to numbers (e.g. "2") returning incorrect numeric results (:issue:`36703`, :issue:`44008`) +- Bug in :meth:`Series.mean`, :meth:`DataFrame.mean` with object-dtype values containing strings that can be converted to numbers (e.g. "2") returning incorrect numeric results; these now raise ``TypeError`` (:issue:`36703`, :issue:`44008`) - Bug in :meth:`DataFrame.corrwith` raising ``NotImplementedError`` for pyarrow-backed dtypes (:issue:`52314`) - Bug in :meth:`Series.corr` and :meth:`Series.cov` raising ``AttributeError`` for masked dtypes (:issue:`51422`) -- Bug in :meth:`Series.median` and :meth:`DataFrame.median` with object-dtype values containing strings that can be converted to numbers (e.g. "2") returning incorrect numeric results (:issue:`34671`) +- Bug in :meth:`Series.median` and :meth:`DataFrame.median` with object-dtype values containing strings that can be converted to numbers (e.g. "2") returning incorrect numeric results; these now raise ``TypeError`` (:issue:`34671`) - Conversion