diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index efc8bc695df85..af1e86021ef19 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -311,8 +311,10 @@ Timezones Numeric ^^^^^^^ +- Bug in :meth:`Series.mean`, :meth:`DataFrame.mean` with object-dtype values containing strings that can be converted to numbers (e.g. "2") returning incorrect numeric results; these now raise ``TypeError`` (:issue:`36703`, :issue:`44008`) - Bug in :meth:`DataFrame.corrwith` raising ``NotImplementedError`` for pyarrow-backed dtypes (:issue:`52314`) - Bug in :meth:`Series.corr` and :meth:`Series.cov` raising ``AttributeError`` for masked dtypes (:issue:`51422`) +- Bug in :meth:`Series.median` and :meth:`DataFrame.median` with object-dtype values containing strings that can be converted to numbers (e.g. "2") returning incorrect numeric results; these now raise ``TypeError`` (:issue:`34671`) - Conversion diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index ddec07c8bf890..8fddc8461dfbe 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -716,7 +716,8 @@ def nanmean( dtype_count = dtype count = _get_counts(values.shape, mask, axis, dtype=dtype_count) - the_sum = _ensure_numeric(values.sum(axis, dtype=dtype_sum)) + the_sum = values.sum(axis, dtype=dtype_sum) + the_sum = _ensure_numeric(the_sum) if axis is not None and getattr(the_sum, "ndim", False): count = cast(np.ndarray, count) @@ -775,6 +776,11 @@ def get_median(x, _mask=None): dtype = values.dtype values, mask = _get_values(values, skipna, mask=mask, fill_value=0) if values.dtype.kind != "f": + if values.dtype == object: + # GH#34671 avoid casting strings to numeric + inferred = lib.infer_dtype(values) + if inferred in ["string", "mixed"]: + raise TypeError(f"Cannot convert {values} to numeric") try: values = values.astype("f8") except 
ValueError as err: @@ -1659,6 +1665,10 @@ def _ensure_numeric(x): if x.dtype.kind in "biu": x = x.astype(np.float64) elif x.dtype == object: + inferred = lib.infer_dtype(x) + if inferred in ["string", "mixed"]: + # GH#44008, GH#36703 avoid casting e.g. strings to numeric + raise TypeError(f"Could not convert {x} to numeric") try: x = x.astype(np.complex128) except (TypeError, ValueError): @@ -1671,6 +1681,9 @@ def _ensure_numeric(x): if not np.any(np.imag(x)): x = x.real elif not (is_float(x) or is_integer(x) or is_complex(x)): + if isinstance(x, str): + # GH#44008, GH#36703 avoid casting e.g. strings to numeric + raise TypeError(f"Could not convert string '{x}' to numeric") try: x = float(x) except (TypeError, ValueError): diff --git a/pandas/tests/apply/test_invalid_arg.py b/pandas/tests/apply/test_invalid_arg.py index 5995b78d4bea5..d75b784302676 100644 --- a/pandas/tests/apply/test_invalid_arg.py +++ b/pandas/tests/apply/test_invalid_arg.py @@ -244,6 +244,9 @@ def test_agg_cython_table_raises_frame(df, func, expected, axis): def test_agg_cython_table_raises_series(series, func, expected): # GH21224 msg = r"[Cc]ould not convert|can't multiply sequence by non-int of type" + if func == "median" or func is np.nanmedian or func is np.median: + msg = r"Cannot convert \['a' 'b' 'c'\] to numeric" + with pytest.raises(expected, match=msg): # e.g. 
Series('a b'.split()).cumprod() will raise series.agg(func) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 0d352b8e34f37..096f6fe83ea88 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -169,15 +169,30 @@ def test_stat_op_api_float_string_frame(self, float_string_frame, axis, opname): ): getattr(float_string_frame, opname)(axis=axis) else: - msg = "|".join( - [ - "Could not convert", - "could not convert", - "can't multiply sequence by non-int", - "unsupported operand type", - "not supported between instances of", - ] - ) + if opname in ["var", "std", "sem", "skew", "kurt"]: + msg = "could not convert string to float: 'bar'" + elif opname == "product": + if axis == 1: + msg = "can't multiply sequence by non-int of type 'float'" + else: + msg = "can't multiply sequence by non-int of type 'str'" + elif opname == "sum": + msg = r"unsupported operand type\(s\) for \+: 'float' and 'str'" + elif opname == "mean": + if axis == 0: + # different message on different builds + msg = "|".join( + [ + r"Could not convert \['.*'\] to numeric", + "Could not convert string '(bar){30}' to numeric", + ] + ) + else: + msg = r"unsupported operand type\(s\) for \+: 'float' and 'str'" + elif opname in ["min", "max"]: + msg = "'[><]=' not supported between instances of 'float' and 'str'" + elif opname == "median": + msg = re.compile(r"Cannot convert \[.*\] to numeric", flags=re.S) with pytest.raises(TypeError, match=msg): getattr(float_string_frame, opname)(axis=axis) if opname != "nunique": @@ -1759,5 +1774,16 @@ def test_fails_on_non_numeric(kernel): "argument must be a string or a real number", ] ) + if kernel == "median": + # slightly different message on different builds + msg1 = ( + r"Cannot convert \[\[ " + r"\]\] to numeric" + ) + msg2 = ( + r"Cannot convert \[ " + r"\] to numeric" + ) + msg = "|".join([msg1, msg2]) with pytest.raises(TypeError, match=msg): getattr(df, kernel)(*args) 
diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index ac192f190962d..838bfc6a76497 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -262,6 +262,8 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): "can't multiply sequence by non-int of type 'str'", ] ) + if method == "median": + msg = r"Cannot convert \['a' 'b'\] to numeric" with pytest.raises(exception, match=msg): getattr(gb, method)() else: @@ -279,6 +281,8 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): f"Cannot perform {method} with non-ordered Categorical", ] ) + if method == "median": + msg = r"Cannot convert \['a' 'b'\] to numeric" with pytest.raises(exception, match=msg): getattr(gb, method)(numeric_only=False) else: @@ -1467,6 +1471,8 @@ def test_numeric_only(kernel, has_arg, numeric_only, keys): "function is not implemented for this dtype", ] ) + if kernel == "median": + msg = r"Cannot convert \[ \] to numeric" with pytest.raises(exception, match=msg): method(*args, **kwargs) elif not has_arg and numeric_only is not lib.no_default: diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index b5b13d6b10511..5fd1d84219167 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -654,7 +654,8 @@ def test_frame_multi_key_function_list_partial_failure(): grouped = data.groupby(["A", "B"]) funcs = [np.mean, np.std] - with pytest.raises(TypeError, match="Could not convert dullshinyshiny to numeric"): + msg = "Could not convert string 'dullshinyshiny' to numeric" + with pytest.raises(TypeError, match=msg): grouped.agg(funcs) @@ -973,6 +974,8 @@ def test_omit_nuisance_agg(df, agg_function, numeric_only): # columns when numeric_only is False klass = ValueError if agg_function in ("std", "sem") else TypeError msg = "|".join(["[C|c]ould not convert", "can't multiply sequence"]) + if agg_function == 
"median": + msg = r"Cannot convert \['one' 'three' 'two'\] to numeric" with pytest.raises(klass, match=msg): getattr(grouped, agg_function)(numeric_only=numeric_only) else: diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index 9b3c7543def68..55a6bc37d6046 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -147,8 +147,21 @@ def test_groupby_raises_string( "idxmin": (TypeError, "'argmin' not allowed for this dtype"), "last": (None, ""), "max": (None, ""), - "mean": (TypeError, "Could not convert xy?z?w?t?y?u?i?o? to numeric"), - "median": (TypeError, "could not convert string to float"), + "mean": ( + TypeError, + "Could not convert string '(xy|xyzwt|xyz|xztuo)' to numeric", + ), + "median": ( + TypeError, + "|".join( + [ + r"Cannot convert \['x' 'y' 'z'\] to numeric", + r"Cannot convert \['x' 'y'\] to numeric", + r"Cannot convert \['x' 'y' 'z' 'w' 't'\] to numeric", + r"Cannot convert \['x' 'z' 't' 'u' 'o'\] to numeric", + ] + ), + ), "min": (None, ""), "ngroup": (None, ""), "nunique": (None, ""), @@ -197,7 +210,10 @@ def test_groupby_raises_string_np( klass, msg = { np.sum: (None, ""), - np.mean: (TypeError, "Could not convert xy?z?w?t?y?u?i?o? 
to numeric"), + np.mean: ( + TypeError, + "Could not convert string '(xyzwt|xy|xyz|xztuo)' to numeric", + ), }[groupby_func_np] _call_and_check(klass, msg, how, gb, groupby_func_np, tuple()) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 4b86a25f9587d..6a22faa623f69 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -857,8 +857,8 @@ def test_end_and_end_day_origin( ("mean", False, "Could not convert"), ("mean", lib.no_default, "Could not convert"), ("median", True, {"num": [12.5]}), - ("median", False, "could not convert"), - ("median", lib.no_default, "could not convert"), + ("median", False, r"Cannot convert \['cat_1' 'cat_2'\] to numeric"), + ("median", lib.no_default, r"Cannot convert \['cat_1' 'cat_2'\] to numeric"), ("std", True, {"num": [10.606601717798213]}), ("std", False, "could not convert string to float"), ("std", lib.no_default, "could not convert string to float"), diff --git a/pandas/tests/series/test_reductions.py b/pandas/tests/series/test_reductions.py index eb11b62a651cc..0152303a7269a 100644 --- a/pandas/tests/series/test_reductions.py +++ b/pandas/tests/series/test_reductions.py @@ -129,3 +129,52 @@ def test_validate_stat_keepdims(): ) with pytest.raises(ValueError, match=msg): np.sum(ser, keepdims=True) + + +def test_mean_with_convertible_string_raises(using_array_manager): + # GH#44008 + ser = Series(["1", "2"]) + assert ser.sum() == "12" + msg = "Could not convert string '12' to numeric" + with pytest.raises(TypeError, match=msg): + ser.mean() + + df = ser.to_frame() + if not using_array_manager: + msg = r"Could not convert \['12'\] to numeric" + with pytest.raises(TypeError, match=msg): + df.mean() + + +def test_mean_dont_convert_j_to_complex(using_array_manager): + # GH#36703 + df = pd.DataFrame([{"db": "J", "numeric": 123}]) + if using_array_manager: + msg = "Could not convert string 'J' to numeric" + else: + msg = r"Could 
not convert \['J'\] to numeric" + with pytest.raises(TypeError, match=msg): + df.mean() + + with pytest.raises(TypeError, match=msg): + df.agg("mean") + + msg = "Could not convert string 'J' to numeric" + with pytest.raises(TypeError, match=msg): + df["db"].mean() + with pytest.raises(TypeError, match=msg): + np.mean(df["db"].astype("string").array) + + +def test_median_with_convertible_string_raises(using_array_manager): + # GH#34671 this _could_ return a string "2", but definitely not float 2.0 + msg = r"Cannot convert \['1' '2' '3'\] to numeric" + ser = Series(["1", "2", "3"]) + with pytest.raises(TypeError, match=msg): + ser.median() + + if not using_array_manager: + msg = r"Cannot convert \[\['1' '2' '3'\]\] to numeric" + df = ser.to_frame() + with pytest.raises(TypeError, match=msg): + df.median() diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index 971535bd7d783..7d258033748b6 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -850,7 +850,9 @@ def test_ndarray(self): # Test convertible string ndarray s_values = np.array(["1", "2", "3"], dtype=object) - assert np.allclose(nanops._ensure_numeric(s_values), values) + msg = r"Could not convert \['1' '2' '3'\] to numeric" + with pytest.raises(TypeError, match=msg): + nanops._ensure_numeric(s_values) # Test non-convertible string ndarray s_values = np.array(["foo", "bar", "baz"], dtype=object) @@ -859,12 +861,19 @@ def test_ndarray(self): nanops._ensure_numeric(s_values) def test_convertable_values(self): - assert np.allclose(nanops._ensure_numeric("1"), 1.0) - assert np.allclose(nanops._ensure_numeric("1.1"), 1.1) - assert np.allclose(nanops._ensure_numeric("1+1j"), 1 + 1j) + with pytest.raises(TypeError, match="Could not convert string '1' to numeric"): + nanops._ensure_numeric("1") + with pytest.raises( + TypeError, match="Could not convert string '1.1' to numeric" + ): + nanops._ensure_numeric("1.1") + with pytest.raises( + TypeError, match=r"Could not convert 
string '1\+1j' to numeric" + ): + nanops._ensure_numeric("1+1j") def test_non_convertable_values(self): - msg = "Could not convert foo to numeric" + msg = "Could not convert string 'foo' to numeric" with pytest.raises(TypeError, match=msg): nanops._ensure_numeric("foo")