From dc3c34ceb76ab6a6ff6e1ba3c3dbd80b4bda67db Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Wed, 29 Mar 2023 08:52:34 -0700
Subject: [PATCH 1/5] BUG: converting string to numeric in median, mean

---
 pandas/core/nanops.py                      | 15 +++++++++-
 pandas/tests/apply/test_invalid_arg.py     |  3 ++
 pandas/tests/frame/test_reductions.py      | 32 ++++++++++++++++------
 pandas/tests/groupby/test_function.py      |  6 ++++
 pandas/tests/groupby/test_groupby.py       |  5 +++-
 pandas/tests/groupby/test_raises.py        | 22 +++++++++++++--
 pandas/tests/resample/test_resample_api.py |  4 +--
 pandas/tests/series/test_reductions.py     | 31 +++++++++++++++++++++
 pandas/tests/test_nanops.py                | 19 +++++++++----
 9 files changed, 116 insertions(+), 21 deletions(-)

diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
index 43e44c7882cca..64d45b2cdb8ef 100644
--- a/pandas/core/nanops.py
+++ b/pandas/core/nanops.py
@@ -724,7 +724,8 @@ def nanmean(
         dtype_count = dtype
 
     count = _get_counts(values.shape, mask, axis, dtype=dtype_count)
-    the_sum = _ensure_numeric(values.sum(axis, dtype=dtype_sum))
+    the_sum = values.sum(axis, dtype=dtype_sum)
+    the_sum = _ensure_numeric(the_sum)
 
     if axis is not None and getattr(the_sum, "ndim", False):
         count = cast(np.ndarray, count)
@@ -782,6 +783,11 @@ def get_median(x, _mask=None):
 
     values, mask, dtype, _, _ = _get_values(values, skipna, mask=mask, fill_value=0)
     if not is_float_dtype(values.dtype):
+        if values.dtype == object:
+            # GH#34671 avoid casting strings to numeric
+            inferred = lib.infer_dtype(values)
+            if inferred in ["string", "mixed"]:
+                raise TypeError(f"Cannot convert {values} to numeric")
         try:
             values = values.astype("f8")
         except ValueError as err:
@@ -1682,6 +1688,10 @@ def _ensure_numeric(x):
         if is_integer_dtype(x) or is_bool_dtype(x):
             x = x.astype(np.float64)
         elif is_object_dtype(x):
+            inferred = lib.infer_dtype(x)
+            if inferred in ["string", "mixed"]:
+                # GH#44008, GH#36703 avoid casting e.g. strings to numeric
+                raise TypeError(f"Could not convert {x} to numeric")
             try:
                 x = x.astype(np.complex128)
             except (TypeError, ValueError):
@@ -1694,6 +1704,9 @@ def _ensure_numeric(x):
                 if not np.any(np.imag(x)):
                     x = x.real
     elif not (is_float(x) or is_integer(x) or is_complex(x)):
+        if isinstance(x, str):
+            # GH#44008, GH#36703 avoid casting e.g. strings to numeric
+            raise TypeError(f"Could not convert string '{x}' to numeric")
         try:
             x = float(x)
         except (TypeError, ValueError):
diff --git a/pandas/tests/apply/test_invalid_arg.py b/pandas/tests/apply/test_invalid_arg.py
index 0207391c3070a..616cb76ca4e75 100644
--- a/pandas/tests/apply/test_invalid_arg.py
+++ b/pandas/tests/apply/test_invalid_arg.py
@@ -258,6 +258,9 @@ def test_agg_cython_table_raises_frame(df, func, expected, axis):
 def test_agg_cython_table_raises_series(series, func, expected):
     # GH21224
     msg = r"[Cc]ould not convert|can't multiply sequence by non-int of type"
+    if func == "median" or func is np.nanmedian or func is np.median:
+        msg = r"Cannot convert \['a' 'b' 'c'\] to numeric"
+
     with pytest.raises(expected, match=msg):
         # e.g. Series('a b'.split()).cumprod() will raise
         series.agg(func)
diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py
index 28809e2ecb788..d89965294d395 100644
--- a/pandas/tests/frame/test_reductions.py
+++ b/pandas/tests/frame/test_reductions.py
@@ -170,15 +170,24 @@ def test_stat_op_api_float_string_frame(self, float_string_frame, axis, opname):
         ):
             getattr(float_string_frame, opname)(axis=axis)
         else:
-            msg = "|".join(
-                [
-                    "Could not convert",
-                    "could not convert",
-                    "can't multiply sequence by non-int",
-                    "unsupported operand type",
-                    "not supported between instances of",
-                ]
-            )
+            if opname in ["var", "std", "sem", "skew", "kurt"]:
+                msg = "could not convert string to float: 'bar'"
+            elif opname == "product":
+                if axis == 1:
+                    msg = "can't multiply sequence by non-int of type 'float'"
+                else:
+                    msg = "can't multiply sequence by non-int of type 'str'"
+            elif opname == "sum":
+                msg = r"unsupported operand type\(s\) for \+: 'float' and 'str'"
+            elif opname == "mean":
+                if axis == 0:
+                    msg = r"Could not convert \['.*'\] to numeric"
+                else:
+                    msg = r"unsupported operand type\(s\) for \+: 'float' and 'str'"
+            elif opname in ["min", "max"]:
+                msg = "'[><]=' not supported between instances of 'float' and 'str'"
+            elif opname == "median":
+                msg = re.compile(r"Cannot convert \[.*\] to numeric", flags=re.S)
             with pytest.raises(TypeError, match=msg):
                 getattr(float_string_frame, opname)(axis=axis)
         if opname != "nunique":
@@ -1724,5 +1733,10 @@ def test_fails_on_non_numeric(kernel):
             "argument must be a string or a real number",
         ]
     )
+    if kernel == "median":
+        msg = (
+            r"Cannot convert \[\[<class 'object'> <class 'object'> "
+            r"<class 'object'>\]\] to numeric"
+        )
     with pytest.raises(TypeError, match=msg):
         getattr(df, kernel)(*args)
diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
index 69f68a8e95810..197e25c68ae8f 100644
--- a/pandas/tests/groupby/test_function.py
+++ b/pandas/tests/groupby/test_function.py
@@ -260,6 +260,8 @@ def _check(self, df, method, expected_columns, expected_columns_numeric):
                     "can't multiply sequence by non-int of type 'str'",
                 ]
             )
+            if method == "median":
+                msg = r"Cannot convert \['a' 'b'\] to numeric"
             with pytest.raises(exception, match=msg):
                 getattr(gb, method)()
         else:
@@ -277,6 +279,8 @@ def _check(self, df, method, expected_columns, expected_columns_numeric):
                     f"Cannot perform {method} with non-ordered Categorical",
                 ]
             )
+            if method == "median":
+                msg = r"Cannot convert \['a' 'b'\] to numeric"
             with pytest.raises(exception, match=msg):
                 getattr(gb, method)(numeric_only=False)
         else:
@@ -1465,6 +1469,8 @@ def test_numeric_only(kernel, has_arg, numeric_only, keys):
                 "function is not implemented for this dtype",
             ]
         )
+        if kernel == "median":
+            msg = r"Cannot convert \[<class 'object'> <class 'object'>\] to numeric"
         with pytest.raises(exception, match=msg):
             method(*args, **kwargs)
     elif not has_arg and numeric_only is not lib.no_default:
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index c4c7bee2970d0..90548ab1c4407 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -628,7 +628,8 @@ def test_frame_multi_key_function_list_partial_failure():
 
     grouped = data.groupby(["A", "B"])
     funcs = [np.mean, np.std]
-    with pytest.raises(TypeError, match="Could not convert dullshinyshiny to numeric"):
+    msg = "Could not convert string 'dullshinyshiny' to numeric"
+    with pytest.raises(TypeError, match=msg):
         grouped.agg(funcs)
 
 
@@ -945,6 +946,8 @@ def test_omit_nuisance_agg(df, agg_function, numeric_only):
         # columns when numeric_only is False
         klass = ValueError if agg_function in ("std", "sem") else TypeError
         msg = "|".join(["[C|c]ould not convert", "can't multiply sequence"])
+        if agg_function == "median":
+            msg = r"Cannot convert \['one' 'three' 'two'\] to numeric"
         with pytest.raises(klass, match=msg):
             getattr(grouped, agg_function)(numeric_only=numeric_only)
     else:
diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py
index c1403fc68c25c..a5ea683465e8e 100644
--- a/pandas/tests/groupby/test_raises.py
+++ b/pandas/tests/groupby/test_raises.py
@@ -147,8 +147,21 @@ def test_groupby_raises_string(
         "idxmin": (TypeError, "'argmin' not allowed for this dtype"),
         "last": (None, ""),
         "max": (None, ""),
-        "mean": (TypeError, "Could not convert xy?z?w?t?y?u?i?o? to numeric"),
-        "median": (TypeError, "could not convert string to float"),
+        "mean": (
+            TypeError,
+            "Could not convert string '(xy|xyzwt|xyz|xztuo)' to numeric",
+        ),
+        "median": (
+            TypeError,
+            "|".join(
+                [
+                    r"Cannot convert \['x' 'y' 'z'\] to numeric",
+                    r"Cannot convert \['x' 'y'\] to numeric",
+                    r"Cannot convert \['x' 'y' 'z' 'w' 't'\] to numeric",
+                    r"Cannot convert \['x' 'z' 't' 'u' 'o'\] to numeric",
+                ]
+            ),
+        ),
         "min": (None, ""),
         "ngroup": (None, ""),
         "nunique": (None, ""),
@@ -197,7 +210,10 @@ def test_groupby_raises_string_np(
 
     klass, msg = {
         np.sum: (None, ""),
-        np.mean: (TypeError, "Could not convert xy?z?w?t?y?u?i?o? to numeric"),
+        np.mean: (
+            TypeError,
+            "Could not convert string '(xyzwt|xy|xyz|xztuo)' to numeric",
+        ),
     }[groupby_func_np]
 
     _call_and_check(klass, msg, how, gb, groupby_func_np, tuple())
diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py
index 1a0a5dfe213bd..41a52726d5caa 100644
--- a/pandas/tests/resample/test_resample_api.py
+++ b/pandas/tests/resample/test_resample_api.py
@@ -855,8 +855,8 @@ def test_end_and_end_day_origin(
         ("mean", False, "Could not convert"),
         ("mean", lib.no_default, "Could not convert"),
         ("median", True, {"num": [12.5]}),
-        ("median", False, "could not convert"),
-        ("median", lib.no_default, "could not convert"),
+        ("median", False, r"Cannot convert \['cat_1' 'cat_2'\] to numeric"),
+        ("median", lib.no_default, r"Cannot convert \['cat_1' 'cat_2'\] to numeric"),
         ("std", True, {"num": [10.606601717798213]}),
         ("std", False, "could not convert string to float"),
         ("std", lib.no_default, "could not convert string to float"),
diff --git a/pandas/tests/series/test_reductions.py b/pandas/tests/series/test_reductions.py
index eb11b62a651cc..a055184edcfbb 100644
--- a/pandas/tests/series/test_reductions.py
+++ b/pandas/tests/series/test_reductions.py
@@ -129,3 +129,34 @@ def test_validate_stat_keepdims():
     )
     with pytest.raises(ValueError, match=msg):
         np.sum(ser, keepdims=True)
+
+
+def test_mean_with_convertible_string_raises():
+    # GH#44008
+    ser = Series(["1", "2"])
+    assert ser.sum() == "12"
+    msg = "Could not convert string '12' to numeric"
+    with pytest.raises(TypeError, match=msg):
+        ser.mean()
+
+    df = ser.to_frame()
+    msg = r"Could not convert \['12'\] to numeric"
+    with pytest.raises(TypeError, match=msg):
+        df.mean()
+
+
+def test_mean_dont_convert_j_to_complex():
+    # GH#36703
+    df = pd.DataFrame([{"db": "J", "numeric": 123}])
+    msg = r"Could not convert \['J'\] to numeric"
+    with pytest.raises(TypeError, match=msg):
+        df.mean()
+
+    with pytest.raises(TypeError, match=msg):
+        df.agg("mean")
+
+    msg = "Could not convert string 'J' to numeric"
+    with pytest.raises(TypeError, match=msg):
+        df["db"].mean()
+    with pytest.raises(TypeError, match=msg):
+        np.mean(df["db"].astype("string").array)
diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py
index ba21ea4e7db95..0b13d62d81440 100644
--- a/pandas/tests/test_nanops.py
+++ b/pandas/tests/test_nanops.py
@@ -898,7 +898,9 @@ def test_ndarray(self):
 
         # Test convertible string ndarray
         s_values = np.array(["1", "2", "3"], dtype=object)
-        assert np.allclose(nanops._ensure_numeric(s_values), values)
+        msg = r"Could not convert \['1' '2' '3'\] to numeric"
+        with pytest.raises(TypeError, match=msg):
+            nanops._ensure_numeric(s_values)
 
         # Test non-convertible string ndarray
         s_values = np.array(["foo", "bar", "baz"], dtype=object)
@@ -907,12 +909,19 @@ def test_ndarray(self):
             nanops._ensure_numeric(s_values)
 
     def test_convertable_values(self):
-        assert np.allclose(nanops._ensure_numeric("1"), 1.0)
-        assert np.allclose(nanops._ensure_numeric("1.1"), 1.1)
-        assert np.allclose(nanops._ensure_numeric("1+1j"), 1 + 1j)
+        with pytest.raises(TypeError, match="Could not convert string '1' to numeric"):
+            nanops._ensure_numeric("1")
+        with pytest.raises(
+            TypeError, match="Could not convert string '1.1' to numeric"
+        ):
+            nanops._ensure_numeric("1.1")
+        with pytest.raises(
+            TypeError, match=r"Could not convert string '1\+1j' to numeric"
+        ):
+            nanops._ensure_numeric("1+1j")
 
     def test_non_convertable_values(self):
-        msg = "Could not convert foo to numeric"
+        msg = "Could not convert string 'foo' to numeric"
         with pytest.raises(TypeError, match=msg):
             nanops._ensure_numeric("foo")
 

From 76611fe42050a87a8ab3e38f6e444de51921762f Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Wed, 29 Mar 2023 09:00:04 -0700
Subject: [PATCH 2/5] whatsnew, median test

---
 doc/source/whatsnew/v2.1.0.rst         |  3 ++-
 pandas/tests/series/test_reductions.py | 13 +++++++++++++
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
index bac567b537edc..f96d2e2701a5a 100644
--- a/doc/source/whatsnew/v2.1.0.rst
+++ b/doc/source/whatsnew/v2.1.0.rst
@@ -172,7 +172,8 @@ Timezones
 Numeric
 ^^^^^^^
 - Bug in :meth:`Series.corr` and :meth:`Series.cov` raising ``AttributeError`` for masked dtypes (:issue:`51422`)
--
+- Bug in :meth:`Series.mean`, :meth:`DataFrame.mean` with object-dtype values containing strings that can be converted to numbers (e.g. "2") returning incorrect numeric results (:issue:`36703`, :issue:`44008`)
+- Bug in :meth:`Series.median` and :meth:`DataFrame.median` with object-dtype values containing strings that can be converted to numbers (e.g. "2") returning incorrect numeric results (:issue:`34671`)
 
 Conversion
 ^^^^^^^^^^
diff --git a/pandas/tests/series/test_reductions.py b/pandas/tests/series/test_reductions.py
index a055184edcfbb..b7329d843c654 100644
--- a/pandas/tests/series/test_reductions.py
+++ b/pandas/tests/series/test_reductions.py
@@ -160,3 +160,16 @@ def test_mean_dont_convert_j_to_complex():
         df["db"].mean()
     with pytest.raises(TypeError, match=msg):
         np.mean(df["db"].astype("string").array)
+
+
+def test_median_with_convertible_string_raises():
+    # GH#34671 this _could_ return a string "2", but definitely not float 2.0
+    msg = r"Cannot convert \['1' '2' '3'\] to numeric"
+    ser = Series(["1", "2", "3"])
+    with pytest.raises(TypeError, match=msg):
+        ser.median()
+
+    msg = r"Cannot convert \[\['1' '2' '3'\]\] to numeric"
+    df = ser.to_frame()
+    with pytest.raises(TypeError, match=msg):
+        df.median()

From 6e0ab8ad3cdbb023fe175c3fe650c5a38c71a353 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Wed, 29 Mar 2023 18:18:11 -0700
Subject: [PATCH 3/5] troubleshoot builds

---
 pandas/tests/frame/test_reductions.py | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py
index d89965294d395..f14c4a45201fb 100644
--- a/pandas/tests/frame/test_reductions.py
+++ b/pandas/tests/frame/test_reductions.py
@@ -181,7 +181,13 @@ def test_stat_op_api_float_string_frame(self, float_string_frame, axis, opname):
                 msg = r"unsupported operand type\(s\) for \+: 'float' and 'str'"
             elif opname == "mean":
                 if axis == 0:
-                    msg = r"Could not convert \['.*'\] to numeric"
+                    # different message on different builds
+                    msg = "|".join(
+                        [
+                            r"Could not convert \['.*'\] to numeric",
+                            "Could not convert string '(bar){30}' to numeric",
+                        ]
+                    )
                 else:
                     msg = r"unsupported operand type\(s\) for \+: 'float' and 'str'"
             elif opname in ["min", "max"]:
@@ -1734,9 +1740,15 @@ def test_fails_on_non_numeric(kernel):
         ]
     )
     if kernel == "median":
-        msg = (
+        # slightly different message on different builds
+        msg1 = (
             r"Cannot convert \[\[<class 'object'> <class 'object'> "
             r"<class 'object'>\]\] to numeric"
         )
+        msg2 = (
+            r"Cannot convert \[<class 'object'> <class 'object'> "
+            r"<class 'object'>\] to numeric"
+        )
+        msg = "|".join([msg1, msg2])
     with pytest.raises(TypeError, match=msg):
         getattr(df, kernel)(*args)

From 670375f965d176d987a69697e8da4c32e1b3b13f Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Thu, 30 Mar 2023 08:26:01 -0700
Subject: [PATCH 4/5] fix arraymanager build

---
 pandas/tests/series/test_reductions.py | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/pandas/tests/series/test_reductions.py b/pandas/tests/series/test_reductions.py
index b7329d843c654..0152303a7269a 100644
--- a/pandas/tests/series/test_reductions.py
+++ b/pandas/tests/series/test_reductions.py
@@ -131,7 +131,7 @@ def test_validate_stat_keepdims():
         np.sum(ser, keepdims=True)
 
 
-def test_mean_with_convertible_string_raises():
+def test_mean_with_convertible_string_raises(using_array_manager):
     # GH#44008
     ser = Series(["1", "2"])
     assert ser.sum() == "12"
@@ -140,15 +140,19 @@ def test_mean_with_convertible_string_raises():
         ser.mean()
 
     df = ser.to_frame()
-    msg = r"Could not convert \['12'\] to numeric"
+    if not using_array_manager:
+        msg = r"Could not convert \['12'\] to numeric"
     with pytest.raises(TypeError, match=msg):
         df.mean()
 
 
-def test_mean_dont_convert_j_to_complex():
+def test_mean_dont_convert_j_to_complex(using_array_manager):
     # GH#36703
     df = pd.DataFrame([{"db": "J", "numeric": 123}])
-    msg = r"Could not convert \['J'\] to numeric"
+    if using_array_manager:
+        msg = "Could not convert string 'J' to numeric"
+    else:
+        msg = r"Could not convert \['J'\] to numeric"
     with pytest.raises(TypeError, match=msg):
         df.mean()
 
@@ -162,14 +166,15 @@ def test_mean_dont_convert_j_to_complex():
         np.mean(df["db"].astype("string").array)
 
 
-def test_median_with_convertible_string_raises():
+def test_median_with_convertible_string_raises(using_array_manager):
     # GH#34671 this _could_ return a string "2", but definitely not float 2.0
     msg = r"Cannot convert \['1' '2' '3'\] to numeric"
     ser = Series(["1", "2", "3"])
     with pytest.raises(TypeError, match=msg):
         ser.median()
 
-    msg = r"Cannot convert \[\['1' '2' '3'\]\] to numeric"
+    if not using_array_manager:
+        msg = r"Cannot convert \[\['1' '2' '3'\]\] to numeric"
     df = ser.to_frame()
     with pytest.raises(TypeError, match=msg):
         df.median()

From a763cb42ad9d38d3e54bdea12f26c0fed7c48313 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Fri, 21 Apr 2023 14:53:21 -0700
Subject: [PATCH 5/5] say in whatsnew we raise TypeError

---
 doc/source/whatsnew/v2.1.0.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
index db2d91d93f60b..af1e86021ef19 100644
--- a/doc/source/whatsnew/v2.1.0.rst
+++ b/doc/source/whatsnew/v2.1.0.rst
@@ -312,10 +312,10 @@ Timezones
 Numeric
 ^^^^^^^
 - Bug in :meth:`Series.corr` and :meth:`Series.cov` raising ``AttributeError`` for masked dtypes (:issue:`51422`)
-- Bug in :meth:`Series.mean`, :meth:`DataFrame.mean` with object-dtype values containing strings that can be converted to numbers (e.g. "2") returning incorrect numeric results (:issue:`36703`, :issue:`44008`)
+- Bug in :meth:`Series.mean`, :meth:`DataFrame.mean` with object-dtype values containing strings that can be converted to numbers (e.g. "2") returning incorrect numeric results; these now raise ``TypeError`` (:issue:`36703`, :issue:`44008`)
 - Bug in :meth:`DataFrame.corrwith` raising ``NotImplementedError`` for pyarrow-backed dtypes (:issue:`52314`)
 - Bug in :meth:`Series.corr` and :meth:`Series.cov` raising ``AttributeError`` for masked dtypes (:issue:`51422`)
-- Bug in :meth:`Series.median` and :meth:`DataFrame.median` with object-dtype values containing strings that can be converted to numbers (e.g. "2") returning incorrect numeric results (:issue:`34671`)
+- Bug in :meth:`Series.median` and :meth:`DataFrame.median` with object-dtype values containing strings that can be converted to numbers (e.g. "2") returning incorrect numeric results; these now raise ``TypeError`` (:issue:`34671`)
 -
 
 Conversion