From cad55a16d4e655d80261adbd02ed04aba4c06f68 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 15 Aug 2023 20:40:04 +0200 Subject: [PATCH] Backport PR #54537: REF: Refactor using_pyarrow check for string tests --- pandas/tests/strings/test_find_replace.py | 62 ++++++++--------------- 1 file changed, 21 insertions(+), 41 deletions(-) diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index b2d7e3e7507f8..bcb8db96b37fa 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -18,6 +18,10 @@ # -------------------------------------------------------------------------------------- +def using_pyarrow(dtype): + return dtype in ("string[pyarrow]",) + + def test_contains(any_string_dtype): values = np.array( ["foo", np.nan, "fooommm__foo", "mmm_", "foommm[_]+bar"], dtype=np.object_ @@ -379,9 +383,7 @@ def test_replace_mixed_object(): def test_replace_unicode(any_string_dtype): ser = Series([b"abcd,\xc3\xa0".decode("utf-8")], dtype=any_string_dtype) expected = Series([b"abcd, \xc3\xa0".decode("utf-8")], dtype=any_string_dtype) - with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" - ): + with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): result = ser.str.replace(r"(?<=\w),(?=\w)", ", ", flags=re.UNICODE, regex=True) tm.assert_series_equal(result, expected) @@ -402,9 +404,7 @@ def test_replace_callable(any_string_dtype): # test with callable repl = lambda m: m.group(0).swapcase() - with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" - ): + with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): result = ser.str.replace("[a-z][A-Z]{2}", repl, n=2, regex=True) expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -424,7 +424,7 @@ def 
test_replace_callable_raises(any_string_dtype, repl): ) with pytest.raises(TypeError, match=msg): with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" + PerformanceWarning, using_pyarrow(any_string_dtype) ): values.str.replace("a", repl, regex=True) @@ -434,9 +434,7 @@ def test_replace_callable_named_groups(any_string_dtype): ser = Series(["Foo Bar Baz", np.nan], dtype=any_string_dtype) pat = r"(?P<first>\w+) (?P<middle>\w+) (?P<last>\w+)" repl = lambda m: m.group("middle").swapcase() - with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" - ): + with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): result = ser.str.replace(pat, repl, regex=True) expected = Series(["bAR", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -448,16 +446,12 @@ def test_replace_compiled_regex(any_string_dtype): # test with compiled regex pat = re.compile(r"BAD_*") - with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" - ): + with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): result = ser.str.replace(pat, "", regex=True) expected = Series(["foobar", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) - with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" - ): + with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): result = ser.str.replace(pat, "", n=1, regex=True) expected = Series(["foobarBAD", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -477,9 +471,7 @@ def test_replace_compiled_regex_unicode(any_string_dtype): ser = Series([b"abcd,\xc3\xa0".decode("utf-8")], dtype=any_string_dtype) expected = Series([b"abcd, \xc3\xa0".decode("utf-8")], dtype=any_string_dtype) pat = re.compile(r"(?<=\w),(?=\w)", flags=re.UNICODE) - with tm.maybe_produces_warning( - PerformanceWarning, 
any_string_dtype == "string[pyarrow]" - ): + with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): result = ser.str.replace(pat, ", ", regex=True) tm.assert_series_equal(result, expected) @@ -507,9 +499,7 @@ def test_replace_compiled_regex_callable(any_string_dtype): ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype) repl = lambda m: m.group(0).swapcase() pat = re.compile("[a-z][A-Z]{2}") - with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" - ): + with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): result = ser.str.replace(pat, repl, n=2, regex=True) expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -558,9 +548,7 @@ def test_replace_moar(any_string_dtype): ) tm.assert_series_equal(result, expected) - with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" - ): + with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): result = ser.str.replace("A", "YYY", case=False) expected = Series( [ @@ -579,9 +567,7 @@ def test_replace_moar(any_string_dtype): ) tm.assert_series_equal(result, expected) - with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" - ): + with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): result = ser.str.replace("^.a|dog", "XX-XX ", case=False, regex=True) expected = Series( [ @@ -605,16 +591,12 @@ def test_replace_not_case_sensitive_not_regex(any_string_dtype): # https://github.com/pandas-dev/pandas/issues/41602 ser = Series(["A.", "a.", "Ab", "ab", np.nan], dtype=any_string_dtype) - with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" - ): + with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): result = ser.str.replace("a", "c", case=False, regex=False) expected = 
Series(["c.", "c.", "cb", "cb", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) - with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" - ): + with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): result = ser.str.replace("a.", "c.", case=False, regex=False) expected = Series(["c.", "c.", "Ab", "ab", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -762,9 +744,7 @@ def test_fullmatch_case_kwarg(any_string_dtype): result = ser.str.fullmatch("ab", case=False) tm.assert_series_equal(result, expected) - with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" - ): + with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)): result = ser.str.fullmatch("ab", flags=re.IGNORECASE) tm.assert_series_equal(result, expected) @@ -945,16 +925,16 @@ def test_flags_kwarg(any_string_dtype): pat = r"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})" - using_pyarrow = any_string_dtype == "string[pyarrow]" + use_pyarrow = using_pyarrow(any_string_dtype) result = data.str.extract(pat, flags=re.IGNORECASE, expand=True) assert result.iloc[0].tolist() == ["dave", "google", "com"] - with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow): + with tm.maybe_produces_warning(PerformanceWarning, use_pyarrow): result = data.str.match(pat, flags=re.IGNORECASE) assert result.iloc[0] - with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow): + with tm.maybe_produces_warning(PerformanceWarning, use_pyarrow): result = data.str.fullmatch(pat, flags=re.IGNORECASE) assert result.iloc[0] @@ -966,7 +946,7 @@ def test_flags_kwarg(any_string_dtype): msg = "has match groups" with tm.assert_produces_warning( - UserWarning, match=msg, raise_on_extra_warnings=not using_pyarrow + UserWarning, match=msg, raise_on_extra_warnings=not use_pyarrow ): result = data.str.contains(pat, flags=re.IGNORECASE) assert 
result.iloc[0]