BUG: Complete str accessor methods (#157)

bashtage · Kevin Sheppard · web-flow · commit 32db1769f5a4 · 2022-07-22T08:46:20.000-04:00
* BUG: Complete str accessor methods
  Add remaining methods
  closes #155
* MAINT: Fix unused import
* BUG/ENH: Clean string accessor methods
  Remove invalid methods
  Correct all types
* TST: Add types for testing
* TST: Add many tests
* TST: Test string accessor overloads
  Test overloads for correctness
* TYP: Final typing for string accessor
* CLN: Simplify overload and add test
  Simplify overload
  Add test for other forms
  Change return type to Series
* CLN: Clean up after rebase

Co-authored-by: Kevin Sheppard <kevin.sheppard@gmail.com>
diff --git a/pandas-stubs/core/strings.pyi b/pandas-stubs/core/strings.pyi
@@ -1,104 +1,172 @@
 from __future__ import annotations
 
-from typing import Generic
+import re
+from typing import (
+    Any,
+    Callable,
+    Generic,
+    Literal,
+    Sequence,
+    overload,
+)
 
+import numpy as np
+import pandas as pd
 from pandas import Series
 from pandas.core.base import NoNewAttributesMixin as NoNewAttributesMixin
 
-from pandas._typing import T
+from pandas._typing import (
+    F,
+    T,
+)
 
-def cat_core(list_of_columns: list, sep: str): ...
-def cat_safe(list_of_columns: list, sep: str): ...
-def str_count(arr, pat, flags: int = ...): ...
-def str_contains(
-    arr, pat, case: bool = ..., flags: int = ..., na=..., regex: bool = ...
-): ...
-def str_startswith(arr, pat, na=...): ...
-def str_endswith(arr, pat, na=...): ...
-def str_replace(
-    arr, pat, repl, n: int = ..., case=..., flags: int = ..., regex: bool = ...
-): ...
-def str_repeat(arr, repeats): ...
-def str_match(arr, pat, case: bool = ..., flags: int = ..., na=...): ...
-def str_extract(arr, pat, flags: int = ..., expand: bool = ...): ...
-def str_extractall(arr, pat, flags: int = ...): ...
-def str_get_dummies(arr, sep: str = ...): ...
-def str_join(arr, sep): ...
-def str_findall(arr, pat, flags: int = ...): ...
-def str_find(arr, sub, start: int = ..., end=..., side: str = ...): ...
-def str_index(arr, sub, start: int = ..., end=..., side: str = ...): ...
-def str_pad(arr, width, side: str = ..., fillchar: str = ...): ...
-def str_split(arr, pat=..., n=...): ...
-def str_rsplit(arr, pat=..., n=...): ...
-def str_slice(arr, start=..., stop=..., step=...): ...
-def str_slice_replace(arr, start=..., stop=..., repl=...): ...
-def str_strip(arr, to_strip=..., side: str = ...): ...
-def str_wrap(arr, width, **kwargs): ...
-def str_translate(arr, table): ...
-def str_get(arr, i): ...
-def str_decode(arr, encoding, errors: str = ...): ...
-def str_encode(arr, encoding, errors: str = ...): ...
-def forbid_nonstring_types(forbidden, name=...): ...
-def copy(source): ...
+def cat_core(list_of_columns: list[np.ndarray], sep: str) -> np.ndarray: ...
+def cat_safe(list_of_columns: list[np.ndarray], sep: str) -> np.ndarray: ...
+def forbid_nonstring_types(
+    forbidden: list[str] | None, name: str | None = ...
+) -> Callable[[F], F]: ...
 
 class StringMethods(NoNewAttributesMixin, Generic[T]):
-    def __init__(self, data) -> None: ...
-    def __getitem__(self, key) -> T: ...
-    def __iter__(self): ...
-    def cat(self, others=..., sep=..., na_rep=..., join: str = ...) -> T: ...
-    def split(self, pat=..., n: int = ..., expand: bool = ...) -> T: ...
-    def rsplit(self, pat=..., n: int = ..., expand: bool = ...) -> T: ...
-    def partition(self, sep: str = ..., expand: bool = ...) -> T: ...
-    def rpartition(self, sep: str = ..., expand: bool = ...) -> T: ...
-    def get(self, i) -> T: ...
-    def join(self, sep) -> T: ...
+    def __init__(self, data: T) -> None: ...
+    def __getitem__(self, key: slice | int) -> T: ...
+    def __iter__(self) -> T: ...
+    @overload
+    def cat(
+        self,
+        *,
+        sep: str,
+        na_rep: str | None = ...,
+        join: Literal["left", "right", "outer", "inner"] = ...,
+    ) -> str: ...
+    @overload
+    def cat(
+        self,
+        others: Literal[None] = ...,
+        *,
+        sep: str,
+        na_rep: str | None = ...,
+        join: Literal["left", "right", "outer", "inner"] = ...,
+    ) -> str: ...
+    @overload
+    def cat(
+        self,
+        others: Series | pd.Index | pd.DataFrame | np.ndarray | list[Any],
+        sep: str = ...,
+        na_rep: str | None = ...,
+        join: Literal["left", "right", "outer", "inner"] = ...,
+    ) -> T: ...
+    def split(
+        self, pat: str = ..., n: int = ..., expand: bool = ..., *, regex: bool = ...
+    ) -> T: ...
+    def rsplit(
+        self, pat: str = ..., n: int = ..., expand: bool = ..., *, regex: bool = ...
+    ) -> T: ...
+    @overload
+    def partition(self, sep: str = ...) -> pd.DataFrame: ...
+    @overload
+    def partition(self, *, expand: Literal[True]) -> pd.DataFrame: ...
+    @overload
+    def partition(self, sep: str, expand: Literal[True]) -> pd.DataFrame: ...
+    @overload
+    def partition(self, sep: str, expand: Literal[False]) -> T: ...
+    @overload
+    def partition(self, *, expand: Literal[False]) -> T: ...
+    @overload
+    def rpartition(self, sep: str = ...) -> pd.DataFrame: ...
+    @overload
+    def rpartition(self, *, expand: Literal[True]) -> pd.DataFrame: ...
+    @overload
+    def rpartition(self, sep: str, expand: Literal[True]) -> pd.DataFrame: ...
+    @overload
+    def rpartition(self, sep: str, expand: Literal[False]) -> T: ...
+    @overload
+    def rpartition(self, *, expand: Literal[False]) -> T: ...
+    def get(self, i: int) -> T: ...
+    def join(self, sep: str) -> T: ...
     def contains(
-        self, pat, case: bool = ..., flags: int = ..., na=..., regex: bool = ...
+        self, pat: str, case: bool = ..., flags: int = ..., na=..., regex: bool = ...
     ) -> Series[bool]: ...
-    def match(self, pat, case: bool = ..., flags: int = ..., na=...) -> T: ...
+    def match(
+        self, pat: str, case: bool = ..., flags: int = ..., na: Any = ...
+    ) -> T: ...
     def replace(
-        self, pat, repl, n: int = ..., case=..., flags: int = ..., regex: bool = ...
+        self,
+        pat: str,
+        repl: str | Callable[[re.Match], str],
+        n: int = ...,
+        case: bool | None = ...,
+        flags: int = ...,
+        regex: bool = ...,
+    ) -> T: ...
+    def repeat(self, repeats: int | Sequence[int]) -> T: ...
+    def pad(
+        self,
+        width: int,
+        side: Literal["left", "right", "both"] = ...,
+        fillchar: str = ...,
     ) -> T: ...
-    def repeat(self, repeats) -> T: ...
-    def pad(self, width, side: str = ..., fillchar: str = ...) -> T: ...
-    def center(self, width, fillchar: str = ...) -> T: ...
-    def ljust(self, width, fillchar: str = ...) -> T: ...
-    def rjust(self, width, fillchar: str = ...) -> T: ...
-    def zfill(self, width) -> T: ...
-    def slice(self, start=..., stop=..., step=...) -> T: ...
-    def slice_replace(self, start=..., stop=..., repl=...) -> T: ...
-    def decode(self, encoding, errors: str = ...) -> T: ...
-    def encode(self, encoding, errors: str = ...) -> T: ...
-    def strip(self, to_strip=...) -> T: ...
-    def lstrip(self, to_strip=...) -> T: ...
-    def rstrip(self, to_strip=...) -> T: ...
-    def wrap(self, width, **kwargs) -> T: ...
-    def get_dummies(self, sep: str = ...) -> T: ...
-    def translate(self, table) -> T: ...
-    count = ...
-    startswith = ...
-    endswith = ...
-    findall = ...
-    def extract(self, pat, flags: int = ..., expand: bool = ...) -> T: ...
-    def extractall(self, pat, flags: int = ...) -> T: ...
-    def find(self, sub, start: int = ..., end=...) -> T: ...
-    def rfind(self, sub, start: int = ..., end=...) -> T: ...
-    def normalize(self, form) -> T: ...
-    def index(self, sub, start: int = ..., end=...) -> T: ...
-    def rindex(self, sub, start: int = ..., end=...) -> T: ...
-    len = ...
-    lower = ...
-    upper = ...
-    title = ...
-    capitalize = ...
-    swapcase = ...
-    casefold = ...
-    isalnum = ...
-    isalpha = ...
-    isdigit = ...
-    isspace = ...
-    islower = ...
-    isupper = ...
-    istitle = ...
-    isnumeric = ...
-    isdecimal = ...
+    def center(self, width: int, fillchar: str = ...) -> T: ...
+    def ljust(self, width: int, fillchar: str = ...) -> T: ...
+    def rjust(self, width: int, fillchar: str = ...) -> T: ...
+    def zfill(self, width: int) -> T: ...
+    def slice(
+        self, start: int | None = ..., stop: int | None = ..., step: int | None = ...
+    ) -> T: ...
+    def slice_replace(
+        self, start: int | None = ..., stop: int | None = ..., repl: str | None = ...
+    ) -> T: ...
+    def decode(self, encoding: str, errors: str = ...) -> T: ...
+    def encode(self, encoding: str, errors: str = ...) -> T: ...
+    def strip(self, to_strip: str | None = ...) -> T: ...
+    def lstrip(self, to_strip: str | None = ...) -> T: ...
+    def rstrip(self, to_strip: str | None = ...) -> T: ...
+    def wrap(
+        self,
+        width: int,
+        expand_tabs: bool | None = ...,
+        replace_whitespace: bool | None = ...,
+        drop_whitespace: bool | None = ...,
+        break_long_words: bool | None = ...,
+        break_on_hyphens: bool | None = ...,
+    ) -> T: ...
+    def get_dummies(self, sep: str = ...) -> pd.DataFrame: ...
+    def translate(self, table: dict[int, int | str | None] | None) -> T: ...
+    def count(self, pat: str, flags: int = ...) -> Series[int]: ...
+    def startswith(self, pat: str, na: Any = ...) -> Series[bool]: ...
+    def endswith(self, pat: str, na: Any = ...) -> Series[bool]: ...
+    def findall(self, pat: str, flags: int = ...) -> Series: ...
+    @overload
+    def extract(
+        self, pat: str, flags: int = ..., *, expand: Literal[True] = ...
+    ) -> pd.DataFrame: ...
+    @overload
+    def extract(self, pat: str, flags: int, expand: Literal[False]) -> T: ...
+    @overload
+    def extract(self, pat: str, flags: int = ..., *, expand: Literal[False]) -> T: ...
+    def extractall(self, pat: str, flags: int = ...) -> pd.DataFrame: ...
+    def find(self, sub: str, start: int = ..., end: int | None = ...) -> T: ...
+    def rfind(self, sub: str, start: int = ..., end: int | None = ...) -> T: ...
+    def normalize(self, form: Literal["NFC", "NFKC", "NFD", "NFKD"]) -> T: ...
+    def index(self, sub: str, start: int = ..., end: int | None = ...) -> T: ...
+    def rindex(self, sub: str, start: int = ..., end: int | None = ...) -> T: ...
+    def len(self) -> Series[int]: ...
+    def lower(self) -> T: ...
+    def upper(self) -> T: ...
+    def title(self) -> T: ...
+    def capitalize(self) -> T: ...
+    def swapcase(self) -> T: ...
+    def casefold(self) -> T: ...
+    def isalnum(self) -> Series[bool]: ...
+    def isalpha(self) -> Series[bool]: ...
+    def isdigit(self) -> Series[bool]: ...
+    def isspace(self) -> Series[bool]: ...
+    def islower(self) -> Series[bool]: ...
+    def isupper(self) -> Series[bool]: ...
+    def istitle(self) -> Series[bool]: ...
+    def isnumeric(self) -> Series[bool]: ...
+    def isdecimal(self) -> Series[bool]: ...
+    def fullmatch(
+        self, pat: str, case: bool = ..., flags: int = ..., na: Any = ...
+    ) -> Series[bool]: ...
+    def removeprefix(self, prefix: str) -> T: ...
+    def removesuffix(self, suffix: str) -> T: ...
diff --git a/tests/test_series.py b/tests/test_series.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 from pathlib import Path
+import re
 import tempfile
 from typing import (
     TYPE_CHECKING,
@@ -831,3 +832,119 @@ def test_categorical_codes():
     # GH-111
     cat = pd.Categorical(["a", "b", "a"])
     assert_type(cat.codes, "np_ndarray_int")
+
+
+def test_string_accessors():
+    s = pd.Series(
+        ["applep", "bananap", "Cherryp", "DATEp", "eGGpLANTp", "123p", "23.45p"]
+    )
+    s2 = pd.Series([["apple", "banana"], ["cherry", "date"], [1, "eggplant"]])
+    s3 = pd.Series(["a1", "b2", "c3"])
+    check(assert_type(s.str.capitalize(), pd.Series), pd.Series)
+    check(assert_type(s.str.casefold(), pd.Series), pd.Series)
+    check(assert_type(s.str.cat(sep="X"), str), str)
+    check(assert_type(s.str.center(10), pd.Series), pd.Series)
+    check(assert_type(s.str.contains("a"), "pd.Series[bool]"), pd.Series, bool)
+    check(assert_type(s.str.count("pp"), "pd.Series[int]"), pd.Series, int)
+    check(assert_type(s.str.decode("utf-8"), pd.Series), pd.Series)
+    check(assert_type(s.str.encode("latin-1"), pd.Series), pd.Series)
+    check(assert_type(s.str.endswith("e"), "pd.Series[bool]"), pd.Series, bool)
+    check(assert_type(s3.str.extract(r"([ab])?(\d)"), pd.DataFrame), pd.DataFrame)
+    check(assert_type(s3.str.extractall(r"([ab])?(\d)"), pd.DataFrame), pd.DataFrame)
+    check(assert_type(s.str.find("p"), pd.Series), pd.Series)
+    check(assert_type(s.str.findall("pp"), pd.Series), pd.Series)
+    check(assert_type(s.str.fullmatch("apple"), "pd.Series[bool]"), pd.Series, bool)
+    check(assert_type(s.str.get(2), pd.Series), pd.Series)
+    check(assert_type(s.str.get_dummies(), pd.DataFrame), pd.DataFrame)
+    check(assert_type(s.str.index("p"), pd.Series), pd.Series)
+    check(assert_type(s.str.isalnum(), "pd.Series[bool]"), pd.Series, bool)
+    check(assert_type(s.str.isalpha(), "pd.Series[bool]"), pd.Series, bool)
+    check(assert_type(s.str.isdecimal(), "pd.Series[bool]"), pd.Series, bool)
+    check(assert_type(s.str.isdigit(), "pd.Series[bool]"), pd.Series, bool)
+    check(assert_type(s.str.isnumeric(), "pd.Series[bool]"), pd.Series, bool)
+    check(assert_type(s.str.islower(), "pd.Series[bool]"), pd.Series, bool)
+    check(assert_type(s.str.isspace(), "pd.Series[bool]"), pd.Series, bool)
+    check(assert_type(s.str.istitle(), "pd.Series[bool]"), pd.Series, bool)
+    check(assert_type(s.str.isupper(), "pd.Series[bool]"), pd.Series, bool)
+    check(assert_type(s2.str.join("-"), pd.Series), pd.Series)
+    check(assert_type(s.str.len(), "pd.Series[int]"), pd.Series, int)
+    check(assert_type(s.str.ljust(80), pd.Series), pd.Series)
+    check(assert_type(s.str.lower(), pd.Series), pd.Series)
+    check(assert_type(s.str.lstrip("a"), pd.Series), pd.Series)
+    check(assert_type(s.str.match("pp"), pd.Series), pd.Series)
+    check(assert_type(s.str.normalize("NFD"), pd.Series), pd.Series)
+    check(assert_type(s.str.pad(80, "right"), pd.Series), pd.Series)
+    check(assert_type(s.str.partition("p"), pd.DataFrame), pd.DataFrame)
+    check(assert_type(s.str.removeprefix("a"), pd.Series), pd.Series)
+    check(assert_type(s.str.removesuffix("e"), pd.Series), pd.Series)
+    check(assert_type(s.str.repeat(2), pd.Series), pd.Series)
+    check(assert_type(s.str.replace("a", "X"), pd.Series), pd.Series)
+    check(assert_type(s.str.rfind("e"), pd.Series), pd.Series)
+    check(assert_type(s.str.rindex("p"), pd.Series), pd.Series)
+    check(assert_type(s.str.rjust(80), pd.Series), pd.Series)
+    check(assert_type(s.str.rpartition("p"), pd.DataFrame), pd.DataFrame)
+    check(assert_type(s.str.rsplit("a"), pd.Series), pd.Series)
+    check(assert_type(s.str.rstrip(), pd.Series), pd.Series)
+    check(assert_type(s.str.slice(0, 4, 2), pd.Series), pd.Series)
+    check(assert_type(s.str.slice_replace(0, 2, "XX"), pd.Series), pd.Series)
+    check(assert_type(s.str.split("a"), pd.Series), pd.Series)
+    check(assert_type(s.str.startswith("a"), "pd.Series[bool]"), pd.Series, bool)
+    check(assert_type(s.str.strip(), pd.Series), pd.Series)
+    check(assert_type(s.str.swapcase(), pd.Series), pd.Series)
+    check(assert_type(s.str.title(), pd.Series), pd.Series)
+    check(assert_type(s.str.translate(None), pd.Series), pd.Series)
+    check(assert_type(s.str.upper(), pd.Series), pd.Series)
+    check(assert_type(s.str.wrap(80), pd.Series), pd.Series)
+    check(assert_type(s.str.zfill(10), pd.Series), pd.Series)
+
+
+def test_series_overloads_cat():
+    s = pd.Series(
+        ["applep", "bananap", "Cherryp", "DATEp", "eGGpLANTp", "123p", "23.45p"]
+    )
+    check(assert_type(s.str.cat(sep=";"), str), str)
+    check(assert_type(s.str.cat(None, sep=";"), str), str)
+    check(
+        assert_type(s.str.cat(["A", "B", "C", "D", "E", "F", "G"], sep=";"), pd.Series),
+        pd.Series,
+    )
+
+
+def test_series_overloads_partition():
+    s = pd.Series(
+        [
+            "ap;pl;ep",
+            "ban;an;ap",
+            "Che;rr;yp",
+            "DA;TEp",
+            "eGGp;LANT;p",
+            "12;3p",
+            "23.45p",
+        ]
+    )
+    check(assert_type(s.str.partition(sep=";"), pd.DataFrame), pd.DataFrame)
+    check(
+        assert_type(s.str.partition(sep=";", expand=True), pd.DataFrame), pd.DataFrame
+    )
+    check(assert_type(s.str.partition(sep=";", expand=False), pd.Series), pd.Series)
+
+    check(assert_type(s.str.rpartition(sep=";"), pd.DataFrame), pd.DataFrame)
+    check(
+        assert_type(s.str.rpartition(sep=";", expand=True), pd.DataFrame), pd.DataFrame
+    )
+    check(assert_type(s.str.rpartition(sep=";", expand=False), pd.Series), pd.Series)
+
+
+def test_series_overloads_extract():
+    s = pd.Series(
+        ["appl;ep", "ban;anap", "Cherr;yp", "DATEp", "eGGp;LANTp", "12;3p", "23.45p"]
+    )
+    check(assert_type(s.str.extract(r"[ab](\d)"), pd.DataFrame), pd.DataFrame)
+    check(
+        assert_type(s.str.extract(r"[ab](\d)", expand=True), pd.DataFrame), pd.DataFrame
+    )
+    check(assert_type(s.str.extract(r"[ab](\d)", expand=False), pd.Series), pd.Series)
+    check(
+        assert_type(s.str.extract(r"[ab](\d)", re.IGNORECASE, False), pd.Series),
+        pd.Series,
+    )