From 92f1de68ac57a82bc219da35e69e346d8dfbb1da Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 11 May 2021 12:46:03 +0100 Subject: [PATCH 1/2] [ArrowStringArray] REF: str.extract argument validation --- pandas/core/strings/accessor.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 9f8c9fa2f0515..ddea691e105d1 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -2,15 +2,18 @@ from functools import wraps import re from typing import ( + TYPE_CHECKING, Dict, List, Optional, + Union, ) import warnings import numpy as np import pandas._libs.lib as lib +from pandas._typing import FrameOrSeriesUnion from pandas.util._decorators import Appender from pandas.core.dtypes.common import ( @@ -31,6 +34,9 @@ from pandas.core.base import NoNewAttributesMixin +if TYPE_CHECKING: + from pandas import Index + _shared_docs: Dict[str, str] = {} _cpython_optimized_encoders = ( "utf-8", @@ -2274,7 +2280,9 @@ def findall(self, pat, flags=0): return self._wrap_result(result, returns_string=False) @forbid_nonstring_types(["bytes"]) - def extract(self, pat, flags=0, expand=True): + def extract( + self, pat: str, flags: int = 0, expand: bool = True + ) -> Union[FrameOrSeriesUnion, "Index"]: r""" Extract capture groups in the regex `pat` as columns in a DataFrame. @@ -2355,6 +2363,16 @@ def extract(self, pat, flags=0, expand=True): 2 NaN dtype: object """ + if not isinstance(expand, bool): + raise ValueError("expand must be True or False") + + regex = re.compile(pat, flags=flags) + if regex.groups == 0: + raise ValueError("pattern contains no capture groups") + + if not expand and regex.groups > 1 and isinstance(self._data, ABCIndex): + raise ValueError("only one regex group is supported with Index") + # TODO: dispatch return str_extract(self, pat, flags, expand=expand) @@ -3008,8 +3026,6 @@ def cat_core(list_of_columns: List, sep: str): def _groups_or_na_fun(regex): """Used in both extract_noexpand and extract_frame""" - if regex.groups == 0: - raise ValueError("pattern contains no capture groups") empty_row = [np.nan] * regex.groups def f(x): @@ -3066,8 +3082,6 @@ def _str_extract_noexpand(arr, pat, flags=0): # not dispatching, so we have to reconstruct here. result = pd_array(result, dtype=result_dtype) else: - if isinstance(arr, ABCIndex): - raise ValueError("only one regex group is supported with Index") name = None names = dict(zip(regex.groupindex.values(), regex.groupindex.keys())) columns = [names.get(1 + i, i) for i in range(regex.groups)] @@ -3120,8 +3134,6 @@ def _str_extract_frame(arr, pat, flags=0): def str_extract(arr, pat, flags=0, expand=True): - if not isinstance(expand, bool): - raise ValueError("expand must be True or False") if expand: result = _str_extract_frame(arr._orig, pat, flags=flags) return result.__finalize__(arr._orig, method="str_extract") From b627de9a7ffec731013d48ffd56d0209c13dbdc4 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 12 May 2021 09:28:22 +0100 Subject: [PATCH 2/2] isort --- pandas/core/strings/accessor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 55cc5123a23d1..2646ddfa45b58 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -7,8 +7,8 @@ Hashable, List, Optional, - Union, Pattern, + Union, ) import warnings