diff --git a/doc/source/release.rst b/doc/source/release.rst
index eb40b9474f41b..75b9581a32ba6 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -79,6 +79,7 @@ Improvements to existing features
     allow multiple axes to be used to operate on slabs of a ``Panel``
   - The ``ArrayFormatter``s for ``datetime`` and ``timedelta64`` now intelligently
     limit precision based on the values in the array (:issue:`3401`)
+  - perf improvements to Series.str.extract (:issue:`5944`)
 
 .. _release.bug_fixes-0.13.1:
 
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
index 77abd82b7ecc3..588a81e3cf80d 100644
--- a/pandas/core/strings.py
+++ b/pandas/core/strings.py
@@ -439,41 +439,28 @@ def str_extract(arr, pat, flags=0):
     """
     regex = re.compile(pat, flags=flags)
-    # just to be safe, check this
     if regex.groups == 0:
         raise ValueError("This pattern contains no groups to capture.")
-    elif regex.groups == 1:
-        def f(x):
-            if not isinstance(x, compat.string_types):
-                return None
-            m = regex.search(x)
-            if m:
-                return m.groups()[0]  # may be None
-            else:
-                return None
+    empty_row = [np.nan]*regex.groups
+    def f(x):
+        if not isinstance(x, compat.string_types):
+            return empty_row
+        m = regex.search(x)
+        if m:
+            return [np.nan if item is None else item for item in m.groups()]
+        else:
+            return empty_row
+    if regex.groups == 1:
+        result = Series([f(val)[0] for val in arr], name=regex.groupindex.get(1))
     else:
-        empty_row = Series(regex.groups * [None])
-
-        def f(x):
-            if not isinstance(x, compat.string_types):
-                return empty_row
-            m = regex.search(x)
-            if m:
-                return Series(list(m.groups()))  # may contain None
-            else:
-                return empty_row
-    result = arr.apply(f)
-    result.replace({None: np.nan}, inplace=True)
-    if regex.groups > 1:
-        result = DataFrame(result)  # Don't rely on the wrapper; name columns.
         names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
-        result.columns = [names.get(1 + i, i) for i in range(regex.groups)]
-    else:
-        result.name = regex.groupindex.get(0)
+        columns = [names.get(1 + i, i) for i in range(regex.groups)]
+        result = DataFrame([f(val) for val in arr], columns=columns)
     return result
 
+
 def str_join(arr, sep):
     """
     Join lists contained as elements in array, a la str.join
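
For context (not part of the patch): below is a minimal, self-contained sketch contrasting the two strategies the ``strings.py`` hunk swaps. The old path builds one ``Series`` per input row and lets ``Series.apply`` stitch them together; the new path builds plain lists and constructs the result with a single ``DataFrame`` call. The helper names ``extract_rowwise`` and ``extract_listcomp``, the sample data, and the timing harness are illustrative assumptions, not code from the repository, and the old path's ``None`` to ``NaN`` replace step is folded into the row function here for brevity.

```python
# Illustrative benchmark sketch (assumed names and data, not library code).
import re
import timeit

import numpy as np
import pandas as pd

PAT = re.compile(r'([ab])(\d)')
DATA = pd.Series(['a1', 'b2', 'no match', np.nan] * 2500)


def extract_rowwise(arr, regex):
    # Old-style path: a Series object is allocated for every input row and
    # Series.apply assembles them into the result afterwards.
    empty_row = pd.Series([np.nan] * regex.groups)

    def f(x):
        if not isinstance(x, str):
            return empty_row
        m = regex.search(x)
        if m:
            return pd.Series([np.nan if g is None else g for g in m.groups()])
        return empty_row

    return pd.DataFrame(arr.apply(f))


def extract_listcomp(arr, regex):
    # Patched-style path: each row is a plain list with NaN filled up front,
    # and the DataFrame is built by a single constructor call.
    empty_row = [np.nan] * regex.groups

    def f(x):
        if not isinstance(x, str):
            return empty_row
        m = regex.search(x)
        if m:
            return [np.nan if g is None else g for g in m.groups()]
        return empty_row

    return pd.DataFrame([f(val) for val in arr])


if __name__ == '__main__':
    for fn in (extract_rowwise, extract_listcomp):
        elapsed = timeit.timeit(lambda: fn(DATA, PAT), number=3)
        print(fn.__name__, round(elapsed, 3), 'seconds')
```

Both paths produce the same shape of output (one column per capture group, ``NaN`` for non-matching or non-string inputs); the difference is purely in how many intermediate pandas objects are created per row, which is what the perf issue (GH 5944) targets.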