From fee60551fd03d62e081e72b37b487541631d1f9a Mon Sep 17 00:00:00 2001 From: Erfan Nariman Date: Sun, 22 Nov 2020 18:23:11 +0100 Subject: [PATCH 01/10] str extract with default value --- pandas/core/strings/accessor.py | 28 ++++++++++++++++------------ pandas/tests/test_strings.py | 16 ++++++++++++++++ 2 files changed, 32 insertions(+), 12 deletions(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 7d6a2bf1d776d..73589b0f6e440 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -2222,7 +2222,7 @@ def findall(self, pat, flags=0): return self._wrap_result(result, returns_string=False) @forbid_nonstring_types(["bytes"]) - def extract(self, pat, flags=0, expand=True): + def extract(self, pat, flags=0, expand=True, default=None): r""" Extract capture groups in the regex `pat` as columns in a DataFrame. @@ -2304,7 +2304,7 @@ def extract(self, pat, flags=0, expand=True): dtype: object """ # TODO: dispatch - return str_extract(self, pat, flags, expand=expand) + return str_extract(self, pat, flags, expand=expand, default=default) @forbid_nonstring_types(["bytes"]) def extractall(self, pat, flags=0): @@ -2948,11 +2948,12 @@ def cat_core(list_of_columns: List, sep: str): return np.sum(arr_with_sep, axis=0) -def _groups_or_na_fun(regex): +def _groups_or_na_fun(regex, default=None): """Used in both extract_noexpand and extract_frame""" if regex.groups == 0: raise ValueError("pattern contains no capture groups") empty_row = [np.nan] * regex.groups + default = [default] * regex.groups def f(x): if not isinstance(x, str): @@ -2960,6 +2961,8 @@ def f(x): m = regex.search(x) if m: return [np.nan if item is None else item for item in m.groups()] + elif not m and default: + return default else: return empty_row @@ -2985,7 +2988,7 @@ def _get_single_group_name(rx): return None -def _str_extract_noexpand(arr, pat, flags=0): +def _str_extract_noexpand(arr, pat, flags=0, default=None): """ Find groups in each string in the Series using passed regular expression. This function is called from @@ -2996,11 +2999,12 @@ def _str_extract_noexpand(arr, pat, flags=0): from pandas import DataFrame, array regex = re.compile(pat, flags=flags) - groups_or_na = _groups_or_na_fun(regex) + groups_or_na = _groups_or_na_fun(regex, default) result_dtype = _result_dtype(arr) if regex.groups == 1: - result = np.array([groups_or_na(val)[0] for val in arr], dtype=object) + # result = np.array([groups_or_na(val, default)[0] for val in arr], dtype=object) + result = [groups_or_na(val)[0] for val in arr] name = _get_single_group_name(regex) # not dispatching, so we have to reconstruct here. result = array(result, dtype=result_dtype) @@ -3015,7 +3019,7 @@ def _str_extract_noexpand(arr, pat, flags=0): else: dtype = _result_dtype(arr) result = DataFrame( - [groups_or_na(val) for val in arr], + [groups_or_na(val, default) for val in arr], columns=columns, index=arr.index, dtype=dtype, @@ -3023,7 +3027,7 @@ def _str_extract_noexpand(arr, pat, flags=0): return result, name -def _str_extract_frame(arr, pat, flags=0): +def _str_extract_frame(arr, pat, flags=0, default=None): """ For each subject string in the Series, extract groups from the first match of regular expression pat. This function is called from @@ -3033,7 +3037,7 @@ def _str_extract_frame(arr, pat, flags=0): from pandas import DataFrame regex = re.compile(pat, flags=flags) - groups_or_na = _groups_or_na_fun(regex) + groups_or_na = _groups_or_na_fun(regex, default=default) names = dict(zip(regex.groupindex.values(), regex.groupindex.keys())) columns = [names.get(1 + i, i) for i in range(regex.groups)] @@ -3052,14 +3056,14 @@ def _str_extract_frame(arr, pat, flags=0): ) -def str_extract(arr, pat, flags=0, expand=True): +def str_extract(arr, pat, flags=0, expand=True, default=None): if not isinstance(expand, bool): raise ValueError("expand must be True or False") if expand: - result = _str_extract_frame(arr._orig, pat, flags=flags) + result = _str_extract_frame(arr._orig, pat, flags=flags, default=default) return result.__finalize__(arr._orig, method="str_extract") else: - result, name = _str_extract_noexpand(arr._orig, pat, flags=flags) + result, name = _str_extract_noexpand(arr._orig, pat, flags=flags, default=default) return arr._wrap_result(result, name=name, expand=expand) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 538a52d84b73a..e4fae840dfa69 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -3670,3 +3670,19 @@ def test_str_get_stringarray_multiple_nans(): result = s.str.get(2) expected = Series(pd.array([pd.NA, pd.NA, pd.NA, "c"])) tm.assert_series_equal(result, expected) + + +def test_str_extract_default_value_no_expand(): + # GH 38001 + df = DataFrame({'A': ['a84', 'abcd', '99string', np.nan]}) + result = df['A'].str.extract(r'(\d+)', expand=False, default='missing') + expected = Series(['84', 'missing', '99', np.nan]) + tm.assert_series_equal(result, expected) + + +def test_str_extract_default_value_with_expand(): + # GH 38001 + df = DataFrame({'A': ['a84', 'abcd', '99string', np.nan]}) + result = df['A'].str.extract(r'(\d+)', expand=True, default='missing') + expected = DataFrame({0: ['84', 'missing', '99', np.nan]}) + tm.assert_frame_equal(result, expected) From 4674af368694855e2aaa39cde3ad5c70f8cff185 Mon Sep 17 00:00:00 2001 From: Erfan Nariman Date: Sun, 22 Nov 2020 18:26:07 +0100 Subject: [PATCH 02/10] changes black --- pandas/core/strings/accessor.py | 4 +++- pandas/tests/test_strings.py | 12 ++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 73589b0f6e440..5bf0bc950506d 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -3063,7 +3063,9 @@ def str_extract(arr, pat, flags=0, expand=True, default=None): result = _str_extract_frame(arr._orig, pat, flags=flags, default=default) return result.__finalize__(arr._orig, method="str_extract") else: - result, name = _str_extract_noexpand(arr._orig, pat, flags=flags, default=default) + result, name = _str_extract_noexpand( + arr._orig, pat, flags=flags, default=default + ) return arr._wrap_result(result, name=name, expand=expand) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index e4fae840dfa69..e06d9623bb728 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -3674,15 +3674,15 @@ def test_str_get_stringarray_multiple_nans(): def test_str_extract_default_value_no_expand(): # GH 38001 - df = DataFrame({'A': ['a84', 'abcd', '99string', np.nan]}) - result = df['A'].str.extract(r'(\d+)', expand=False, default='missing') - expected = Series(['84', 'missing', '99', np.nan]) + df = DataFrame({"A": ["a84", "abcd", "99string", np.nan]}) + result = df["A"].str.extract(r"(\d+)", expand=False, default="missing") + expected = Series(["84", "missing", "99", np.nan]) tm.assert_series_equal(result, expected) def test_str_extract_default_value_with_expand(): # GH 38001 - df = DataFrame({'A': ['a84', 'abcd', '99string', np.nan]}) - result = df['A'].str.extract(r'(\d+)', expand=True, default='missing') - expected = DataFrame({0: ['84', 'missing', '99', np.nan]}) + df = DataFrame({"A": ["a84", "abcd", "99string", np.nan]}) + result = df["A"].str.extract(r"(\d+)", expand=True, default="missing") + expected = DataFrame({0: ["84", "missing", "99", np.nan]}) tm.assert_frame_equal(result, expected) From cb8c18c4565b13440cf1fb19fe6c31b70e07dc73 Mon Sep 17 00:00:00 2001 From: Erfan Nariman Date: Sun, 22 Nov 2020 18:27:58 +0100 Subject: [PATCH 03/10] added whatsnew entry --- doc/source/whatsnew/v1.2.0.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 996fb828010ba..f42fe56b0ac0d 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -235,7 +235,6 @@ Other enhancements - ``Styler`` now allows direct CSS class name addition to individual data cells (:issue:`36159`) - :meth:`Rolling.mean()` and :meth:`Rolling.sum()` use Kahan summation to calculate the mean to avoid numerical problems (:issue:`10319`, :issue:`11645`, :issue:`13254`, :issue:`32761`, :issue:`36031`) - :meth:`DatetimeIndex.searchsorted`, :meth:`TimedeltaIndex.searchsorted`, :meth:`PeriodIndex.searchsorted`, and :meth:`Series.searchsorted` with datetimelike dtypes will now try to cast string arguments (listlike and scalar) to the matching datetimelike type (:issue:`36346`) -- - Added methods :meth:`IntegerArray.prod`, :meth:`IntegerArray.min`, and :meth:`IntegerArray.max` (:issue:`33790`) - Where possible :meth:`RangeIndex.difference` and :meth:`RangeIndex.symmetric_difference` will return :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`36564`) - :meth:`DataFrame.to_parquet` now supports :class:`MultiIndex` for columns in parquet format (:issue:`34777`) @@ -253,6 +252,8 @@ Other enhancements - Improve error reporting for :meth:`DataFrame.merge()` when invalid merge column definitions were given (:issue:`16228`) - Improve numerical stability for :meth:`Rolling.skew()`, :meth:`Rolling.kurt()`, :meth:`Expanding.skew()` and :meth:`Expanding.kurt()` through implementation of Kahan summation (:issue:`6929`) - Improved error reporting for subsetting columns of a :class:`DataFrameGroupBy` with ``axis=1`` (:issue:`37725`) +- :meth:`Series.str.extract` accepts a default value argument (:issue:`38001`) + .. --------------------------------------------------------------------------- From 8efcd59f2302269f254567098b11f0778e22acb3 Mon Sep 17 00:00:00 2001 From: Erfan Nariman Date: Sun, 22 Nov 2020 18:29:47 +0100 Subject: [PATCH 04/10] removed commented line --- pandas/core/strings/accessor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 5bf0bc950506d..60f7dc0d1042a 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -3003,7 +3003,6 @@ def _str_extract_noexpand(arr, pat, flags=0, default=None): result_dtype = _result_dtype(arr) if regex.groups == 1: - # result = np.array([groups_or_na(val, default)[0] for val in arr], dtype=object) result = [groups_or_na(val)[0] for val in arr] name = _get_single_group_name(regex) # not dispatching, so we have to reconstruct here. From d2e23d5dc89221578343a3e884a0b97084439772 Mon Sep 17 00:00:00 2001 From: Erfan Nariman Date: Sun, 22 Nov 2020 20:42:02 +0100 Subject: [PATCH 05/10] remove wrong argument --- pandas/core/strings/accessor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 60f7dc0d1042a..e01173399bce7 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -3018,7 +3018,7 @@ def _str_extract_noexpand(arr, pat, flags=0, default=None): else: dtype = _result_dtype(arr) result = DataFrame( - [groups_or_na(val, default) for val in arr], + [groups_or_na(val) for val in arr], columns=columns, index=arr.index, dtype=dtype, From 54645520cc623b55ee815d5e1d98b1232c057f49 Mon Sep 17 00:00:00 2001 From: Erfan Nariman Date: Sun, 22 Nov 2020 21:48:09 +0100 Subject: [PATCH 06/10] add name to series --- pandas/tests/test_strings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index e06d9623bb728..fd989e789036a 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -3676,7 +3676,7 @@ def test_str_extract_default_value_no_expand(): # GH 38001 df = DataFrame({"A": ["a84", "abcd", "99string", np.nan]}) result = df["A"].str.extract(r"(\d+)", expand=False, default="missing") - expected = Series(["84", "missing", "99", np.nan]) + expected = Series(["84", "missing", "99", np.nan], name='A') tm.assert_series_equal(result, expected) From 1cf8d2b02da97c9eab52b8ffcd826ea87217bca2 Mon Sep 17 00:00:00 2001 From: Erfan Nariman Date: Sun, 22 Nov 2020 21:55:58 +0100 Subject: [PATCH 07/10] black --- pandas/tests/test_strings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index fd989e789036a..aef62423f5ce7 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -3676,7 +3676,7 @@ def test_str_extract_default_value_no_expand(): # GH 38001 df = DataFrame({"A": ["a84", "abcd", "99string", np.nan]}) result = df["A"].str.extract(r"(\d+)", expand=False, default="missing") - expected = Series(["84", "missing", "99", np.nan], name='A') + expected = Series(["84", "missing", "99", np.nan], name="A") tm.assert_series_equal(result, expected) From 15deae7e195bd420ba823714aca026c7ccd28ac8 Mon Sep 17 00:00:00 2001 From: Erfan Nariman Date: Tue, 22 Dec 2020 00:14:09 +0100 Subject: [PATCH 08/10] reset whatsnew v1.2.0 --- doc/source/whatsnew/v1.2.0.rst | 3 --- 1 file changed, 3 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index e47691cd91489..bbdcd183f65e1 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -302,9 +302,6 @@ Other enhancements - :class:`Window` now supports all Scipy window types in ``win_type`` with flexible keyword argument support (:issue:`34556`) - :meth:`testing.assert_index_equal` now has a ``check_order`` parameter that allows indexes to be checked in an order-insensitive manner (:issue:`37478`) - :func:`read_csv` supports memory-mapping for compressed files (:issue:`37621`) -- Improve error reporting for :meth:`DataFrame.merge()` when invalid merge column definitions were given (:issue:`16228`) -- Improve numerical stability for :meth:`Rolling.skew()`, :meth:`Rolling.kurt()`, :meth:`Expanding.skew()` and :meth:`Expanding.kurt()` through implementation of Kahan summation (:issue:`6929`) -- Improved error reporting for subsetting columns of a :class:`DataFrameGroupBy` with ``axis=1`` (:issue:`37725`) - Add support for ``min_count`` keyword for :meth:`DataFrame.groupby` and :meth:`DataFrame.resample` for functions ``min``, ``max``, ``first`` and ``last`` (:issue:`37821`, :issue:`37768`) - Improve error reporting for :meth:`DataFrame.merge` when invalid merge column definitions were given (:issue:`16228`) - Improve numerical stability for :meth:`.Rolling.skew`, :meth:`.Rolling.kurt`, :meth:`Expanding.skew` and :meth:`Expanding.kurt` through implementation of Kahan summation (:issue:`6929`) From 04b018211cc526229dfa2b1aa1a6002261bc705d Mon Sep 17 00:00:00 2001 From: Erfan Nariman Date: Tue, 22 Dec 2020 00:19:59 +0100 Subject: [PATCH 09/10] replace default by fill_value --- doc/source/whatsnew/v1.3.0.rst | 2 +- pandas/core/strings/accessor.py | 28 +++++++++++++++------------- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index bb833a491fd0c..2f7582921a400 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -41,7 +41,7 @@ Other enhancements - Added :meth:`MultiIndex.dtypes` (:issue:`37062`) - Improve error message when ``usecols`` and ``names`` do not match for :func:`read_csv` and ``engine="c"`` (:issue:`29042`) -- :meth:`Series.str.extract` accepts a ``fill_value`` value argument to fill non matchine values (:issue:`38001`) +- :meth:`Series.str.extract` accepts the ``fill_value`` argument to fill non matching values (:issue:`38001`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index d10e96cfae168..470ea30519326 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -2218,7 +2218,7 @@ def findall(self, pat, flags=0): return self._wrap_result(result, returns_string=False) @forbid_nonstring_types(["bytes"]) - def extract(self, pat, flags=0, expand=True, default=None): + def extract(self, pat, flags=0, expand=True, fill_value=None): r""" Extract capture groups in the regex `pat` as columns in a DataFrame. @@ -2237,6 +2237,8 @@ def extract(self, pat, flags=0, expand=True, default=None): If True, return DataFrame with one column per capture group. If False, return a Series/Index if there is one capture group or DataFrame if there are multiple capture groups. + fill_value: str, default None + Value to use as default value when the regex does not match. Returns ------- @@ -2300,7 +2302,7 @@ def extract(self, pat, flags=0, expand=True, default=None): dtype: object """ # TODO: dispatch - return str_extract(self, pat, flags, expand=expand, default=default) + return str_extract(self, pat, flags, expand=expand, fill_value=fill_value) @forbid_nonstring_types(["bytes"]) def extractall(self, pat, flags=0): @@ -2950,12 +2952,12 @@ def cat_core(list_of_columns: List, sep: str): return np.sum(arr_with_sep, axis=0) -def _groups_or_na_fun(regex, default=None): +def _groups_or_na_fun(regex, fill_value=None): """Used in both extract_noexpand and extract_frame""" if regex.groups == 0: raise ValueError("pattern contains no capture groups") empty_row = [np.nan] * regex.groups - default = [default] * regex.groups + fill_value = [fill_value] * regex.groups def f(x): if not isinstance(x, str): @@ -2963,8 +2965,8 @@ def f(x): m = regex.search(x) if m: return [np.nan if item is None else item for item in m.groups()] - elif not m and default: - return default + elif not m and fill_value: + return fill_value else: return empty_row @@ -2990,7 +2992,7 @@ def _get_single_group_name(rx): return None -def _str_extract_noexpand(arr, pat, flags=0, default=None): +def _str_extract_noexpand(arr, pat, flags=0, fill_value=None): """ Find groups in each string in the Series using passed regular expression. This function is called from @@ -3001,7 +3003,7 @@ def _str_extract_noexpand(arr, pat, flags=0, default=None): from pandas import DataFrame, array regex = re.compile(pat, flags=flags) - groups_or_na = _groups_or_na_fun(regex, default) + groups_or_na = _groups_or_na_fun(regex, fill_value) result_dtype = _result_dtype(arr) if regex.groups == 1: @@ -3028,7 +3030,7 @@ def _str_extract_noexpand(arr, pat, flags=0, default=None): return result, name -def _str_extract_frame(arr, pat, flags=0, default=None): +def _str_extract_frame(arr, pat, flags=0, fill_value=None): """ For each subject string in the Series, extract groups from the first match of regular expression pat. This function is called from @@ -3038,7 +3040,7 @@ def _str_extract_frame(arr, pat, flags=0, default=None): from pandas import DataFrame regex = re.compile(pat, flags=flags) - groups_or_na = _groups_or_na_fun(regex, default=default) + groups_or_na = _groups_or_na_fun(regex, fill_value=fill_value) names = dict(zip(regex.groupindex.values(), regex.groupindex.keys())) columns = [names.get(1 + i, i) for i in range(regex.groups)] @@ -3057,15 +3059,15 @@ def _str_extract_frame(arr, pat, flags=0, default=None): ) -def str_extract(arr, pat, flags=0, expand=True, default=None): +def str_extract(arr, pat, flags=0, expand=True, fill_value=None): if not isinstance(expand, bool): raise ValueError("expand must be True or False") if expand: - result = _str_extract_frame(arr._orig, pat, flags=flags, default=default) + result = _str_extract_frame(arr._orig, pat, flags=flags, fill_value=fill_value) return result.__finalize__(arr._orig, method="str_extract") else: result, name = _str_extract_noexpand( - arr._orig, pat, flags=flags, default=default + arr._orig, pat, flags=flags, fill_value=fill_value ) return arr._wrap_result(result, name=name, expand=expand) From fba6cdb6edf391d3fe6521ec1bf330668d7f178c Mon Sep 17 00:00:00 2001 From: Erfan Nariman Date: Tue, 22 Dec 2020 00:22:26 +0100 Subject: [PATCH 10/10] replace argument tests --- pandas/tests/test_strings.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index aef62423f5ce7..9ef9435d7d84a 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -3675,7 +3675,7 @@ def test_str_get_stringarray_multiple_nans(): def test_str_extract_default_value_no_expand(): # GH 38001 df = DataFrame({"A": ["a84", "abcd", "99string", np.nan]}) - result = df["A"].str.extract(r"(\d+)", expand=False, default="missing") + result = df["A"].str.extract(r"(\d+)", expand=False, fill_value="missing") expected = Series(["84", "missing", "99", np.nan], name="A") tm.assert_series_equal(result, expected) @@ -3683,6 +3683,6 @@ def test_str_extract_default_value_no_expand(): def test_str_extract_default_value_with_expand(): # GH 38001 df = DataFrame({"A": ["a84", "abcd", "99string", np.nan]}) - result = df["A"].str.extract(r"(\d+)", expand=True, default="missing") + result = df["A"].str.extract(r"(\d+)", expand=True, fill_value="missing") expected = DataFrame({0: ["84", "missing", "99", np.nan]}) tm.assert_frame_equal(result, expected)