From 14ca8045452ca4570468238024306be8d44c0a76 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 15 Nov 2019 10:37:03 -0600 Subject: [PATCH 01/12] take 2 --- pandas/_libs/lib.pyx | 14 +++++++---- pandas/core/strings.py | 42 +++++++++++++++++++++++++++++++++ pandas/tests/test_strings.py | 45 +++++++++++++++++++++++++++++++----- 3 files changed, 91 insertions(+), 10 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index c1fd46f4bba9e..4395ca0de9082 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2208,9 +2208,12 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, return objects +_no_default = object() + @cython.boundscheck(False) @cython.wraparound(False) -def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=1): +def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=1, + object na_value=_no_default, object dtype=object): """ Substitute for np.vectorize with pandas-friendly dtype inference @@ -2225,14 +2228,17 @@ def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=1) """ cdef: Py_ssize_t i, n - ndarray[object] result + ndarray result object val n = len(arr) - result = np.empty(n, dtype=object) + result = np.empty(n, dtype=dtype) for i in range(n): if mask[i]: - val = arr[i] + if na_value is _no_default: + val = arr[i] + else: + val = na_value else: val = f(arr[i]) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index a6e0c12526d8a..d3d0feb7887f7 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -15,10 +15,14 @@ ensure_object, is_bool_dtype, is_categorical_dtype, + is_extension_array_dtype, is_integer, + is_integer_dtype, is_list_like, + is_object_dtype, is_re, is_scalar, + is_string_dtype, ) from pandas.core.dtypes.generic import ( ABCDataFrame, @@ -31,6 +35,7 @@ from pandas.core.algorithms import take_1d from pandas.core.base import NoNewAttributesMixin import pandas.core.common as com +from 
pandas.core.construction import extract_array _cpython_optimized_encoders = ( "utf-8", @@ -109,9 +114,46 @@ def cat_safe(list_of_columns: List, sep: str): def _na_map(f, arr, na_result=np.nan, dtype=object): # should really _check_ for NA + if is_extension_array_dtype(arr.dtype): + return _map_ea(f, arr, na_value=na_result, dtype=dtype) return _map(f, arr, na_mask=True, na_value=na_result, dtype=dtype) +def _map_ea(f, arr, na_value, dtype): + from pandas.arrays import IntegerArray, StringArray + + arr = extract_array(arr, extract_numpy=True) + mask = isna(arr) + + assert isinstance(arr, StringArray) + arr = arr._ndarray + + if is_integer_dtype(dtype): + na_value_is_na = isna(na_value) + if na_value_is_na: + na_value = 1 + result = lib.map_infer_mask( + arr, + f, + mask.view("uint8"), + convert=False, + na_value=na_value, + dtype=np.dtype("int64"), + ) + + if not na_value_is_na: + mask[:] = False + + return IntegerArray(result, mask) + + elif is_string_dtype(dtype) and not is_object_dtype(dtype): + result = lib.map_infer_mask(arr, f, mask.view("uint8"), na_value=na_value) + return StringArray(result) + # TODO: BooleanArray + else: + return lib.map_infer_mask(arr, f, mask.view("uint8")) + + def _map(f, arr, na_mask=False, na_value=np.nan, dtype=object): if not len(arr): return np.ndarray(0, dtype=dtype) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index f5d28ec82d1d4..7a634119b1206 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -731,7 +731,10 @@ def test_count(self): tm.assert_series_equal(result, exp) # mixed - mixed = ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0] + mixed = np.array( + ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0], + dtype=object, + ) rs = strings.str_count(mixed, "a") xp = np.array([1, np.nan, 0, np.nan, np.nan, 0, np.nan, np.nan, np.nan]) tm.assert_numpy_array_equal(rs, xp) @@ -755,14 +758,14 @@ def test_contains(self): expected = np.array([False, 
np.nan, False, False, True], dtype=np.object_) tm.assert_numpy_array_equal(result, expected) - values = ["foo", "xyz", "fooommm__foo", "mmm_"] + values = np.array(["foo", "xyz", "fooommm__foo", "mmm_"], dtype=object) result = strings.str_contains(values, pat) expected = np.array([False, False, True, True]) assert result.dtype == np.bool_ tm.assert_numpy_array_equal(result, expected) # case insensitive using regex - values = ["Foo", "xYz", "fOOomMm__fOo", "MMM_"] + values = np.array(["Foo", "xYz", "fOOomMm__fOo", "MMM_"], dtype=object) result = strings.str_contains(values, "FOO|mmm", case=False) expected = np.array([True, False, True, True]) tm.assert_numpy_array_equal(result, expected) @@ -773,7 +776,10 @@ def test_contains(self): tm.assert_numpy_array_equal(result, expected) # mixed - mixed = ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0] + mixed = np.array( + ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0], + dtype=object, + ) rs = strings.str_contains(mixed, "o") xp = np.array( [False, np.nan, False, np.nan, np.nan, True, np.nan, np.nan, np.nan], @@ -869,7 +875,10 @@ def test_endswith(self): tm.assert_series_equal(result, exp.fillna(False).astype(bool)) # mixed - mixed = ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0] + mixed = np.array( + ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0], + dtype=object, + ) rs = strings.str_endswith(mixed, "f") xp = np.array( [False, np.nan, False, np.nan, np.nan, False, np.nan, np.nan, np.nan], @@ -3488,10 +3497,13 @@ def test_casefold(self): def test_string_array(any_string_method): + method_name, args, kwargs = any_string_method + if method_name == "decode": + pytest.skip("decode requires bytes.") + data = ["a", "bb", np.nan, "ccc"] a = Series(data, dtype=object) b = Series(data, dtype="string") - method_name, args, kwargs = any_string_method expected = getattr(a.str, method_name)(*args, **kwargs) result = getattr(b.str, method_name)(*args, **kwargs) @@ -3502,8 
+3514,29 @@ def test_string_array(any_string_method): ): assert result.dtype == "string" result = result.astype(object) + + elif expected.dtype == "float" and expected.isna().any(): + assert result.dtype == "Int64" + result = result.astype("float") + elif isinstance(expected, DataFrame): columns = expected.select_dtypes(include="object").columns assert all(result[columns].dtypes == "string") result[columns] = result[columns].astype(object) tm.assert_equal(result, expected) + + +@pytest.mark.parametrize( + "method,args,expected", + [ + ("count", ("a",), [2, None]), + ("find", ("a",), [0, None]), + ("index", ("a",), [0, None]), + ("rindex", ("a",), [2, None]), + ], +) +def test_string_array_numeric_integer_array(method, args, expected): + s = Series(["aba", None], dtype="string") + result = getattr(s.str, method)(*args) + expected = Series(expected, dtype="Int64") + tm.assert_series_equal(result, expected) From 07fc53d7ce091962afed07ef098751785a537d94 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 15 Nov 2019 12:04:06 -0600 Subject: [PATCH 02/12] updates --- doc/source/user_guide/text.rst | 36 ++++++++++++++++++++++++++++++++-- doc/source/whatsnew/v1.0.0.rst | 2 +- pandas/_libs/lib.pyx | 1 + pandas/core/strings.py | 18 +++++++++++------ 4 files changed, 48 insertions(+), 9 deletions(-) diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index d521c745ccfe5..827422a6f14b9 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -13,7 +13,7 @@ Text Data Types .. versionadded:: 1.0.0 -There are two main ways to store text data +There are two ways to store text data in pandas: 1. ``object`` -dtype NumPy array. 2. :class:`StringDtype` extension type. @@ -63,7 +63,39 @@ Or ``astype`` after the ``Series`` or ``DataFrame`` is created s s.astype("string") -Everything that follows in the rest of this document applies equally to +.. 
_text.differences: + +Behavior differences +^^^^^^^^^^^^^^^^^^^^ + +These are places where the behavior of ``StringDtype`` objects differ from +``object`` dtype + +l. For ``StringDtype``, :ref:`String accessor methods` + that return **numeric** output will always return a nullable integer dtype, + rather either int or float dtype, depending on the presence of NA values. + + .. ipython:: python + + s = pd.Series(["a", None, "b"], dtype="string") + s.str.count("a") + s.dropna().str.count("a") + + Both outputs are ``Int64`` dtype. Compare that with object-dtype + + .. ipython:: python + + s.astype(object).str.count("a") + s.astype(object).dropna().str.count("a") + + When NA values are present, the output dtype is float64. + +2. Some string methods, like :meth:`Series.str.decode` are not available + on ``StringArray`` because ``StringArray`` only holds strings, not + bytes. + + +Everything else that follows in the rest of this document applies equally to ``string`` and ``object`` dtype. .. _text.string_methods: diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 896ae91c68642..c9e6623951d53 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -63,7 +63,7 @@ Previously, strings were typically stored in object-dtype NumPy arrays. ``StringDtype`` is currently considered experimental. The implementation and parts of the API may change without warning. -The text extension type solves several issues with object-dtype NumPy arrays: +The ``'string'`` extension type solves several issues with object-dtype NumPy arrays: 1. You can accidentally store a *mixture* of strings and non-strings in an ``object`` dtype array. A ``StringArray`` can only store strings. 
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 4395ca0de9082..84a6804df0c3e 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2210,6 +2210,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, _no_default = object() + @cython.boundscheck(False) @cython.wraparound(False) def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=1, diff --git a/pandas/core/strings.py b/pandas/core/strings.py index d3d0feb7887f7..bb7e3d5f8a647 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -2,7 +2,7 @@ from functools import wraps import re import textwrap -from typing import Dict, List +from typing import TYPE_CHECKING, Any, Callable, Dict, List import warnings import numpy as np @@ -32,11 +32,15 @@ ) from pandas.core.dtypes.missing import isna +from pandas._typing import ArrayLike, Dtype from pandas.core.algorithms import take_1d from pandas.core.base import NoNewAttributesMixin import pandas.core.common as com from pandas.core.construction import extract_array +if TYPE_CHECKING: + from pandas.arrays import StringArray + _cpython_optimized_encoders = ( "utf-8", "utf8", @@ -115,14 +119,16 @@ def cat_safe(list_of_columns: List, sep: str): def _na_map(f, arr, na_result=np.nan, dtype=object): # should really _check_ for NA if is_extension_array_dtype(arr.dtype): + arr = extract_array(arr, extract_numpy=True) return _map_ea(f, arr, na_value=na_result, dtype=dtype) return _map(f, arr, na_mask=True, na_value=na_result, dtype=dtype) -def _map_ea(f, arr, na_value, dtype): +def _map_ea( + func: Callable, arr: "StringArray", na_value: Any, dtype: Dtype +) -> ArrayLike: from pandas.arrays import IntegerArray, StringArray - arr = extract_array(arr, extract_numpy=True) mask = isna(arr) assert isinstance(arr, StringArray) @@ -134,7 +140,7 @@ def _map_ea(f, arr, na_value, dtype): na_value = 1 result = lib.map_infer_mask( arr, - f, + func, mask.view("uint8"), convert=False, na_value=na_value, @@ -147,11 
+153,11 @@ def _map_ea(f, arr, na_value, dtype): return IntegerArray(result, mask) elif is_string_dtype(dtype) and not is_object_dtype(dtype): - result = lib.map_infer_mask(arr, f, mask.view("uint8"), na_value=na_value) + result = lib.map_infer_mask(arr, func, mask.view("uint8"), na_value=na_value) return StringArray(result) # TODO: BooleanArray else: - return lib.map_infer_mask(arr, f, mask.view("uint8")) + return lib.map_infer_mask(arr, func, mask.view("uint8")) def _map(f, arr, na_mask=False, na_value=np.nan, dtype=object): From a1470819d200bc2be7864f80998fbc4685f80b02 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 16 Nov 2019 16:39:30 -0600 Subject: [PATCH 03/12] updates --- doc/source/user_guide/text.rst | 2 +- pandas/_libs/lib.pyx | 9 +++++++++ pandas/core/strings.py | 7 +++++-- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index 827422a6f14b9..554414be971b4 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -71,7 +71,7 @@ Behavior differences These are places where the behavior of ``StringDtype`` objects differ from ``object`` dtype -l. For ``StringDtype``, :ref:`String accessor methods` +l. For ``StringDtype``, :ref:`string accessor methods` that return **numeric** output will always return a nullable integer dtype, rather either int or float dtype, depending on the presence of NA values. diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 84a6804df0c3e..15dc4a018de37 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2222,6 +2222,15 @@ def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=1, ---------- arr : ndarray f : function + mask : ndarray + uint8 dtype ndarray indicating values not to apply `f` to. + convert : bool, default True + Whether to call `maybe_convert_objects` on the resulting ndarray + na_value : Any, optional + The result value to use for masked values. 
By default, the + input value is used + dtype : type + The numpy dtype to use for the result ndarray. Returns ------- diff --git a/pandas/core/strings.py b/pandas/core/strings.py index bb7e3d5f8a647..12dea64238a5f 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -119,7 +119,7 @@ def cat_safe(list_of_columns: List, sep: str): def _na_map(f, arr, na_result=np.nan, dtype=object): # should really _check_ for NA if is_extension_array_dtype(arr.dtype): - arr = extract_array(arr, extract_numpy=True) + arr = extract_array(arr) return _map_ea(f, arr, na_value=na_result, dtype=dtype) return _map(f, arr, na_mask=True, na_value=na_result, dtype=dtype) @@ -153,7 +153,10 @@ def _map_ea( return IntegerArray(result, mask) elif is_string_dtype(dtype) and not is_object_dtype(dtype): - result = lib.map_infer_mask(arr, func, mask.view("uint8"), na_value=na_value) + # i.e. StringDtype + result = lib.map_infer_mask( + arr, func, mask.view("uint8"), convert=False, na_value=na_value + ) return StringArray(result) # TODO: BooleanArray else: From 36b1bf71cd0b01698e08e253c87d61cb5f28c147 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sun, 17 Nov 2019 09:54:59 -0600 Subject: [PATCH 04/12] type callable --- pandas/core/strings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 12dea64238a5f..182d07c40425f 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -125,7 +125,7 @@ def _na_map(f, arr, na_result=np.nan, dtype=object): def _map_ea( - func: Callable, arr: "StringArray", na_value: Any, dtype: Dtype + func: Callable[[str], Any], arr: "StringArray", na_value: Any, dtype: Dtype ) -> ArrayLike: from pandas.arrays import IntegerArray, StringArray From d01ddfef46ba5e8de20f5d5512dd119945e4ecb1 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sun, 17 Nov 2019 13:53:55 -0600 Subject: [PATCH 05/12] wip --- doc/source/user_guide/text.rst | 1 + pandas/core/strings.py | 3 +++ 
pandas/tests/test_strings.py | 14 +++++++------- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index 554414be971b4..149ff14002ffa 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -78,6 +78,7 @@ l. For ``StringDtype``, :ref:`string accessor methods` .. ipython:: python s = pd.Series(["a", None, "b"], dtype="string") + s s.str.count("a") s.dropna().str.count("a") diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 182d07c40425f..2861a86da6554 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -160,6 +160,9 @@ def _map_ea( return StringArray(result) # TODO: BooleanArray else: + import pdb + + pdb.set_trace() return lib.map_infer_mask(arr, func, mask.view("uint8")) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 7a634119b1206..cde78c5f24048 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -3527,16 +3527,16 @@ def test_string_array(any_string_method): @pytest.mark.parametrize( - "method,args,expected", + "method,expected", [ - ("count", ("a",), [2, None]), - ("find", ("a",), [0, None]), - ("index", ("a",), [0, None]), - ("rindex", ("a",), [2, None]), + ("count", [2, None]), + ("find", [0, None]), + ("index", [0, None]), + ("rindex", [2, None]), ], ) -def test_string_array_numeric_integer_array(method, args, expected): +def test_string_array_numeric_integer_array(method, expected): s = Series(["aba", None], dtype="string") - result = getattr(s.str, method)(*args) + result = getattr(s.str, method)("a") expected = Series(expected, dtype="Int64") tm.assert_series_equal(result, expected) From 22a2c79a5c5f11c033907f3184db9a5055b992d1 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sun, 17 Nov 2019 14:22:52 -0600 Subject: [PATCH 06/12] more dtypes --- pandas/core/strings.py | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff 
--git a/pandas/core/strings.py b/pandas/core/strings.py index 2861a86da6554..3d8cec65668f1 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -120,11 +120,11 @@ def _na_map(f, arr, na_result=np.nan, dtype=object): # should really _check_ for NA if is_extension_array_dtype(arr.dtype): arr = extract_array(arr) - return _map_ea(f, arr, na_value=na_result, dtype=dtype) + return _stringarray_map(f, arr, na_value=na_result, dtype=dtype) return _map(f, arr, na_mask=True, na_value=na_result, dtype=dtype) -def _map_ea( +def _stringarray_map( func: Callable[[str], Any], arr: "StringArray", na_value: Any, dtype: Dtype ) -> ArrayLike: from pandas.arrays import IntegerArray, StringArray @@ -160,9 +160,6 @@ def _map_ea( return StringArray(result) # TODO: BooleanArray else: - import pdb - - pdb.set_trace() return lib.map_infer_mask(arr, func, mask.view("uint8")) @@ -688,7 +685,7 @@ def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=True): raise ValueError("Cannot use a callable replacement when regex=False") f = lambda x: x.replace(pat, repl, n) - return _na_map(f, arr) + return _na_map(f, arr, dtype=str) def str_repeat(arr, repeats): @@ -739,7 +736,7 @@ def scalar_rep(x): except TypeError: return str.__mul__(x, repeats) - return _na_map(scalar_rep, arr) + return _na_map(scalar_rep, arr, dtype=str) else: def rep(x, r): @@ -1204,7 +1201,7 @@ def str_join(arr, sep): 4 NaN dtype: object """ - return _na_map(sep.join, arr) + return _na_map(sep.join, arr, dtype=str) def str_findall(arr, pat, flags=0): @@ -1435,7 +1432,7 @@ def str_pad(arr, width, side="left", fillchar=" "): else: # pragma: no cover raise ValueError("Invalid side") - return _na_map(f, arr) + return _na_map(f, arr, dtype=str) def str_split(arr, pat=None, n=None): @@ -1541,7 +1538,7 @@ def str_slice(arr, start=None, stop=None, step=None): """ obj = slice(start, stop, step) f = lambda x: x[obj] - return _na_map(f, arr) + return _na_map(f, arr, dtype=str) def str_slice_replace(arr, start=None, 
stop=None, repl=None): @@ -1632,7 +1629,7 @@ def f(x): y += x[local_stop:] return y - return _na_map(f, arr) + return _na_map(f, arr, dtype=str) def str_strip(arr, to_strip=None, side="both"): @@ -1657,7 +1654,7 @@ def str_strip(arr, to_strip=None, side="both"): f = lambda x: x.rstrip(to_strip) else: # pragma: no cover raise ValueError("Invalid side") - return _na_map(f, arr) + return _na_map(f, arr, dtype=str) def str_wrap(arr, width, **kwargs): @@ -1721,7 +1718,7 @@ def str_wrap(arr, width, **kwargs): tw = textwrap.TextWrapper(**kwargs) - return _na_map(lambda s: "\n".join(tw.wrap(s)), arr) + return _na_map(lambda s: "\n".join(tw.wrap(s)), arr, dtype=str) def str_translate(arr, table): @@ -1741,7 +1738,7 @@ def str_translate(arr, table): ------- Series or Index """ - return _na_map(lambda x: x.translate(table), arr) + return _na_map(lambda x: x.translate(table), arr, dtype=str) def str_get(arr, i): @@ -3079,7 +3076,7 @@ def normalize(self, form): import unicodedata f = lambda x: unicodedata.normalize(form, x) - result = _na_map(f, self._parent) + result = _na_map(f, self._parent, dtype=str) return self._wrap_result(result) _shared_docs[ @@ -3277,31 +3274,37 @@ def rindex(self, sub, start=0, end=None): lambda x: x.lower(), name="lower", docstring=_shared_docs["casemethods"] % _doc_args["lower"], + dtype=str, ) upper = _noarg_wrapper( lambda x: x.upper(), name="upper", docstring=_shared_docs["casemethods"] % _doc_args["upper"], + dtype=str, ) title = _noarg_wrapper( lambda x: x.title(), name="title", docstring=_shared_docs["casemethods"] % _doc_args["title"], + dtype=str, ) capitalize = _noarg_wrapper( lambda x: x.capitalize(), name="capitalize", docstring=_shared_docs["casemethods"] % _doc_args["capitalize"], + dtype=str, ) swapcase = _noarg_wrapper( lambda x: x.swapcase(), name="swapcase", docstring=_shared_docs["casemethods"] % _doc_args["swapcase"], + dtype=str, ) casefold = _noarg_wrapper( lambda x: x.casefold(), name="casefold", 
docstring=_shared_docs["casemethods"] % _doc_args["casefold"], + dtype=str, ) _shared_docs[ From 0ea2d6b32dbcdf29cced7b36bbabb481200b4f8c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 18 Nov 2019 09:35:38 -0600 Subject: [PATCH 07/12] doc --- pandas/core/strings.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index f5e213a126b62..7f29c53f02429 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -127,6 +127,28 @@ def _na_map(f, arr, na_result=np.nan, dtype=object): def _stringarray_map( func: Callable[[str], Any], arr: "StringArray", na_value: Any, dtype: Dtype ) -> ArrayLike: + """ + Map a callable over valid elements of a StringArray. + + Parameters + ---------- + func : Callable[[str], Any] + Apply to each valid element. + arr : StringArray + na_value : Any + The value to use for missing values. By default, this is + the original value (NA). + dtype : Dtype + The result dtype to use. Specifying this avoids an intermediate + object-dtype allocation. + + Returns + ------- + ArrayLike + An ExtensionArray for integer or string dtypes, otherwise + an ndarray. + + """ from pandas.arrays import IntegerArray, StringArray mask = isna(arr) @@ -160,6 +182,10 @@ def _stringarray_map( return StringArray(result) # TODO: BooleanArray else: + # This is when the result type is object. We reach this when + # -> We know the result type is truly object (e.g. .encode returns bytes + # or .findall returns a list). + # -> We don't know the result type. E.g. `.get` can return anything.
return lib.map_infer_mask(arr, func, mask.view("uint8")) From 358ce59dc39d944fb351f222b9ab480c9f5aa855 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 18 Nov 2019 11:56:20 -0600 Subject: [PATCH 08/12] fixups --- doc/source/user_guide/text.rst | 2 +- pandas/_libs/lib.pyx | 2 +- pandas/core/strings.py | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index 149ff14002ffa..072871f89bdae 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -73,7 +73,7 @@ These are places where the behavior of ``StringDtype`` objects differ from l. For ``StringDtype``, :ref:`string accessor methods` that return **numeric** output will always return a nullable integer dtype, - rather either int or float dtype, depending on the presence of NA values. + rather than either int or float dtype, depending on the presence of NA values. .. ipython:: python diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 15dc4a018de37..aaf6456df8f8e 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2229,7 +2229,7 @@ def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=1, na_value : Any, optional The result value to use for masked values. By default, the input value is used - dtype : type + dtype : numpy.dtype The numpy dtype to use for the result ndarray. 
Returns diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 7f29c53f02429..63dc27d0812b9 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -119,6 +119,7 @@ def cat_safe(list_of_columns: List, sep: str): def _na_map(f, arr, na_result=np.nan, dtype=object): # should really _check_ for NA if is_extension_array_dtype(arr.dtype): + # just StringDtype arr = extract_array(arr) return _stringarray_map(f, arr, na_value=na_result, dtype=dtype) return _map(f, arr, na_mask=True, na_value=na_result, dtype=dtype) From 7b25d8db3ebee5022eaa5da82670c810f21ec56c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 18 Nov 2019 11:58:39 -0600 Subject: [PATCH 09/12] use asarray --- pandas/core/strings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 63dc27d0812b9..782b738c9788e 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -155,7 +155,7 @@ def _stringarray_map( mask = isna(arr) assert isinstance(arr, StringArray) - arr = arr._ndarray + arr = np.asarray(arr._ndarray) if is_integer_dtype(dtype): na_value_is_na = isna(na_value) From 46dfc2046f95ac3716d7a155f823c2022e22ff50 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 18 Nov 2019 11:58:53 -0600 Subject: [PATCH 10/12] use asarray 2 --- pandas/core/strings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 782b738c9788e..1ed56e2d52ae2 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -155,7 +155,7 @@ def _stringarray_map( mask = isna(arr) assert isinstance(arr, StringArray) - arr = np.asarray(arr._ndarray) + arr = np.asarray(arr) if is_integer_dtype(dtype): na_value_is_na = isna(na_value) From 3b20ffc1df1668cac8ad6ad1be4177b173c250ea Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 19 Nov 2019 08:00:32 -0600 Subject: [PATCH 11/12] fixup --- doc/source/whatsnew/v1.0.0.rst | 8 ++++++++ 1 file changed, 8 
insertions(+) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 9c0166a6a0bb4..4ce21139d5131 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -88,9 +88,17 @@ You can use the alias ``"string"`` as well. The usual string accessor methods work. Where appropriate, the return type of the Series or columns of a DataFrame will also have string dtype. +.. ipython:: python + s.str.upper() s.str.split('b', expand=True).dtypes +String accessor methods returning integers will return a value with :class:`Int64Dtype` + +.. ipython:: python + + s.str.count("a") + We recommend explicitly using the ``string`` data type when working with strings. See :ref:`text.types` for more. From e4bae5e2e32da8f19c745b0583fa5d5ab553b436 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 19 Nov 2019 08:36:11 -0600 Subject: [PATCH 12/12] rename --- pandas/core/strings.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 1ed56e2d52ae2..413e7e85eb6fe 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -121,11 +121,11 @@ def _na_map(f, arr, na_result=np.nan, dtype=object): if is_extension_array_dtype(arr.dtype): # just StringDtype arr = extract_array(arr) - return _stringarray_map(f, arr, na_value=na_result, dtype=dtype) - return _map(f, arr, na_mask=True, na_value=na_result, dtype=dtype) + return _map_stringarray(f, arr, na_value=na_result, dtype=dtype) + return _map_object(f, arr, na_mask=True, na_value=na_result, dtype=dtype) -def _stringarray_map( +def _map_stringarray( func: Callable[[str], Any], arr: "StringArray", na_value: Any, dtype: Dtype ) -> ArrayLike: """ @@ -190,7 +190,7 @@ def _stringarray_map( return lib.map_infer_mask(arr, func, mask.view("uint8")) -def _map(f, arr, na_mask=False, na_value=np.nan, dtype=object): +def _map_object(f, arr, na_mask=False, na_value=np.nan, dtype=object): if not len(arr): return 
np.ndarray(0, dtype=dtype) @@ -221,7 +221,7 @@ def g(x): except (TypeError, AttributeError): return na_value - return _map(g, arr, dtype=dtype) + return _map_object(g, arr, dtype=dtype) if na_value is not np.nan: np.putmask(result, mask, na_value) if result.dtype == object: