Skip to content

Commit 31b6524

Browse files
committed
PERF: StringArray construction
1 parent 229722e commit 31b6524

File tree

1 file changed

+29
-15
lines changed

1 file changed

+29
-15
lines changed

pandas/core/arrays/string_.py

Lines changed: 29 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import operator
2-
from typing import TYPE_CHECKING, Type, Union
2+
from typing import TYPE_CHECKING, Optional, Type, Union
33

44
import numpy as np
55

@@ -122,6 +122,9 @@ class StringArray(PandasArray):
122122
123123
copy : bool, default False
124124
Whether to copy the array of data.
125+
convert : bool, default False
126+
If True, force conversion of non-NA scalars to strings.
127+
If False, raise a ValueError if any scalar is neither a string nor NA.
125128
126129
Attributes
127130
----------
@@ -162,7 +165,15 @@ class StringArray(PandasArray):
162165
['1', '1']
163166
Length: 2, dtype: string
164167
165-
However, instantiating StringArrays directly with non-strings will raise an error.
168+
Instantiating StringArrays directly with non-strings will raise an error unless
169+
``convert=True``.
170+
171+
>>> pd.arrays.StringArray(['1', 1])
172+
TypeError: Argument 'values' has incorrect type (expected numpy.ndarray, got list)
173+
>>> pd.arrays.StringArray(['1', 1], convert=True)
174+
<StringArray>
175+
['1', '1']
176+
Length: 2, dtype: string
166177
167178
For comparison methods, `StringArray` returns a :class:`pandas.BooleanArray`:
168179
@@ -175,22 +186,30 @@ class StringArray(PandasArray):
175186
# undo the PandasArray hack
176187
_typ = "extension"
177188

178-
def __init__(self, values, copy=False):
189+
def __init__(self, values, copy=False, convert: bool = False):
179190
values = extract_array(values)
191+
if not isinstance(values, type(self)):
192+
if convert:
193+
values = lib.ensure_string_array(
194+
values, na_value=StringDtype.na_value, copy=copy
195+
)
196+
else:
197+
self._validate(values)
180198

181199
super().__init__(values, copy=copy)
182200
self._dtype = StringDtype()
183-
if not isinstance(values, type(self)):
184-
self._validate()
185201

186-
def _validate(self):
202+
def _validate(self, values: Optional[np.ndarray] = None) -> None:
187203
"""Validate that we only store NA or strings."""
188-
if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True):
204+
if values is None:
205+
values = self._ndarray
206+
207+
if len(values) and not lib.is_string_array(values, skipna=True):
189208
raise ValueError("StringArray requires a sequence of strings or pandas.NA")
190-
if self._ndarray.dtype != "object":
209+
if values.dtype != "object":
191210
raise ValueError(
192211
"StringArray requires a sequence of strings or pandas.NA. Got "
193-
f"'{self._ndarray.dtype}' dtype instead."
212+
f"'{values.dtype}' dtype instead."
194213
)
195214

196215
@classmethod
@@ -200,12 +219,7 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
200219

201220
result = np.asarray(scalars, dtype="object")
202221

203-
# convert non-na-likes to str, and nan-likes to StringDtype.na_value
204-
result = lib.ensure_string_array(
205-
result, na_value=StringDtype.na_value, copy=copy
206-
)
207-
208-
return cls(result)
222+
return cls(result, copy=copy, convert=True)
209223

210224
@classmethod
211225
def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):

0 commit comments

Comments
 (0)