Skip to content

Commit be1f1fc

Browse files
committed
REF: StringArray._from_sequence
1 parent a0c8425 commit be1f1fc

File tree

4 files changed

+32
-22
lines changed

4 files changed

+32
-22
lines changed

doc/source/whatsnew/v1.1.1.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,11 @@ Bug fixes
4444

4545
-
4646

47+
**Strings**
48+
49+
- Fixed a memory usage issue when instantiating a large :class:`pandas.arrays.StringArray` (:issue:`35499`)
50+
51+
4752
.. ---------------------------------------------------------------------------
4853
4954
.. _whatsnew_111.contributors:

pandas/_libs/lib.pyx

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1698,6 +1698,20 @@ cpdef bint is_string_array(ndarray values, bint skipna=False):
16981698
return validator.validate(values)
16991699

17001700

1701+
cpdef ndarray ensure_string_array(ndarray values, object na_value):
1702+
cdef:
1703+
Py_ssize_t i = 0, n = len(values)
1704+
1705+
for i in range(n):
1706+
val = values[i]
1707+
if not checknull(val):
1708+
values[i] = str(val)
1709+
else:
1710+
values[i] = na_value
1711+
1712+
return values
1713+
1714+
17011715
cdef class BytesValidator(Validator):
17021716
cdef inline bint is_value_typed(self, object value) except -1:
17031717
return isinstance(value, bytes)

pandas/core/arrays/string_.py

Lines changed: 4 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -178,11 +178,10 @@ class StringArray(PandasArray):
178178

179179
def __init__(self, values, copy=False):
180180
values = extract_array(values)
181-
skip_validation = isinstance(values, type(self))
182181

183182
super().__init__(values, copy=copy)
184183
self._dtype = StringDtype()
185-
if not skip_validation:
184+
if not isinstance(values, type(self)):
186185
self._validate()
187186

188187
def _validate(self):
@@ -196,28 +195,16 @@ def _validate(self):
196195
)
197196

198197
@classmethod
199-
def _from_sequence(cls, scalars, dtype=None, copy=False):
198+
def _from_sequence(cls, scalars, dtype=None, copy=True):
200199
if dtype:
201200
assert dtype == "string"
202201

203202
result = np.asarray(scalars, dtype="object")
204203
if copy and result is scalars:
205204
result = result.copy()
206205

207-
# Standardize all missing-like values to NA
208-
# TODO: it would be nice to do this in _validate / lib.is_string_array
209-
# We are already doing a scan over the values there.
210-
na_values = isna(result)
211-
has_nans = na_values.any()
212-
if has_nans and result is scalars:
213-
# force a copy now, if we haven't already
214-
result = result.copy()
215-
216-
# convert to str, then to object to avoid dtype like '<U3', then insert na_value
217-
result = np.asarray(result, dtype=str)
218-
result = np.asarray(result, dtype="object")
219-
if has_nans:
220-
result[na_values] = StringDtype.na_value
206+
# convert non-na-likes to str, and nan-likes to StringDtype.na_value
207+
result = lib.ensure_string_array(result, StringDtype.na_value)
221208

222209
return cls(result)
223210

pandas/tests/arrays/string_/test_string.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -206,12 +206,16 @@ def test_constructor_raises():
206206

207207
@pytest.mark.parametrize("copy", [True, False])
208208
def test_from_sequence_no_mutate(copy):
209-
a = np.array(["a", np.nan], dtype=object)
210-
original = a.copy()
211-
result = pd.arrays.StringArray._from_sequence(a, copy=copy)
212-
expected = pd.arrays.StringArray(np.array(["a", pd.NA], dtype=object))
209+
nan_arr = np.array(["a", np.nan], dtype=object)
210+
na_arr = np.array(["a", pd.NA], dtype=object)
211+
212+
result = pd.arrays.StringArray._from_sequence(nan_arr, copy=copy)
213+
expected = pd.arrays.StringArray(na_arr)
214+
213215
tm.assert_extension_array_equal(result, expected)
214-
tm.assert_numpy_array_equal(a, original)
216+
217+
expected = nan_arr if copy else na_arr
218+
tm.assert_numpy_array_equal(nan_arr, expected)
215219

216220

217221
def test_astype_int():

0 commit comments

Comments
 (0)