Commit da1ccc1

Merge pull request #2 from pandas-dev/master
Updating my fork

2 parents: fea2974 + 13940c7

File tree: 20 files changed, +213 −96 lines
asv_bench/benchmarks/strings.py
Lines changed: 15 additions & 0 deletions

@@ -7,6 +7,21 @@
 from .pandas_vb_common import tm


+class Construction:
+
+    params = ["str", "string"]
+    param_names = ["dtype"]
+
+    def setup(self, dtype):
+        self.data = tm.rands_array(nchars=10 ** 5, size=10)
+
+    def time_construction(self, dtype):
+        Series(self.data, dtype=dtype)
+
+    def peakmem_construction(self, dtype):
+        Series(self.data, dtype=dtype)
+
+
 class Methods:
     def setup(self):
         self.s = Series(tm.makeStringIndex(10 ** 5))
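What the new Construction benchmark measures, as a standalone sketch: building a Series of long strings under both the object-backed "str" cast and the "string" extension dtype. The random-string helper below is a stand-in for the internal tm.rands_array and is an assumption of this sketch, not part of the diff.

    import numpy as np
    import pandas as pd

    # Stand-in for tm.rands_array(nchars=10 ** 5, size=10): ten strings of
    # 100,000 characters each, held in an object-dtype array.
    rng = np.random.default_rng(0)
    data = np.array(
        ["".join(rng.choice(list("abcdefghij"), size=10 ** 5)) for _ in range(10)],
        dtype=object,
    )

    for dtype in ["str", "string"]:
        pd.Series(data, dtype=dtype)  # the construction being timed and memory-profiled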

doc/source/whatsnew/v1.1.1.rst
Lines changed: 7 additions & 0 deletions

@@ -27,6 +27,7 @@ Fixed regressions
 - Fixed regression where :meth:`DataFrame.reset_index` would raise a ``ValueError`` on an empty :class:`DataFrame` with a :class:`MultiIndex` with a ``datetime64`` dtype level (:issue:`35606`, :issue:`35657`)
 - Fixed regression where :meth:`DataFrame.merge_asof` would raise an ``UnboundLocalError`` when ``left_index``, ``right_index`` and ``tolerance`` were set (:issue:`35558`)
 - Fixed regression in ``.groupby(..).rolling(..)`` where a custom ``BaseIndexer`` would be ignored (:issue:`35557`)
+- Fixed regression in :meth:`DataFrame.replace` and :meth:`Series.replace` where compiled regular expressions would be ignored during replacement (:issue:`35680`)
 - Fixed regression in :meth:`~pandas.core.groupby.DataFrameGroupBy.agg` where a list of functions would produce the wrong results if at least one of the functions did not aggregate (:issue:`35490`)

 .. ---------------------------------------------------------------------------

@@ -37,6 +38,7 @@ Bug fixes
 ~~~~~~~~~

 - Bug in ``Styler`` whereby the ``cell_ids`` argument had no effect due to other recent changes (:issue:`35588`, :issue:`35663`).
+- Bug in :func:`pandas.testing.assert_series_equal` and :func:`pandas.testing.assert_frame_equal` where extension dtypes were not ignored when ``check_dtype`` was set to ``False`` (:issue:`35715`).

 Categorical
 ^^^^^^^^^^^

@@ -75,6 +77,11 @@ Categorical
 - Bug in :class:`DataFrame` constructor failing to raise ``ValueError`` in some cases when data and index have mismatched lengths (:issue:`33437`)
 -

+**Strings**
+
+- Fixed memory usage issue when instantiating a large :class:`pandas.arrays.StringArray` (:issue:`35499`)
+
+
 .. ---------------------------------------------------------------------------

 .. _whatsnew_111.contributors:
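The replace regression noted above (:issue:`35680`), sketched at the user level; after the fix, a pre-compiled pattern is applied instead of being silently ignored:

    import re
    import pandas as pd

    s = pd.Series(["abc", "xyz"])

    # Compiled regular expressions are honored again during replacement.
    result = s.replace(to_replace=re.compile(r"^ab.*"), value="new", regex=True)
    # 0    new
    # 1    xyz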

pandas/_libs/lib.pyx
Lines changed: 34 additions & 17 deletions

@@ -618,35 +618,52 @@ def astype_intsafe(ndarray[object] arr, new_dtype):

 @cython.wraparound(False)
 @cython.boundscheck(False)
-def astype_str(arr: ndarray, skipna: bool=False) -> ndarray[object]:
-    """
-    Convert all elements in an array to string.
+cpdef ndarray[object] ensure_string_array(
+        arr,
+        object na_value=np.nan,
+        bint convert_na_value=True,
+        bint copy=True,
+        bint skipna=True,
+):
+    """Returns a new numpy array with object dtype and only strings and na values.

     Parameters
     ----------
-    arr : ndarray
-        The array whose elements we are casting.
-    skipna : bool, default False
+    arr : array-like
+        The values to be converted to str, if needed.
+    na_value : Any
+        The value to use for na. For example, np.nan or pd.NA.
+    convert_na_value : bool, default True
+        If False, existing na values will be used unchanged in the new array.
+    copy : bool, default True
+        Whether to ensure that a new array is returned.
+    skipna : bool, default True
         Whether or not to coerce nulls to their stringified form
-        (e.g. NaN becomes 'nan').
+        (e.g. if False, NaN becomes 'nan').

     Returns
     -------
     ndarray
-        A new array with the input array's elements casted.
+        An array with the input array's elements casted to str or nan-like.
     """
     cdef:
-        object arr_i
-        Py_ssize_t i, n = arr.size
-        ndarray[object] result = np.empty(n, dtype=object)
-
-    for i in range(n):
-        arr_i = arr[i]
+        Py_ssize_t i = 0, n = len(arr)

-        if not (skipna and checknull(arr_i)):
-            arr_i = str(arr_i)
+    result = np.asarray(arr, dtype="object")
+    if copy and result is arr:
+        result = result.copy()

-        result[i] = arr_i
+    for i in range(n):
+        val = result[i]
+        if not checknull(val):
+            result[i] = str(val)
+        else:
+            if convert_na_value:
+                val = na_value
+            if skipna:
+                result[i] = val
+            else:
+                result[i] = str(val)

     return result
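A pure-Python sketch of the new helper's semantics, using pd.isna as a stand-in for the cimported checknull (the authoritative implementation is the Cython above):

    import numpy as np
    import pandas as pd

    def ensure_string_array_sketch(arr, na_value=np.nan, convert_na_value=True,
                                   copy=True, skipna=True):
        result = np.asarray(arr, dtype="object")
        if copy and result is arr:
            result = result.copy()
        for i in range(len(result)):
            val = result[i]
            if not pd.isna(val):               # pd.isna stands in for checknull
                result[i] = str(val)
            else:
                if convert_na_value:
                    val = na_value
                result[i] = val if skipna else str(val)
        return result

    print(ensure_string_array_sketch(np.array([1, np.nan], dtype=object), na_value=pd.NA))
    # ['1' <NA>]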

pandas/_libs/tslibs/parsing.pyx
Lines changed: 2 additions & 1 deletion

@@ -381,7 +381,8 @@ cdef inline object _parse_dateabbr_string(object date_string, datetime default,
                                           object freq):
     cdef:
         object ret
-        int year, quarter = -1, month, mnum, date_len
+        # year initialized to prevent compiler warnings
+        int year = -1, quarter = -1, month, mnum, date_len

     # special handling for possibilities eg, 2Q2005, 2Q05, 2005Q1, 05Q1
     assert isinstance(date_string, str)
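The abbreviations this routine special-cases are the quarter-style strings named in the comment; a quick user-level illustration (outputs shown as comments are from memory, not from the diff):

    import pandas as pd

    pd.Period("2005Q1")   # Period('2005Q1', 'Q-DEC')
    pd.Period("2Q2005")   # Period('2005Q2', 'Q-DEC')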

pandas/_testing.py
Lines changed: 8 additions & 2 deletions

@@ -1377,12 +1377,18 @@ def assert_series_equal(
         )
     elif is_extension_array_dtype(left.dtype) and is_extension_array_dtype(right.dtype):
         assert_extension_array_equal(
-            left._values, right._values, index_values=np.asarray(left.index)
+            left._values,
+            right._values,
+            check_dtype=check_dtype,
+            index_values=np.asarray(left.index),
         )
     elif needs_i8_conversion(left.dtype) or needs_i8_conversion(right.dtype):
         # DatetimeArray or TimedeltaArray
         assert_extension_array_equal(
-            left._values, right._values, index_values=np.asarray(left.index)
+            left._values,
+            right._values,
+            check_dtype=check_dtype,
+            index_values=np.asarray(left.index),
         )
     else:
         _testing.assert_almost_equal(
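The user-visible effect (:issue:`35715`): check_dtype is now forwarded when both sides hold extension arrays, so dtype differences can be ignored on request. A sketch of the fixed behavior:

    import pandas as pd
    from pandas.testing import assert_series_equal

    left = pd.Series([1, 2, 3], dtype="Int64")
    right = pd.Series([1, 2, 3], dtype="Int32")

    # Previously raised on the dtype mismatch even with check_dtype=False;
    # with the forwarded flag, this comparison passes.
    assert_series_equal(left, right, check_dtype=False)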

pandas/compat/_optional.py
Lines changed: 10 additions & 10 deletions

@@ -11,25 +11,25 @@
     "fsspec": "0.7.4",
     "fastparquet": "0.3.2",
     "gcsfs": "0.6.0",
-    "lxml.etree": "3.8.0",
-    "matplotlib": "2.2.2",
-    "numexpr": "2.6.2",
+    "lxml.etree": "4.3.0",
+    "matplotlib": "2.2.3",
+    "numexpr": "2.6.8",
     "odfpy": "1.3.0",
     "openpyxl": "2.5.7",
     "pandas_gbq": "0.12.0",
-    "pyarrow": "0.13.0",
-    "pytables": "3.4.3",
+    "pyarrow": "0.15.0",
+    "pytables": "3.4.4",
     "pytest": "5.0.1",
     "pyxlsb": "1.0.6",
     "s3fs": "0.4.0",
     "scipy": "1.2.0",
-    "sqlalchemy": "1.1.4",
-    "tables": "3.4.3",
+    "sqlalchemy": "1.2.8",
+    "tables": "3.4.4",
     "tabulate": "0.8.3",
-    "xarray": "0.8.2",
+    "xarray": "0.12.0",
     "xlrd": "1.2.0",
-    "xlwt": "1.2.0",
-    "xlsxwriter": "0.9.8",
+    "xlwt": "1.3.0",
+    "xlsxwriter": "1.0.2",
     "numba": "0.46.0",
 }
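These minimums are enforced through import_optional_dependency in the same module; a minimal usage sketch, assuming the helper keeps its existing signature:

    from pandas.compat._optional import import_optional_dependency

    # Raises ImportError if pyarrow is missing or older than the bumped
    # minimum in the VERSIONS dict (now "0.15.0").
    pyarrow = import_optional_dependency("pyarrow")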

pandas/core/arrays/string_.py
Lines changed: 6 additions & 19 deletions

@@ -177,11 +177,10 @@ class StringArray(PandasArray):

     def __init__(self, values, copy=False):
         values = extract_array(values)
-        skip_validation = isinstance(values, type(self))

         super().__init__(values, copy=copy)
         self._dtype = StringDtype()
-        if not skip_validation:
+        if not isinstance(values, type(self)):
             self._validate()

     def _validate(self):

@@ -200,23 +199,11 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
             assert dtype == "string"

         result = np.asarray(scalars, dtype="object")
-        if copy and result is scalars:
-            result = result.copy()
-
-        # Standardize all missing-like values to NA
-        # TODO: it would be nice to do this in _validate / lib.is_string_array
-        # We are already doing a scan over the values there.
-        na_values = isna(result)
-        has_nans = na_values.any()
-        if has_nans and result is scalars:
-            # force a copy now, if we haven't already
-            result = result.copy()
-
-        # convert to str, then to object to avoid dtype like '<U3', then insert na_value
-        result = np.asarray(result, dtype=str)
-        result = np.asarray(result, dtype="object")
-        if has_nans:
-            result[na_values] = StringDtype.na_value
+
+        # convert non-na-likes to str, and nan-likes to StringDtype.na_value
+        result = lib.ensure_string_array(
+            result, na_value=StringDtype.na_value, copy=copy
+        )

         return cls(result)
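Construction now makes a single pass over the values (and at most one copy) instead of the several full-array materializations it replaces, which is the memory fix tracked in :issue:`35499`. The observable result is unchanged:

    import pandas as pd

    arr = pd.array(["a", None, "c"], dtype="string")
    print(arr)
    # <StringArray>
    # ['a', <NA>, 'c']
    # Length: 3, dtype: string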

pandas/core/dtypes/cast.py
Lines changed: 4 additions & 12 deletions

@@ -916,7 +916,7 @@ def astype_nansafe(arr, dtype, copy: bool = True, skipna: bool = False):
     dtype = pandas_dtype(dtype)

     if issubclass(dtype.type, str):
-        return lib.astype_str(arr.ravel(), skipna=skipna).reshape(arr.shape)
+        return lib.ensure_string_array(arr.ravel(), skipna=skipna).reshape(arr.shape)

     elif is_datetime64_dtype(arr):
         if is_object_dtype(dtype):

@@ -1608,19 +1608,11 @@ def construct_1d_ndarray_preserving_na(
     >>> construct_1d_ndarray_preserving_na([1.0, 2.0, None], dtype=np.dtype('str'))
     array(['1.0', '2.0', None], dtype=object)
     """
-    subarr = np.array(values, dtype=dtype, copy=copy)

     if dtype is not None and dtype.kind == "U":
-        # GH-21083
-        # We can't just return np.array(subarr, dtype='str') since
-        # NumPy will convert the non-string objects into strings
-        # Including NA values. Se we have to go
-        # string -> object -> update NA, which requires an
-        # additional pass over the data.
-        na_values = isna(values)
-        subarr2 = subarr.astype(object)
-        subarr2[na_values] = np.asarray(values, dtype=object)[na_values]
-        subarr = subarr2
+        subarr = lib.ensure_string_array(values, convert_na_value=False, copy=copy)
+    else:
+        subarr = np.array(values, dtype=dtype, copy=copy)

     return subarr
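Exercised through the Series constructor, matching the docstring example above: NA values survive a cast to str instead of being stringified (a sketch; the exact NA sentinel may vary by input):

    import pandas as pd

    s = pd.Series([1.0, 2.0, None], dtype=str)
    print(s.to_numpy())
    # ['1.0' '2.0' None]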

pandas/core/groupby/generic.py
Lines changed: 2 additions & 22 deletions

@@ -1111,6 +1111,7 @@ def blk_func(block: "Block") -> List["Block"]:
                 assert len(locs) == result.shape[1]
                 for i, loc in enumerate(locs):
                     agg_block = result.iloc[:, [i]]._mgr.blocks[0]
+                    agg_block.mgr_locs = [loc]
                     new_blocks.append(agg_block)
             else:
                 result = result._mgr.blocks[0].values

@@ -1124,7 +1125,6 @@ def blk_func(block: "Block") -> List["Block"]:
             return new_blocks

         skipped: List[int] = []
-        new_items: List[np.ndarray] = []
         for i, block in enumerate(data.blocks):
             try:
                 nbs = blk_func(block)

@@ -1136,33 +1136,13 @@ def blk_func(block: "Block") -> List["Block"]:
                 deleted_items.append(block.mgr_locs.as_array)
             else:
                 agg_blocks.extend(nbs)
-                new_items.append(block.mgr_locs.as_array)

         if not agg_blocks:
             raise DataError("No numeric types to aggregate")

         # reset the locs in the blocks to correspond to our
         # current ordering
-        indexer = np.concatenate(new_items)
-        agg_items = data.items.take(np.sort(indexer))
-
-        if deleted_items:
-
-            # we need to adjust the indexer to account for the
-            # items we have removed
-            # really should be done in internals :<
-
-            deleted = np.concatenate(deleted_items)
-            ai = np.arange(len(data))
-            mask = np.zeros(len(data))
-            mask[deleted] = 1
-            indexer = (ai - mask.cumsum())[indexer]
-
-            offset = 0
-            for blk in agg_blocks:
-                loc = len(blk.mgr_locs)
-                blk.mgr_locs = indexer[offset : (offset + loc)]
-                offset += loc
+        agg_items = data.reset_dropped_locs(agg_blocks, skipped)

         return agg_blocks, agg_items
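The deleted inline logic is what the new BlockManager helper reset_dropped_locs (added in another file of this merge) now encapsulates: renumbering the surviving blocks' column locations after some blocks were dropped. A pure-NumPy sketch of that renumbering, with hypothetical names:

    import numpy as np

    def renumber_after_drops(n_columns, kept_locs, deleted_locs):
        # Mirrors the removed code's core step (ai - mask.cumsum()): shift each
        # surviving position left by the number of deleted positions before it.
        mask = np.zeros(n_columns)
        mask[deleted_locs] = 1
        new_positions = np.arange(n_columns) - mask.cumsum()
        return new_positions[kept_locs].astype(int)

    # Columns 0..4 with column 2 dropped: columns 3 and 4 shift left by one.
    print(renumber_after_drops(5, kept_locs=[0, 1, 3, 4], deleted_locs=[2]))
    # [0 1 2 3]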
