
Updating my fork #2

Merged: 9 commits, Aug 18, 2020
15 changes: 15 additions & 0 deletions asv_bench/benchmarks/strings.py
@@ -7,6 +7,21 @@
from .pandas_vb_common import tm


class Construction:

params = ["str", "string"]
param_names = ["dtype"]

def setup(self, dtype):
self.data = tm.rands_array(nchars=10 ** 5, size=10)

def time_construction(self, dtype):
Series(self.data, dtype=dtype)

def peakmem_construction(self, dtype):
Series(self.data, dtype=dtype)


class Methods:
def setup(self):
self.s = Series(tm.makeStringIndex(10 ** 5))
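
A minimal usage sketch of what the new ``Construction`` benchmark measures: constructing a ``Series`` from a fixed object-dtype array of long strings under both dtypes. The tiny array below is a stand-in for ``tm.rands_array(nchars=10 ** 5, size=10)``.

```python
# Sketch only: the operations timed by time_construction / peakmem_construction.
import numpy as np
import pandas as pd

data = np.array(["x" * 10, "y" * 10], dtype=object)  # stand-in for the benchmark data

pd.Series(data, dtype="str")     # plain object-dtype strings
pd.Series(data, dtype="string")  # StringDtype, backed by StringArray
```
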
7 changes: 7 additions & 0 deletions doc/source/whatsnew/v1.1.1.rst
@@ -27,6 +27,7 @@ Fixed regressions
- Fixed regression where :meth:`DataFrame.reset_index` would raise a ``ValueError`` on an empty :class:`DataFrame` with a :class:`MultiIndex` that has a ``datetime64`` dtype level (:issue:`35606`, :issue:`35657`)
- Fixed regression where :meth:`DataFrame.merge_asof` would raise an ``UnboundLocalError`` when ``left_index``, ``right_index`` and ``tolerance`` were set (:issue:`35558`)
- Fixed regression in ``.groupby(..).rolling(..)`` where a custom ``BaseIndexer`` would be ignored (:issue:`35557`)
- Fixed regression in :meth:`DataFrame.replace` and :meth:`Series.replace` where compiled regular expressions would be ignored during replacement (:issue:`35680`)
- Fixed regression in :meth:`~pandas.core.groupby.DataFrameGroupBy.agg` where a list of functions would produce the wrong results if at least one of the functions did not aggregate (:issue:`35490`)

.. ---------------------------------------------------------------------------
@@ -37,6 +38,7 @@ Bug fixes
~~~~~~~~~

- Bug in ``Styler`` whereby the ``cell_ids`` argument had no effect due to other recent changes (:issue:`35588`, :issue:`35663`).
- Bug in :func:`pandas.testing.assert_series_equal` and :func:`pandas.testing.assert_frame_equal` where extension dtypes were not ignored when ``check_dtype`` was set to ``False`` (:issue:`35715`).

Categorical
^^^^^^^^^^^
@@ -75,6 +77,11 @@ Categorical
- Bug in :class:`DataFrame` constructor failing to raise ``ValueError`` in some cases when data and index have mismatched lengths (:issue:`33437`)
-

**Strings**

- Fixed memory usage issue when instantiating a large :class:`pandas.arrays.StringArray` (:issue:`35499`)


.. ---------------------------------------------------------------------------

.. _whatsnew_111.contributors:
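
As a reader aid, a hedged sketch of the compiled-regex regression referenced in the "Fixed regressions" list above (GH 35680); the exact reproducer lives in the issue.

```python
# Assumed minimal shape of GH 35680: a pre-compiled pattern passed to
# replace() should be honoured when regex=True.
import re

import pandas as pd

s = pd.Series(["abc", "bcd"])
s.replace({re.compile("^a"): "z"}, regex=True)  # expected: ['zbc', 'bcd']
```
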
51 changes: 34 additions & 17 deletions pandas/_libs/lib.pyx
@@ -618,35 +618,52 @@ def astype_intsafe(ndarray[object] arr, new_dtype):

@cython.wraparound(False)
@cython.boundscheck(False)
def astype_str(arr: ndarray, skipna: bool=False) -> ndarray[object]:
"""
Convert all elements in an array to string.
cpdef ndarray[object] ensure_string_array(
arr,
object na_value=np.nan,
bint convert_na_value=True,
bint copy=True,
bint skipna=True,
):
"""Returns a new numpy array with object dtype and only strings and na values.

Parameters
----------
arr : ndarray
The array whose elements we are casting.
skipna : bool, default False
arr : array-like
The values to be converted to str, if needed.
na_value : Any
The value to use for na. For example, np.nan or pd.NA.
convert_na_value : bool, default True
If False, existing na values will be used unchanged in the new array.
copy : bool, default True
Whether to ensure that a new array is returned.
skipna : bool, default True
Whether or not to coerce nulls to their stringified form
(e.g. NaN becomes 'nan').
(e.g. if False, NaN becomes 'nan').

Returns
-------
ndarray
A new array with the input array's elements casted.
An array with the input array's elements cast to str or nan-like.
"""
cdef:
object arr_i
Py_ssize_t i, n = arr.size
ndarray[object] result = np.empty(n, dtype=object)

for i in range(n):
arr_i = arr[i]
Py_ssize_t i = 0, n = len(arr)

if not (skipna and checknull(arr_i)):
arr_i = str(arr_i)
result = np.asarray(arr, dtype="object")
if copy and result is arr:
result = result.copy()

result[i] = arr_i
for i in range(n):
val = result[i]
if not checknull(val):
result[i] = str(val)
else:
if convert_na_value:
val = na_value
if skipna:
result[i] = val
else:
result[i] = str(val)

return result

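
A short usage sketch of the new helper, assuming the signature shown in the diff above and calling it from Python:

```python
# Hedged example: non-null elements are stringified, nulls become na_value
# (convert_na_value and skipna both default to True).
import numpy as np

from pandas._libs import lib

arr = np.array([1, 2.5, None], dtype=object)
lib.ensure_string_array(arr, na_value=np.nan)
# -> array(['1', '2.5', nan], dtype=object)
```
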
3 changes: 2 additions & 1 deletion pandas/_libs/tslibs/parsing.pyx
@@ -381,7 +381,8 @@ cdef inline object _parse_dateabbr_string(object date_string, datetime default,
object freq):
cdef:
object ret
int year, quarter = -1, month, mnum, date_len
# year initialized to prevent compiler warnings
int year = -1, quarter = -1, month, mnum, date_len

# special handling for possibilities eg, 2Q2005, 2Q05, 2005Q1, 05Q1
assert isinstance(date_string, str)
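
For context, a hedged illustration of the quarter-abbreviation strings ``_parse_dateabbr_string`` handles; the change above only initializes ``year`` to silence a compiler warning and does not alter behaviour.

```python
# Examples of "date abbreviation" strings that go through this parser.
import pandas as pd

pd.Period("2Q2005")     # Period('2005Q2', 'Q-DEC')
pd.Timestamp("2005Q1")  # Timestamp('2005-01-01 00:00:00')
```
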
10 changes: 8 additions & 2 deletions pandas/_testing.py
@@ -1377,12 +1377,18 @@ def assert_series_equal(
)
elif is_extension_array_dtype(left.dtype) and is_extension_array_dtype(right.dtype):
assert_extension_array_equal(
left._values, right._values, index_values=np.asarray(left.index)
left._values,
right._values,
check_dtype=check_dtype,
index_values=np.asarray(left.index),
)
elif needs_i8_conversion(left.dtype) or needs_i8_conversion(right.dtype):
# DatetimeArray or TimedeltaArray
assert_extension_array_equal(
left._values, right._values, index_values=np.asarray(left.index)
left._values,
right._values,
check_dtype=check_dtype,
index_values=np.asarray(left.index),
)
else:
_testing.assert_almost_equal(
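
A hedged sketch of the behaviour the ``check_dtype`` pass-through restores (GH 35715): with ``check_dtype=False``, two extension-backed Series that differ only in dtype should compare equal.

```python
import pandas as pd
import pandas.testing as tm

left = pd.Series([1, 2, 3], dtype="Int64")   # extension dtype
right = pd.Series([1, 2, 3], dtype="Int32")  # different extension dtype
tm.assert_series_equal(left, right, check_dtype=False)  # should not raise
```
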
20 changes: 10 additions & 10 deletions pandas/compat/_optional.py
@@ -11,25 +11,25 @@
"fsspec": "0.7.4",
"fastparquet": "0.3.2",
"gcsfs": "0.6.0",
"lxml.etree": "3.8.0",
"matplotlib": "2.2.2",
"numexpr": "2.6.2",
"lxml.etree": "4.3.0",
"matplotlib": "2.2.3",
"numexpr": "2.6.8",
"odfpy": "1.3.0",
"openpyxl": "2.5.7",
"pandas_gbq": "0.12.0",
"pyarrow": "0.13.0",
"pytables": "3.4.3",
"pyarrow": "0.15.0",
"pytables": "3.4.4",
"pytest": "5.0.1",
"pyxlsb": "1.0.6",
"s3fs": "0.4.0",
"scipy": "1.2.0",
"sqlalchemy": "1.1.4",
"tables": "3.4.3",
"sqlalchemy": "1.2.8",
"tables": "3.4.4",
"tabulate": "0.8.3",
"xarray": "0.8.2",
"xarray": "0.12.0",
"xlrd": "1.2.0",
"xlwt": "1.2.0",
"xlsxwriter": "0.9.8",
"xlwt": "1.3.0",
"xlsxwriter": "1.0.2",
"numba": "0.46.0",
}

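
For orientation, a hedged note on how this table is consumed: ``import_optional_dependency`` (the existing helper in ``pandas.compat._optional``) compares the installed version against ``VERSIONS`` and raises when it is older than the minimum listed.

```python
from pandas.compat._optional import import_optional_dependency

# Raises ImportError if xlrd is missing or older than the minimum version above.
xlrd = import_optional_dependency("xlrd")
```
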
25 changes: 6 additions & 19 deletions pandas/core/arrays/string_.py
@@ -177,11 +177,10 @@ class StringArray(PandasArray):

def __init__(self, values, copy=False):
values = extract_array(values)
skip_validation = isinstance(values, type(self))

super().__init__(values, copy=copy)
self._dtype = StringDtype()
if not skip_validation:
if not isinstance(values, type(self)):
self._validate()

def _validate(self):
@@ -200,23 +199,11 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
assert dtype == "string"

result = np.asarray(scalars, dtype="object")
if copy and result is scalars:
result = result.copy()

# Standardize all missing-like values to NA
# TODO: it would be nice to do this in _validate / lib.is_string_array
# We are already doing a scan over the values there.
na_values = isna(result)
has_nans = na_values.any()
if has_nans and result is scalars:
# force a copy now, if we haven't already
result = result.copy()

# convert to str, then to object to avoid dtype like '<U3', then insert na_value
result = np.asarray(result, dtype=str)
result = np.asarray(result, dtype="object")
if has_nans:
result[na_values] = StringDtype.na_value

# convert non-na-likes to str, and nan-likes to StringDtype.na_value
result = lib.ensure_string_array(
result, na_value=StringDtype.na_value, copy=copy
)

return cls(result)

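
A hedged end-to-end example of the constructor path this simplifies: building a ``StringArray`` from mixed scalars now funnels through ``lib.ensure_string_array`` with ``na_value=StringDtype.na_value``.

```python
import pandas as pd

arr = pd.array(["a", None, 1], dtype="string")
# Non-strings are stringified and missing values become <NA>:
# <StringArray> ['a', <NA>, '1']
```
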
16 changes: 4 additions & 12 deletions pandas/core/dtypes/cast.py
@@ -916,7 +916,7 @@ def astype_nansafe(arr, dtype, copy: bool = True, skipna: bool = False):
dtype = pandas_dtype(dtype)

if issubclass(dtype.type, str):
return lib.astype_str(arr.ravel(), skipna=skipna).reshape(arr.shape)
return lib.ensure_string_array(arr.ravel(), skipna=skipna).reshape(arr.shape)

elif is_datetime64_dtype(arr):
if is_object_dtype(dtype):
@@ -1608,19 +1608,11 @@ def construct_1d_ndarray_preserving_na(
>>> construct_1d_ndarray_preserving_na([1.0, 2.0, None], dtype=np.dtype('str'))
array(['1.0', '2.0', None], dtype=object)
"""
subarr = np.array(values, dtype=dtype, copy=copy)

if dtype is not None and dtype.kind == "U":
# GH-21083
# We can't just return np.array(subarr, dtype='str') since
# NumPy will convert the non-string objects into strings
# Including NA values. Se we have to go
# string -> object -> update NA, which requires an
# additional pass over the data.
na_values = isna(values)
subarr2 = subarr.astype(object)
subarr2[na_values] = np.asarray(values, dtype=object)[na_values]
subarr = subarr2
subarr = lib.ensure_string_array(values, convert_na_value=False, copy=copy)
else:
subarr = np.array(values, dtype=dtype, copy=copy)

return subarr

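
The user-visible behaviour of ``astype_nansafe`` is meant to be unchanged; a hedged reminder of the default on this path (``skipna=False`` stringifies missing values):

```python
import numpy as np
import pandas as pd

s = pd.Series([1.0, np.nan])
s.astype(str).tolist()
# ['1.0', 'nan']  -- NaN is stringified because skipna defaults to False here
```
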
24 changes: 2 additions & 22 deletions pandas/core/groupby/generic.py
@@ -1111,6 +1111,7 @@ def blk_func(block: "Block") -> List["Block"]:
assert len(locs) == result.shape[1]
for i, loc in enumerate(locs):
agg_block = result.iloc[:, [i]]._mgr.blocks[0]
agg_block.mgr_locs = [loc]
new_blocks.append(agg_block)
else:
result = result._mgr.blocks[0].values
@@ -1124,7 +1125,6 @@ def blk_func(block: "Block") -> List["Block"]:
return new_blocks

skipped: List[int] = []
new_items: List[np.ndarray] = []
for i, block in enumerate(data.blocks):
try:
nbs = blk_func(block)
@@ -1136,33 +1136,13 @@ def blk_func(block: "Block") -> List["Block"]:
deleted_items.append(block.mgr_locs.as_array)
else:
agg_blocks.extend(nbs)
new_items.append(block.mgr_locs.as_array)

if not agg_blocks:
raise DataError("No numeric types to aggregate")

# reset the locs in the blocks to correspond to our
# current ordering
indexer = np.concatenate(new_items)
agg_items = data.items.take(np.sort(indexer))

if deleted_items:

# we need to adjust the indexer to account for the
# items we have removed
# really should be done in internals :<

deleted = np.concatenate(deleted_items)
ai = np.arange(len(data))
mask = np.zeros(len(data))
mask[deleted] = 1
indexer = (ai - mask.cumsum())[indexer]

offset = 0
for blk in agg_blocks:
loc = len(blk.mgr_locs)
blk.mgr_locs = indexer[offset : (offset + loc)]
offset += loc
agg_items = data.reset_dropped_locs(agg_blocks, skipped)

return agg_blocks, agg_items

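
A hedged sketch of a call that exercises this path: blockwise aggregation where some blocks are skipped (for example an object-dtype nuisance column under ``mean``), so the surviving blocks' locations must be mapped back to the original column order, now via ``data.reset_dropped_locs`` as shown above.

```python
import pandas as pd

df = pd.DataFrame({"key": [1, 1, 2], "txt": list("abc"), "val": [1.0, 2.0, 3.0]})
df.groupby("key").mean()  # the object-dtype "txt" block is dropped, "val" survives
```
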