diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py
index d7fb2775376c0..2023858181baa 100644
--- a/asv_bench/benchmarks/strings.py
+++ b/asv_bench/benchmarks/strings.py
@@ -7,6 +7,21 @@
 from .pandas_vb_common import tm
 
 
+class Construction:
+
+    params = ["str", "string"]
+    param_names = ["dtype"]
+
+    def setup(self, dtype):
+        self.data = tm.rands_array(nchars=10 ** 5, size=10)
+
+    def time_construction(self, dtype):
+        Series(self.data, dtype=dtype)
+
+    def peakmem_construction(self, dtype):
+        Series(self.data, dtype=dtype)
+
+
 class Methods:
     def setup(self):
         self.s = Series(tm.makeStringIndex(10 ** 5))
diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst
index b1fd76157b9f1..ff5bbccf63ffe 100644
--- a/doc/source/whatsnew/v1.1.1.rst
+++ b/doc/source/whatsnew/v1.1.1.rst
@@ -27,6 +27,7 @@ Fixed regressions
 - Fixed regression in :meth:`DataFrame.reset_index` would raise a ``ValueError`` on empty :class:`DataFrame` with a :class:`MultiIndex` with a ``datetime64`` dtype level (:issue:`35606`, :issue:`35657`)
 - Fixed regression where :meth:`DataFrame.merge_asof` would raise a ``UnboundLocalError`` when ``left_index``, ``right_index`` and ``tolerance`` were set (:issue:`35558`)
 - Fixed regression in ``.groupby(..).rolling(..)`` where a custom ``BaseIndexer`` would be ignored (:issue:`35557`)
+- Fixed regression in :meth:`DataFrame.replace` and :meth:`Series.replace` where compiled regular expressions would be ignored during replacement (:issue:`35680`)
 - Fixed regression in :meth:`~pandas.core.groupby.DataFrameGroupBy.agg` where a list of functions would produce the wrong results if at least one of the functions did not aggregate. (:issue:`35490`)
 
 .. ---------------------------------------------------------------------------
@@ -37,6 +38,7 @@ Bug fixes
 ~~~~~~~~~
 
 - Bug in ``Styler`` whereby `cell_ids` argument had no effect due to other recent changes (:issue:`35588`) (:issue:`35663`).
+- Bug in :func:`pandas.testing.assert_series_equal` and :func:`pandas.testing.assert_frame_equal` where extension dtypes were not ignored when ``check_dtype`` was set to ``False`` (:issue:`35715`).
 
 Categorical
 ^^^^^^^^^^^
@@ -75,6 +77,11 @@ Categorical
 - Bug in :class:`DataFrame` constructor failing to raise ``ValueError`` in some cases when data and index have mismatched lengths (:issue:`33437`)
 -
 
+**Strings**
+
+- Fixed memory usage issue when instantiating large :class:`pandas.arrays.StringArray` (:issue:`35499`)
+
+
 .. ---------------------------------------------------------------------------
 
 .. _whatsnew_111.contributors:
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 5fa91ffee8ea8..eadfcefaac73d 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -618,35 +618,52 @@ def astype_intsafe(ndarray[object] arr, new_dtype):
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def astype_str(arr: ndarray, skipna: bool=False) -> ndarray[object]:
-    """
-    Convert all elements in an array to string.
+cpdef ndarray[object] ensure_string_array(
+    arr,
+    object na_value=np.nan,
+    bint convert_na_value=True,
+    bint copy=True,
+    bint skipna=True,
+):
+    """Returns a new numpy array with object dtype and only strings and na values.
 
     Parameters
     ----------
-    arr : ndarray
-        The array whose elements we are casting.
-    skipna : bool, default False
+    arr : array-like
+        The values to be converted to str, if needed.
+    na_value : Any
+        The value to use for na. For example, np.nan or pd.NA.
+    convert_na_value : bool, default True
+        If False, existing na values will be used unchanged in the new array.
+    copy : bool, default True
+        Whether to ensure that a new array is returned.
+    skipna : bool, default True
         Whether or not to coerce nulls to their stringified form
-        (e.g. NaN becomes 'nan').
+        (e.g. if False, NaN becomes 'nan').
 
     Returns
     -------
     ndarray
-        A new array with the input array's elements casted.
+        An array with the input array's elements casted to str or nan-like.
     """
     cdef:
-        object arr_i
-        Py_ssize_t i, n = arr.size
-        ndarray[object] result = np.empty(n, dtype=object)
-
-    for i in range(n):
-        arr_i = arr[i]
+        Py_ssize_t i = 0, n = len(arr)
 
-        if not (skipna and checknull(arr_i)):
-            arr_i = str(arr_i)
+    result = np.asarray(arr, dtype="object")
+    if copy and result is arr:
+        result = result.copy()
 
-        result[i] = arr_i
+    for i in range(n):
+        val = result[i]
+        if not checknull(val):
+            result[i] = str(val)
+        else:
+            if convert_na_value:
+                val = na_value
+
+            if skipna:
+                result[i] = val
+            else:
+                result[i] = str(val)
 
     return result
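Reviewer note (not part of the patch): `ensure_string_array` replaces `astype_str` and centralizes the str-coercion with configurable NA handling. A rough pure-Python sketch of the same semantics, using `pd.isna` as a stand-in for the Cython-level `checknull`; the function name is ours, not pandas API:

```python
import numpy as np
import pandas as pd


def ensure_string_array_sketch(arr, na_value=np.nan, convert_na_value=True,
                               copy=True, skipna=True):
    """Pure-Python approximation of lib.ensure_string_array."""
    result = np.asarray(arr, dtype="object")
    if copy and result is arr:
        # only copy explicitly when asarray handed back the input unchanged
        result = result.copy()
    for i, val in enumerate(result):
        if not pd.isna(val):
            result[i] = str(val)  # coerce non-NA values to str
        elif convert_na_value:
            # standardize existing NAs to the requested na_value
            result[i] = na_value if skipna else str(na_value)
        else:
            result[i] = val if skipna else str(val)
    return result


# "a" and 1 become strings; np.nan and None are standardized to pd.NA.
print(ensure_string_array_sketch(
    np.array(["a", 1, np.nan, None], dtype=object), na_value=pd.NA
))
```

Working in-place on the object ndarray (rather than building a fresh result array, as the old `astype_str` did) is what makes the `copy=False` path cheap and explains the memory win noted in the whatsnew entry.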
diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx
index 8429aebbd85b8..7478179df3b75 100644
--- a/pandas/_libs/tslibs/parsing.pyx
+++ b/pandas/_libs/tslibs/parsing.pyx
@@ -381,7 +381,8 @@ cdef inline object _parse_dateabbr_string(object date_string, datetime default,
                                           object freq):
     cdef:
         object ret
-        int year, quarter = -1, month, mnum, date_len
+        # year initialized to prevent compiler warnings
+        int year = -1, quarter = -1, month, mnum, date_len
 
     # special handling for possibilities eg, 2Q2005, 2Q05, 2005Q1, 05Q1
     assert isinstance(date_string, str)
diff --git a/pandas/_testing.py b/pandas/_testing.py
index 713f29466f097..ef6232fa6d575 100644
--- a/pandas/_testing.py
+++ b/pandas/_testing.py
@@ -1377,12 +1377,18 @@ def assert_series_equal(
         )
     elif is_extension_array_dtype(left.dtype) and is_extension_array_dtype(right.dtype):
         assert_extension_array_equal(
-            left._values, right._values, index_values=np.asarray(left.index)
+            left._values,
+            right._values,
+            check_dtype=check_dtype,
+            index_values=np.asarray(left.index),
         )
     elif needs_i8_conversion(left.dtype) or needs_i8_conversion(right.dtype):
         # DatetimeArray or TimedeltaArray
         assert_extension_array_equal(
-            left._values, right._values, index_values=np.asarray(left.index)
+            left._values,
+            right._values,
+            check_dtype=check_dtype,
+            index_values=np.asarray(left.index),
         )
     else:
         _testing.assert_almost_equal(
diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py
index 81eac490fe5b9..689c7c889ef66 100644
--- a/pandas/compat/_optional.py
+++ b/pandas/compat/_optional.py
@@ -11,25 +11,25 @@
     "fsspec": "0.7.4",
     "fastparquet": "0.3.2",
     "gcsfs": "0.6.0",
-    "lxml.etree": "3.8.0",
-    "matplotlib": "2.2.2",
-    "numexpr": "2.6.2",
+    "lxml.etree": "4.3.0",
+    "matplotlib": "2.2.3",
+    "numexpr": "2.6.8",
     "odfpy": "1.3.0",
     "openpyxl": "2.5.7",
     "pandas_gbq": "0.12.0",
-    "pyarrow": "0.13.0",
-    "pytables": "3.4.3",
+    "pyarrow": "0.15.0",
+    "pytables": "3.4.4",
     "pytest": "5.0.1",
     "pyxlsb": "1.0.6",
     "s3fs": "0.4.0",
     "scipy": "1.2.0",
-    "sqlalchemy": "1.1.4",
-    "tables": "3.4.3",
+    "sqlalchemy": "1.2.8",
+    "tables": "3.4.4",
     "tabulate": "0.8.3",
-    "xarray": "0.8.2",
+    "xarray": "0.12.0",
     "xlrd": "1.2.0",
-    "xlwt": "1.2.0",
-    "xlsxwriter": "0.9.8",
+    "xlwt": "1.3.0",
+    "xlsxwriter": "1.0.2",
     "numba": "0.46.0",
 }
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index bb55c3cdea45c..381968f9724b6 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -177,11 +177,10 @@ class StringArray(PandasArray):
 
     def __init__(self, values, copy=False):
         values = extract_array(values)
-        skip_validation = isinstance(values, type(self))
 
         super().__init__(values, copy=copy)
         self._dtype = StringDtype()
-        if not skip_validation:
+        if not isinstance(values, type(self)):
             self._validate()
 
     def _validate(self):
@@ -200,23 +199,11 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
             assert dtype == "string"
 
         result = np.asarray(scalars, dtype="object")
-        if copy and result is scalars:
-            result = result.copy()
-
-        # Standardize all missing-like values to NA
-        # TODO: it would be nice to do this in _validate / lib.is_string_array
-        # We are already doing a scan over the values there.
-        na_values = isna(result)
-        has_nans = na_values.any()
-        if has_nans and result is scalars:
-            # force a copy now, if we haven't already
-            result = result.copy()
-
-        # convert to str, then to object to avoid dtype like '<U3', then insert na_values
-        result = np.asarray(result, dtype=str)
-        result = np.asarray(result, dtype="object")
-        if has_nans:
-            result[na_values] = StringDtype.na_value
+
+        # convert non-na-likes to str, and nan-likes to StringDtype.na_value
+        result = lib.ensure_string_array(
+            result, na_value=StringDtype.na_value, copy=copy
+        )
 
         return cls(result)
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
--- a/pandas/core/dtypes/cast.py
+++ b/pandas/core/dtypes/cast.py
@@ ... @@ def construct_1d_ndarray_preserving_na(
     >>> construct_1d_ndarray_preserving_na([1.0, 2.0, None], dtype=np.dtype('str'))
     array(['1.0', '2.0', None], dtype=object)
     """
-    subarr = np.array(values, dtype=dtype, copy=copy)
 
     if dtype is not None and dtype.kind == "U":
-        # GH-21083
-        # We can't just return np.array(subarr, dtype='str') since
-        # NumPy will convert the non-string objects into strings
-        # Including NA values. Se we have to go
-        # string -> object -> update NA, which requires an
-        # additional pass over the data.
-        na_values = isna(values)
-        subarr2 = subarr.astype(object)
-        subarr2[na_values] = np.asarray(values, dtype=object)[na_values]
-        subarr = subarr2
+        subarr = lib.ensure_string_array(values, convert_na_value=False, copy=copy)
+    else:
+        subarr = np.array(values, dtype=dtype, copy=copy)
 
     return subarr
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index b806d9856d20f..1f0cdbd07560f 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -1111,6 +1111,7 @@ def blk_func(block: "Block") -> List["Block"]:
                 assert len(locs) == result.shape[1]
                 for i, loc in enumerate(locs):
                     agg_block = result.iloc[:, [i]]._mgr.blocks[0]
+                    agg_block.mgr_locs = [loc]
                     new_blocks.append(agg_block)
             else:
                 result = result._mgr.blocks[0].values
@@ -1124,7 +1125,6 @@ def blk_func(block: "Block") -> List["Block"]:
         return new_blocks
 
     skipped: List[int] = []
-    new_items: List[np.ndarray] = []
     for i, block in enumerate(data.blocks):
         try:
             nbs = blk_func(block)
@@ -1136,33 +1136,13 @@ def blk_func(block: "Block") -> List["Block"]:
             deleted_items.append(block.mgr_locs.as_array)
         else:
             agg_blocks.extend(nbs)
-            new_items.append(block.mgr_locs.as_array)
 
     if not agg_blocks:
         raise DataError("No numeric types to aggregate")
 
     # reset the locs in the blocks to correspond to our
     # current ordering
-    indexer = np.concatenate(new_items)
-    agg_items = data.items.take(np.sort(indexer))
-
-    if deleted_items:
-
-        # we need to adjust the indexer to account for the
-        # items we have removed
-        # really should be done in internals :<
-
-        deleted = np.concatenate(deleted_items)
-        ai = np.arange(len(data))
-        mask = np.zeros(len(data))
-        mask[deleted] = 1
-        indexer = (ai - mask.cumsum())[indexer]
-
-    offset = 0
-    for blk in agg_blocks:
-        loc = len(blk.mgr_locs)
-        blk.mgr_locs = indexer[offset : (offset + loc)]
-        offset += loc
+    agg_items = data.reset_dropped_locs(agg_blocks, skipped)
 
     return agg_blocks, agg_items
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
index 371b721f08b27..f05d4cf1c4be6 100644
--- a/pandas/core/internals/managers.py
+++ b/pandas/core/internals/managers.py
@@ -2,7 +2,17 @@
 import itertools
 import operator
 import re
-from typing import DefaultDict, Dict, List, Optional, Sequence, Tuple, TypeVar, Union
+from typing import (
+    DefaultDict,
+    Dict,
+    List,
+    Optional,
+    Pattern,
+    Sequence,
+    Tuple,
+    TypeVar,
+    Union,
+)
 import warnings
 
 import numpy as np
@@ -1494,6 +1504,38 @@ def unstack(self, unstacker, fill_value) -> "BlockManager":
         bm = BlockManager(new_blocks, [new_columns, new_index])
         return bm
 
+    def reset_dropped_locs(self, blocks: List[Block], skipped: List[int]) -> Index:
+        """
+        Decrement the mgr_locs of the given blocks with `skipped` removed.
+
+        Notes
+        -----
+        Alters each block's mgr_locs inplace.
+        """
+        ncols = len(self)
+
+        new_locs = [blk.mgr_locs.as_array for blk in blocks]
+        indexer = np.concatenate(new_locs)
+
+        new_items = self.items.take(np.sort(indexer))
+
+        if skipped:
+            # we need to adjust the indexer to account for the
+            # items we have removed
+            deleted_items = [self.blocks[i].mgr_locs.as_array for i in skipped]
+            deleted = np.concatenate(deleted_items)
+            ai = np.arange(ncols)
+            mask = np.zeros(ncols)
+            mask[deleted] = 1
+            indexer = (ai - mask.cumsum())[indexer]
+
+        offset = 0
+        for blk in blocks:
+            loc = len(blk.mgr_locs)
+            blk.mgr_locs = indexer[offset : (offset + loc)]
+            offset += loc
+
+        return new_items
+
 
 class SingleBlockManager(BlockManager):
     """ manage a single block with """
@@ -1907,7 +1949,10 @@ def _merge_blocks(
 
 
 def _compare_or_regex_search(
-    a: ArrayLike, b: Scalar, regex: bool = False, mask: Optional[ArrayLike] = None
+    a: ArrayLike,
+    b: Union[Scalar, Pattern],
+    regex: bool = False,
+    mask: Optional[ArrayLike] = None,
 ) -> Union[ArrayLike, bool]:
     """
     Compare two array_like inputs of the same shape or two scalar values
@@ -1918,7 +1963,7 @@ def _compare_or_regex_search(
     Parameters
     ----------
     a : array_like
-    b : scalar
+    b : scalar or regex pattern
     regex : bool, default False
     mask : array_like or None (default)
 
@@ -1928,7 +1973,7 @@ def _compare_or_regex_search(
     """
 
     def _check_comparison_types(
-        result: Union[ArrayLike, bool], a: ArrayLike, b: Scalar,
+        result: Union[ArrayLike, bool], a: ArrayLike, b: Union[Scalar, Pattern],
     ):
         """
         Raises an error if the two arrays (a,b) cannot be compared.
@@ -1949,7 +1994,7 @@ def _check_comparison_types(
     else:
         op = np.vectorize(
             lambda x: bool(re.search(b, x))
-            if isinstance(x, str) and isinstance(b, str)
+            if isinstance(x, str) and isinstance(b, (str, Pattern))
            else False
         )
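Reviewer note: the `Pattern` handling above is what fixes GH 35680; a pre-compiled regex now behaves like its string form in `replace`. A minimal check mirroring the new tests:

```python
import re

import pandas as pd

ser = pd.Series(["a", "b", "c"])
pattern = re.compile("^a$")

# Before the fix, the compiled pattern was silently ignored in the
# isinstance(b, str) guard; now both spellings give the same result.
assert ser.replace({pattern: "z"}, regex=True).tolist() == ["z", "b", "c"]
assert ser.replace({"^a$": "z"}, regex=True).tolist() == ["z", "b", "c"]
```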
diff --git a/pandas/io/common.py b/pandas/io/common.py
index 54f35e689aac8..d1305c9cabe0e 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -18,6 +18,7 @@
     Optional,
     Tuple,
     Type,
+    Union,
 )
 from urllib.parse import (
     urljoin,
@@ -452,7 +453,7 @@ def get_handle(
         except ImportError:
             need_text_wrapping = (BufferedIOBase, RawIOBase)
 
-    handles: List[IO] = list()
+    handles: List[Union[IO, _MMapWrapper]] = list()
     f = path_or_buf
 
     # Convert pathlib.Path/py.path.local or string
@@ -535,6 +536,8 @@ def get_handle(
         try:
             wrapped = _MMapWrapper(f)
             f.close()
+            handles.remove(f)
+            handles.append(wrapped)
             f = wrapped
         except Exception:
             # we catch any errors that may have occurred
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index 6f9a1a5be4c43..efd5d29ae0717 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -206,12 +206,16 @@ def test_constructor_raises():
 
 @pytest.mark.parametrize("copy", [True, False])
 def test_from_sequence_no_mutate(copy):
-    a = np.array(["a", np.nan], dtype=object)
-    original = a.copy()
-    result = pd.arrays.StringArray._from_sequence(a, copy=copy)
-    expected = pd.arrays.StringArray(np.array(["a", pd.NA], dtype=object))
+    nan_arr = np.array(["a", np.nan], dtype=object)
+    na_arr = np.array(["a", pd.NA], dtype=object)
+
+    result = pd.arrays.StringArray._from_sequence(nan_arr, copy=copy)
+    expected = pd.arrays.StringArray(na_arr)
+
     tm.assert_extension_array_equal(result, expected)
-    tm.assert_numpy_array_equal(a, original)
+
+    expected = nan_arr if copy else na_arr
+    tm.assert_numpy_array_equal(nan_arr, expected)
 
 
 def test_astype_int():
diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py
index a3f056dbf9648..8603bff0587b6 100644
--- a/pandas/tests/frame/methods/test_replace.py
+++ b/pandas/tests/frame/methods/test_replace.py
@@ -1573,3 +1573,11 @@ def test_replace_dict_category_type(self, input_category_df, expected_category_d
 
         result = input_df.replace({"a": "z", "obj1": "obj9", "cat1": "catX"})
         tm.assert_frame_equal(result, expected)
+
+    def test_replace_with_compiled_regex(self):
+        # https://github.com/pandas-dev/pandas/issues/35680
+        df = pd.DataFrame(["a", "b", "c"])
+        regex = re.compile("^a$")
+        result = df.replace({regex: "z"}, regex=True)
+        expected = pd.DataFrame(["z", "b", "c"])
+        tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py
index 3d5f6ae3a4af9..1d8d5a29686a4 100644
--- a/pandas/tests/io/parser/test_common.py
+++ b/pandas/tests/io/parser/test_common.py
@@ -1836,6 +1836,7 @@ def test_raise_on_no_columns(all_parsers, nrows):
         parser.read_csv(StringIO(data))
 
 
+@td.check_file_leaks
 def test_memory_map(all_parsers, csv_dir_path):
     mmap_file = os.path.join(csv_dir_path, "test_mmap.csv")
     parser = all_parsers
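Reviewer note on the `get_handle` change: registering the `_MMapWrapper` in `handles` (in place of the raw file object it wraps) lets the normal cleanup path close it, which is what the new `@td.check_file_leaks` decorator on `test_memory_map` exercises. A small user-level sketch of the affected code path; the scratch file name is hypothetical:

```python
import pandas as pd

# Write a tiny CSV so the demo is self-contained (hypothetical scratch file).
df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
df.to_csv("mmap_demo.csv", index=False)

# memory_map=True routes through the _MMapWrapper branch patched above;
# with the fix, the wrapper is tracked in `handles` and therefore closed
# when read_csv finishes, so the OS file handle is not leaked.
result = pd.read_csv("mmap_demo.csv", memory_map=True)
print(result)
```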
diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py
index 509ae89909699..b30a7b1ef34de 100644
--- a/pandas/tests/io/parser/test_network.py
+++ b/pandas/tests/io/parser/test_network.py
@@ -46,6 +46,21 @@ def check_compressed_urls(salaries_table, compression, extension, mode, engine):
     tm.assert_frame_equal(url_table, salaries_table)
 
 
+@tm.network("https://raw.githubusercontent.com/", check_before_test=True)
+def test_url_encoding_csv():
+    """
+    read_csv should honor the requested encoding for URLs.
+
+    GH 10424
+    """
+    path = (
+        "https://raw.githubusercontent.com/pandas-dev/pandas/master/"
+        + "pandas/tests/io/parser/data/unicode_series.csv"
+    )
+    df = read_csv(path, encoding="latin-1", header=None)
+    assert df.loc[15, 1] == "Á köldum klaka (Cold Fever) (1994)"
+
+
 @pytest.fixture
 def tips_df(datapath):
     """DataFrame with the tips dataset."""
diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py
index 29b787d39c09d..a7e3162ed7b73 100644
--- a/pandas/tests/io/test_sql.py
+++ b/pandas/tests/io/test_sql.py
@@ -263,7 +263,8 @@ def _get_all_tables(self):
         return table_list
 
     def _close_conn(self):
-        pass
+        # https://docs.sqlalchemy.org/en/13/core/connections.html#engine-disposal
+        self.conn.dispose()
 
 
 class PandasSQLTest:
@@ -1242,7 +1243,7 @@ class _TestSQLAlchemy(SQLAlchemyMixIn, PandasSQLTest):
     def setup_class(cls):
         cls.setup_import()
         cls.setup_driver()
-        conn = cls.connect()
+        conn = cls.conn = cls.connect()
         conn.connect()
 
     def load_test_data_and_sql(self):
diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py
index 11802c59a29da..f78a28c66e946 100644
--- a/pandas/tests/series/methods/test_replace.py
+++ b/pandas/tests/series/methods/test_replace.py
@@ -1,3 +1,5 @@
+import re
+
 import numpy as np
 import pytest
 
@@ -415,3 +417,11 @@ def test_replace_extension_other(self):
         # https://github.com/pandas-dev/pandas/issues/34530
         ser = pd.Series(pd.array([1, 2, 3], dtype="Int64"))
         ser.replace("", "")  # no exception
+
+    def test_replace_with_compiled_regex(self):
+        # https://github.com/pandas-dev/pandas/issues/35680
+        s = pd.Series(["a", "b", "c"])
+        regex = re.compile("^a$")
+        result = s.replace({regex: "z"}, regex=True)
+        expected = pd.Series(["z", "b", "c"])
+        tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/util/test_assert_extension_array_equal.py b/pandas/tests/util/test_assert_extension_array_equal.py
index d9fdf1491c328..f9259beab5d13 100644
--- a/pandas/tests/util/test_assert_extension_array_equal.py
+++ b/pandas/tests/util/test_assert_extension_array_equal.py
@@ -1,6 +1,7 @@
 import numpy as np
 import pytest
 
+from pandas import array
 import pandas._testing as tm
 from pandas.core.arrays.sparse import SparseArray
 
@@ -102,3 +103,11 @@ def test_assert_extension_array_equal_non_extension_array(side):
 
     with pytest.raises(AssertionError, match=msg):
         tm.assert_extension_array_equal(*args)
+
+
+@pytest.mark.parametrize("right_dtype", ["Int32", "int64"])
+def test_assert_extension_array_equal_ignore_dtype_mismatch(right_dtype):
+    # https://github.com/pandas-dev/pandas/issues/35715
+    left = array([1, 2, 3], dtype="Int64")
+    right = array([1, 2, 3], dtype=right_dtype)
+    tm.assert_extension_array_equal(left, right, check_dtype=False)
diff --git a/pandas/tests/util/test_assert_frame_equal.py b/pandas/tests/util/test_assert_frame_equal.py
index fe3e1ff906919..3aa3c64923b14 100644
--- a/pandas/tests/util/test_assert_frame_equal.py
+++ b/pandas/tests/util/test_assert_frame_equal.py
@@ -260,3 +260,11 @@ def test_assert_frame_equal_interval_dtype_mismatch():
 
     with pytest.raises(AssertionError, match=msg):
         tm.assert_frame_equal(left, right, check_dtype=True)
+
+
+@pytest.mark.parametrize("right_dtype", ["Int32", "int64"])
+def test_assert_frame_equal_ignore_extension_dtype_mismatch(right_dtype):
+    # https://github.com/pandas-dev/pandas/issues/35715
+    left = pd.DataFrame({"a": [1, 2, 3]}, dtype="Int64")
+    right = pd.DataFrame({"a": [1, 2, 3]}, dtype=right_dtype)
+    tm.assert_frame_equal(left, right, check_dtype=False)
diff --git a/pandas/tests/util/test_assert_series_equal.py b/pandas/tests/util/test_assert_series_equal.py
index a7b5aeac560e4..f3c66052b1904 100644
--- a/pandas/tests/util/test_assert_series_equal.py
+++ b/pandas/tests/util/test_assert_series_equal.py
@@ -296,3 +296,11 @@ def test_series_equal_exact_for_nonnumeric():
         tm.assert_series_equal(s1, s3, check_exact=True)
     with pytest.raises(AssertionError):
         tm.assert_series_equal(s3, s1, check_exact=True)
+
+
+@pytest.mark.parametrize("right_dtype", ["Int32", "int64"])
+def test_assert_series_equal_ignore_extension_dtype_mismatch(right_dtype):
+    # https://github.com/pandas-dev/pandas/issues/35715
+    left = pd.Series([1, 2, 3], dtype="Int64")
+    right = pd.Series([1, 2, 3], dtype=right_dtype)
+    tm.assert_series_equal(left, right, check_dtype=False)