From 9103ba4392da4acdeab22d7c9d30f76a45e787ae Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 18 Apr 2023 16:20:46 -0700 Subject: [PATCH 1/3] REF: de-duplicate Manager concat paths --- pandas/core/internals/concat.py | 69 +++++++--------------- pandas/tests/reshape/concat/test_concat.py | 6 +- 2 files changed, 26 insertions(+), 49 deletions(-) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 2f66fdae8ded5..f4bb5bd6c0b30 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -69,7 +69,7 @@ def _concatenate_array_managers( - mgrs_indexers, axes: list[Index], concat_axis: AxisInt, copy: bool + mgrs: list[Manager], axes: list[Index], concat_axis: AxisInt ) -> Manager: """ Concatenate array managers into one. @@ -79,27 +79,11 @@ def _concatenate_array_managers( mgrs_indexers : list of (ArrayManager, {axis: indexer,...}) tuples axes : list of Index concat_axis : int - copy : bool Returns ------- ArrayManager """ - # reindex all arrays - mgrs = [] - for mgr, indexers in mgrs_indexers: - axis1_made_copy = False - for ax, indexer in indexers.items(): - mgr = mgr.reindex_indexer( - axes[ax], indexer, axis=ax, allow_dups=True, use_na_proxy=True - ) - if ax == 1 and indexer is not None: - axis1_made_copy = True - if copy and concat_axis == 0 and not axis1_made_copy: - # for concat_axis 1 we will always get a copy through concat_arrays - mgr = mgr.copy() - mgrs.append(mgr) - if concat_axis == 1: # concatting along the rows -> concat the reindexed arrays # TODO(ArrayManager) doesn't yet preserve the correct dtype @@ -189,9 +173,13 @@ def concatenate_managers( ------- BlockManager """ + + needs_copy = copy and concat_axis == 0 + mgrs = _maybe_reindex_columns_na_proxy(axes, mgrs_indexers, needs_copy) + # TODO(ArrayManager) this assumes that all managers are of the same type - if isinstance(mgrs_indexers[0][0], ArrayManager): - return _concatenate_array_managers(mgrs_indexers, axes, concat_axis, copy) + if isinstance(mgrs[0], ArrayManager): + return _concatenate_array_managers(mgrs, axes, concat_axis) # Assertions disabled for performance # for tup in mgrs_indexers: @@ -200,11 +188,9 @@ def concatenate_managers( # assert concat_axis not in indexers if concat_axis == 0: - return _concat_managers_axis0(mgrs_indexers, axes, copy) - - mgrs_indexers = _maybe_reindex_columns_na_proxy(axes, mgrs_indexers) + return _concat_managers_axis0(mgrs, axes) - concat_plan = _get_combined_plan([mgr for mgr, _ in mgrs_indexers]) + concat_plan = _get_combined_plan(mgrs) blocks = [] @@ -255,35 +241,17 @@ def concatenate_managers( return BlockManager(tuple(blocks), axes) -def _concat_managers_axis0( - mgrs_indexers, axes: list[Index], copy: bool -) -> BlockManager: +def _concat_managers_axis0(mgrs: list[BlockManager], axes: list[Index]) -> BlockManager: """ concat_managers specialized to concat_axis=0, with reindexing already having been done in _maybe_reindex_columns_na_proxy. """ - had_reindexers = { - i: len(mgrs_indexers[i][1]) > 0 for i in range(len(mgrs_indexers)) - } - mgrs_indexers = _maybe_reindex_columns_na_proxy(axes, mgrs_indexers) - - mgrs: list[BlockManager] = [x[0] for x in mgrs_indexers] offset = 0 blocks: list[Block] = [] for i, mgr in enumerate(mgrs): - # If we already reindexed, then we definitely don't need another copy - made_copy = had_reindexers[i] - for blk in mgr.blocks: - if made_copy: - nb = blk.copy(deep=False) - elif copy: - nb = blk.copy() - else: - # by slicing instead of copy(deep=False), we get a new array - # object, see test_concat_copy - nb = blk.getitem_block(slice(None)) + nb = blk.getitem_block(slice(None)) nb._mgr_locs = nb._mgr_locs.add(offset) blocks.append(nb) @@ -294,8 +262,10 @@ def _concat_managers_axis0( def _maybe_reindex_columns_na_proxy( - axes: list[Index], mgrs_indexers: list[tuple[BlockManager, dict[int, np.ndarray]]] -) -> list[tuple[BlockManager, dict[int, np.ndarray]]]: + axes: list[Index], + mgrs_indexers: list[tuple[BlockManager, dict[int, np.ndarray]]], + needs_copy: bool, +) -> list[BlockManager]: """ Reindex along columns so that all of the BlockManagers being concatenated have matching columns. @@ -303,7 +273,7 @@ def _maybe_reindex_columns_na_proxy( Columns added in this reindexing have dtype=np.void, indicating they should be ignored when choosing a column's final dtype. """ - new_mgrs_indexers: list[tuple[BlockManager, dict[int, np.ndarray]]] = [] + new_mgrs = [] for mgr, indexers in mgrs_indexers: # For axis=0 (i.e. columns) we use_na_proxy and only_slice, so this @@ -318,8 +288,11 @@ def _maybe_reindex_columns_na_proxy( allow_dups=True, use_na_proxy=True, # only relevant for i==0 ) - new_mgrs_indexers.append((mgr, {})) - return new_mgrs_indexers + if needs_copy and not indexers: + mgr = mgr.copy() + + new_mgrs.append(mgr) + return new_mgrs def _get_combined_plan( diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index 41492d7df53da..ef02e9f7a465a 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -61,7 +61,11 @@ def test_concat_copy(self, using_array_manager, using_copy_on_write): if not using_copy_on_write: for arr in result._mgr.arrays: - assert arr.base is None + assert not any( + np.shares_memory(arr, y) + for x in [df, df2, df3] + for y in x._mgr.arrays + ) else: for arr in result._mgr.arrays: assert arr.base is not None From a1dedb4f7c0e6808528a7a8789baa9b599353554 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 18 Apr 2023 16:23:52 -0700 Subject: [PATCH 2/3] comment --- pandas/core/internals/concat.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index f4bb5bd6c0b30..001a2eb932605 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -251,6 +251,9 @@ def _concat_managers_axis0(mgrs: list[BlockManager], axes: list[Index]) -> Block blocks: list[Block] = [] for i, mgr in enumerate(mgrs): for blk in mgr.blocks: + # We need to do getitem_block here otherwise we would be altering + # blk.mgr_locs in place, which would render it invalid. This is only + # relevant in the copy=False case. nb = blk.getitem_block(slice(None)) nb._mgr_locs = nb._mgr_locs.add(offset) blocks.append(nb) From ee83630a179f5802c27a4fc021370545ce5e3fbe Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 19 Apr 2023 13:25:54 -0700 Subject: [PATCH 3/3] mypy fixup --- pandas/core/internals/concat.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 38e036fa78e28..d012a445bf834 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -181,7 +181,12 @@ def concatenate_managers( # TODO(ArrayManager) this assumes that all managers are of the same type if isinstance(mgrs_indexers[0][0], ArrayManager): mgrs = _maybe_reindex_columns_na_proxy(axes, mgrs_indexers, needs_copy) - return _concatenate_array_managers(mgrs, axes, concat_axis) + # error: Argument 1 to "_concatenate_array_managers" has incompatible + # type "List[BlockManager]"; expected "List[Union[ArrayManager, + # SingleArrayManager, BlockManager, SingleBlockManager]]" + return _concatenate_array_managers( + mgrs, axes, concat_axis # type: ignore[arg-type] + ) # Assertions disabled for performance # for tup in mgrs_indexers: