From 579ad355beaf69900c038f2a2f0d6427aa0fb488 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 21 Nov 2021 15:15:30 +0100 Subject: [PATCH 1/4] PERF: avoid copy in concatenate_array_managers if reindex already copies --- pandas/core/internals/concat.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 7687e60db8552..016e0c1505b00 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -76,12 +76,17 @@ def _concatenate_array_managers( """ # reindex all arrays mgrs = [] + axis1_needs_copy = False for mgr, indexers in mgrs_indexers: + axis1_needs_copy_this = True for ax, indexer in indexers.items(): mgr = mgr.reindex_indexer( axes[ax], indexer, axis=ax, allow_dups=True, use_na_proxy=True ) + if ax == 1 and indexer is not None: + axis1_needs_copy_this = False mgrs.append(mgr) + axis1_needs_copy = axis1_needs_copy or axis1_needs_copy_this if concat_axis == 1: # concatting along the rows -> concat the reindexed arrays @@ -94,7 +99,7 @@ def _concatenate_array_managers( # concatting along the columns -> combine reindexed arrays in a single manager assert concat_axis == 0 arrays = list(itertools.chain.from_iterable([mgr.arrays for mgr in mgrs])) - if copy: + if copy and axis1_needs_copy_this: arrays = [x.copy() for x in arrays] new_mgr = ArrayManager(arrays, [axes[1], axes[0]], verify_integrity=False) From 995bd1c54af6f70d57fe107ad33227c4d2c1fd1e Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 3 Dec 2021 15:04:14 +0100 Subject: [PATCH 2/4] simplify --- pandas/core/internals/array_manager.py | 2 +- pandas/core/internals/concat.py | 11 +++++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 1cd9fe65407ba..7606d966e278a 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -528,7 +528,7 @@ def copy_func(ax): new_arrays = [arr.copy() for arr in self.arrays] else: new_arrays = self.arrays - return type(self)(new_arrays, new_axes) + return type(self)(new_arrays, new_axes, verify_integrity=False) def reindex_indexer( self: T, diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 016e0c1505b00..f4ae778f7169b 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -76,17 +76,18 @@ def _concatenate_array_managers( """ # reindex all arrays mgrs = [] - axis1_needs_copy = False for mgr, indexers in mgrs_indexers: - axis1_needs_copy_this = True + axis1_needs_copy = True for ax, indexer in indexers.items(): mgr = mgr.reindex_indexer( axes[ax], indexer, axis=ax, allow_dups=True, use_na_proxy=True ) if ax == 1 and indexer is not None: - axis1_needs_copy_this = False + axis1_needs_copy = False + if copy and concat_axis == 0 and axis1_needs_copy: + # for concat_axis 1 we will always get a copy through concat_arrays + mgr = mgr.copy() mgrs.append(mgr) - axis1_needs_copy = axis1_needs_copy or axis1_needs_copy_this if concat_axis == 1: # concatting along the rows -> concat the reindexed arrays @@ -99,8 +100,6 @@ def _concatenate_array_managers( # concatting along the columns -> combine reindexed arrays in a single manager assert concat_axis == 0 arrays = list(itertools.chain.from_iterable([mgr.arrays for mgr in mgrs])) - if copy and axis1_needs_copy_this: - arrays = [x.copy() for x in arrays] new_mgr = ArrayManager(arrays, [axes[1], axes[0]], verify_integrity=False) return new_mgr From 5977399d267439dac34c19dee9c966875867e352 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 4 Dec 2021 09:10:41 +0100 Subject: [PATCH 3/4] needs_copy -> made_copy --- pandas/core/internals/concat.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index f4ae778f7169b..31f6493fab62c 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -77,14 +77,14 @@ def _concatenate_array_managers( # reindex all arrays mgrs = [] for mgr, indexers in mgrs_indexers: - axis1_needs_copy = True + axis1_made_copy = False for ax, indexer in indexers.items(): mgr = mgr.reindex_indexer( axes[ax], indexer, axis=ax, allow_dups=True, use_na_proxy=True ) if ax == 1 and indexer is not None: - axis1_needs_copy = False - if copy and concat_axis == 0 and axis1_needs_copy: + axis1_made_copy = True + if copy and concat_axis == 0 and not axis1_made_copy: # for concat_axis 1 we will always get a copy through concat_arrays mgr = mgr.copy() mgrs.append(mgr) From 93d12fc9d9430fe3ad36e1da35b23808fc2903a7 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 6 Dec 2021 10:44:26 +0100 Subject: [PATCH 4/4] fixup copy --- pandas/core/internals/array_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 26a9af04b4a00..598974979fefb 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -527,7 +527,7 @@ def copy_func(ax): if deep: new_arrays = [arr.copy() for arr in self.arrays] else: - new_arrays = self.arrays + new_arrays = list(self.arrays) return type(self)(new_arrays, new_axes, verify_integrity=False) def reindex_indexer(