Skip to content

Commit 3aa9873

Browse files
authored
REF: de-duplicate Manager concat paths (#52771)
* REF: de-duplicate Manager concat paths * comment * mypy fixup
1 parent 233152d commit 3aa9873

File tree

2 files changed

+39
-55
lines changed

2 files changed

+39
-55
lines changed

pandas/core/internals/concat.py

Lines changed: 34 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@
7272

7373

7474
def _concatenate_array_managers(
75-
mgrs_indexers, axes: list[Index], concat_axis: AxisInt, copy: bool
75+
mgrs: list[Manager], axes: list[Index], concat_axis: AxisInt
7676
) -> Manager:
7777
"""
7878
Concatenate array managers into one.
@@ -82,27 +82,11 @@ def _concatenate_array_managers(
8282
mgrs_indexers : list of (ArrayManager, {axis: indexer,...}) tuples
8383
axes : list of Index
8484
concat_axis : int
85-
copy : bool
8685
8786
Returns
8887
-------
8988
ArrayManager
9089
"""
91-
# reindex all arrays
92-
mgrs = []
93-
for mgr, indexers in mgrs_indexers:
94-
axis1_made_copy = False
95-
for ax, indexer in indexers.items():
96-
mgr = mgr.reindex_indexer(
97-
axes[ax], indexer, axis=ax, allow_dups=True, use_na_proxy=True
98-
)
99-
if ax == 1 and indexer is not None:
100-
axis1_made_copy = True
101-
if copy and concat_axis == 0 and not axis1_made_copy:
102-
# for concat_axis 1 we will always get a copy through concat_arrays
103-
mgr = mgr.copy()
104-
mgrs.append(mgr)
105-
10690
if concat_axis == 1:
10791
# concatting along the rows -> concat the reindexed arrays
10892
# TODO(ArrayManager) doesn't yet preserve the correct dtype
@@ -192,9 +176,18 @@ def concatenate_managers(
192176
-------
193177
BlockManager
194178
"""
179+
180+
needs_copy = copy and concat_axis == 0
181+
195182
# TODO(ArrayManager) this assumes that all managers are of the same type
196183
if isinstance(mgrs_indexers[0][0], ArrayManager):
197-
return _concatenate_array_managers(mgrs_indexers, axes, concat_axis, copy)
184+
mgrs = _maybe_reindex_columns_na_proxy(axes, mgrs_indexers, needs_copy)
185+
# error: Argument 1 to "_concatenate_array_managers" has incompatible
186+
# type "List[BlockManager]"; expected "List[Union[ArrayManager,
187+
# SingleArrayManager, BlockManager, SingleBlockManager]]"
188+
return _concatenate_array_managers(
189+
mgrs, axes, concat_axis # type: ignore[arg-type]
190+
)
198191

199192
# Assertions disabled for performance
200193
# for tup in mgrs_indexers:
@@ -203,7 +196,8 @@ def concatenate_managers(
203196
# assert concat_axis not in indexers
204197

205198
if concat_axis == 0:
206-
return _concat_managers_axis0(mgrs_indexers, axes, copy)
199+
mgrs = _maybe_reindex_columns_na_proxy(axes, mgrs_indexers, needs_copy)
200+
return _concat_managers_axis0(mgrs, axes)
207201

208202
if len(mgrs_indexers) > 0 and mgrs_indexers[0][0].nblocks > 0:
209203
first_dtype = mgrs_indexers[0][0].blocks[0].dtype
@@ -220,19 +214,15 @@ def concatenate_managers(
220214
nb = _concat_homogeneous_fastpath(mgrs_indexers, shape, first_dtype)
221215
return BlockManager((nb,), axes)
222216

223-
mgrs_indexers = _maybe_reindex_columns_na_proxy(axes, mgrs_indexers)
224-
if len(mgrs_indexers) == 1:
225-
mgr, indexers = mgrs_indexers[0]
226-
# Assertion correct but disabled for perf:
227-
# assert not indexers
228-
if copy:
229-
out = mgr.copy(deep=True)
230-
else:
231-
out = mgr.copy(deep=False)
217+
mgrs = _maybe_reindex_columns_na_proxy(axes, mgrs_indexers, needs_copy)
218+
219+
if len(mgrs) == 1:
220+
mgr = mgrs[0]
221+
out = mgr.copy(deep=False)
232222
out.axes = axes
233223
return out
234224

235-
concat_plan = _get_combined_plan([mgr for mgr, _ in mgrs_indexers])
225+
concat_plan = _get_combined_plan(mgrs)
236226

237227
blocks = []
238228
values: ArrayLike
@@ -277,35 +267,20 @@ def concatenate_managers(
277267
return BlockManager(tuple(blocks), axes)
278268

279269

280-
def _concat_managers_axis0(
281-
mgrs_indexers, axes: list[Index], copy: bool
282-
) -> BlockManager:
270+
def _concat_managers_axis0(mgrs: list[BlockManager], axes: list[Index]) -> BlockManager:
283271
"""
284272
concat_managers specialized to concat_axis=0, with reindexing already
285273
having been done in _maybe_reindex_columns_na_proxy.
286274
"""
287-
had_reindexers = {
288-
i: len(mgrs_indexers[i][1]) > 0 for i in range(len(mgrs_indexers))
289-
}
290-
mgrs_indexers = _maybe_reindex_columns_na_proxy(axes, mgrs_indexers)
291-
292-
mgrs: list[BlockManager] = [x[0] for x in mgrs_indexers]
293275

294276
offset = 0
295277
blocks: list[Block] = []
296278
for i, mgr in enumerate(mgrs):
297-
# If we already reindexed, then we definitely don't need another copy
298-
made_copy = had_reindexers[i]
299-
300279
for blk in mgr.blocks:
301-
if made_copy:
302-
nb = blk.copy(deep=False)
303-
elif copy:
304-
nb = blk.copy()
305-
else:
306-
# by slicing instead of copy(deep=False), we get a new array
307-
# object, see test_concat_copy
308-
nb = blk.getitem_block(slice(None))
280+
# We need to do getitem_block here otherwise we would be altering
281+
# blk.mgr_locs in place, which would render it invalid. This is only
282+
# relevant in the copy=False case.
283+
nb = blk.getitem_block(slice(None))
309284
nb._mgr_locs = nb._mgr_locs.add(offset)
310285
blocks.append(nb)
311286

@@ -316,16 +291,18 @@ def _concat_managers_axis0(
316291

317292

318293
def _maybe_reindex_columns_na_proxy(
319-
axes: list[Index], mgrs_indexers: list[tuple[BlockManager, dict[int, np.ndarray]]]
320-
) -> list[tuple[BlockManager, dict[int, np.ndarray]]]:
294+
axes: list[Index],
295+
mgrs_indexers: list[tuple[BlockManager, dict[int, np.ndarray]]],
296+
needs_copy: bool,
297+
) -> list[BlockManager]:
321298
"""
322299
Reindex along columns so that all of the BlockManagers being concatenated
323300
have matching columns.
324301
325302
Columns added in this reindexing have dtype=np.void, indicating they
326303
should be ignored when choosing a column's final dtype.
327304
"""
328-
new_mgrs_indexers: list[tuple[BlockManager, dict[int, np.ndarray]]] = []
305+
new_mgrs = []
329306

330307
for mgr, indexers in mgrs_indexers:
331308
# For axis=0 (i.e. columns) we use_na_proxy and only_slice, so this
@@ -340,8 +317,11 @@ def _maybe_reindex_columns_na_proxy(
340317
allow_dups=True,
341318
use_na_proxy=True, # only relevant for i==0
342319
)
343-
new_mgrs_indexers.append((mgr, {}))
344-
return new_mgrs_indexers
320+
if needs_copy and not indexers:
321+
mgr = mgr.copy()
322+
323+
new_mgrs.append(mgr)
324+
return new_mgrs
345325

346326

347327
def _is_homogeneous_mgr(mgr: BlockManager, first_dtype: DtypeObj) -> bool:

pandas/tests/reshape/concat/test_concat.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,11 @@ def test_concat_copy(self, using_array_manager, using_copy_on_write):
6161

6262
if not using_copy_on_write:
6363
for arr in result._mgr.arrays:
64-
assert arr.base is None
64+
assert not any(
65+
np.shares_memory(arr, y)
66+
for x in [df, df2, df3]
67+
for y in x._mgr.arrays
68+
)
6569
else:
6670
for arr in result._mgr.arrays:
6771
assert arr.base is not None

0 commit comments

Comments
 (0)