Skip to content

Commit 6cd1c4d

Browse files
[ArrayManager] Implement concat with reindexing
1 parent cb1486c commit 6cd1c4d

File tree

13 files changed

+256
-42
lines changed

13 files changed

+256
-42
lines changed

.github/workflows/ci.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,3 +157,4 @@ jobs:
157157
run: |
158158
source activate pandas-dev
159159
pytest pandas/tests/frame/methods --array-manager
160+
pytest pandas/tests/reshape/concat/ --array-manager

pandas/core/dtypes/concat.py

Lines changed: 158 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,19 @@
55

66
import numpy as np
77

8+
from pandas._libs import NaT, lib
89
from pandas._typing import ArrayLike, DtypeObj
910

1011
from pandas.core.dtypes.cast import find_common_type
1112
from pandas.core.dtypes.common import (
13+
is_bool_dtype,
1214
is_categorical_dtype,
15+
is_datetime64_ns_dtype,
1316
is_dtype_equal,
1417
is_extension_array_dtype,
18+
is_integer_dtype,
1519
is_sparse,
20+
is_timedelta64_ns_dtype,
1621
)
1722
from pandas.core.dtypes.generic import ABCCategoricalIndex, ABCSeries
1823

@@ -21,11 +26,78 @@
2126
from pandas.core.construction import array, ensure_wrapped_if_datetimelike
2227

2328

29+
class NullArrayProxy:
    """
    Placeholder standing in for an array that is entirely NA.

    The proxy records only the length of the array, never a dtype: the dtype
    is decided at concatenation time, once the common dtype of the real
    arrays is known (the proxy itself is excluded from that computation).
    This spares internals/concat.py from having to pick a proper dtype and
    array class up front.
    """

    # the proxy always stands in for a 1-dimensional array
    ndim = 1

    def __init__(self, n: int):
        # number of (missing) elements the eventual array will hold
        self.n = n

    @property
    def shape(self):
        # mimic the numpy / ExtensionArray ``shape`` attribute
        return (self.n,)
48+
49+
50+
def _array_from_proxy(arr, dtype: DtypeObj, fill_value=lib.no_default):
51+
"""
52+
Helper function to create the actual all-NA array from the NullArrayProxy object.
53+
54+
Parameters
55+
----------
56+
arr : NullArrayProxy
57+
dtype : the dtype for the resulting array
58+
fill_value : scalar NA-like value
59+
By default uses the ExtensionDtype's na_value or np.nan. For numpy
60+
arrays, this can be overridden to be something else (eg None).
61+
62+
Returns
63+
-------
64+
np.ndarray or ExtensionArray
65+
"""
66+
if is_extension_array_dtype(dtype):
67+
return dtype.construct_array_type()._from_sequence(
68+
[dtype.na_value] * arr.n, dtype=dtype
69+
)
70+
elif is_datetime64_ns_dtype(dtype):
71+
from pandas.core.arrays import DatetimeArray
72+
73+
return DatetimeArray._from_sequence([NaT] * arr.n, dtype=dtype)
74+
elif is_timedelta64_ns_dtype(dtype):
75+
from pandas.core.arrays import TimedeltaArray
76+
77+
return TimedeltaArray._from_sequence([NaT] * arr.n, dtype=dtype)
78+
else:
79+
if is_integer_dtype(dtype):
80+
dtype = "float64"
81+
fill_value = np.nan
82+
elif is_bool_dtype(dtype):
83+
dtype = object
84+
85+
if fill_value is lib.no_default:
86+
fill_value = np.nan
87+
88+
arr = np.empty(arr.n, dtype=dtype)
89+
arr.fill(fill_value)
90+
return arr
91+
92+
2493
def _cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike:
2594
"""
2695
Helper function for `arr.astype(common_dtype)` but handling all special
2796
cases.
2897
"""
98+
if isinstance(arr, NullArrayProxy):
99+
return _array_from_proxy(arr, dtype)
100+
29101
if (
30102
is_categorical_dtype(arr.dtype)
31103
and isinstance(dtype, np.dtype)
@@ -132,6 +204,75 @@ def is_nonempty(x) -> bool:
132204
return np.concatenate(to_concat, axis=axis)
133205

134206

207+
def concat_arrays(to_concat):
    """
    Alternative for concat_compat but specialized for use in the ArrayManager.

    Differences: only deals with 1D arrays (no axis keyword) and does not skip
    empty arrays to determine the dtype.
    In addition ensures that all NullArrayProxies get replaced with actual
    arrays.

    Parameters
    ----------
    to_concat : list of arrays

    Returns
    -------
    np.ndarray or ExtensionArray
    """
    # proxies carry no dtype information, so leave them out of the
    # common-dtype determination
    real_arrays = [x for x in to_concat if not isinstance(x, NullArrayProxy)]

    dtype_kinds = {x.dtype.kind for x in real_arrays}
    is_uniform = len({x.dtype for x in real_arrays}) == 1
    has_ea = any(is_extension_array_dtype(x.dtype) for x in real_arrays)

    def _fill_proxies(arrays, dtype):
        # replace every proxy with an actual all-NA array of ``dtype``
        return [
            _array_from_proxy(x, dtype) if isinstance(x, NullArrayProxy) else x
            for x in arrays
        ]

    if has_ea:
        if is_uniform:
            to_concat = _fill_proxies(to_concat, real_arrays[0].dtype)
        else:
            common = find_common_type([x.dtype for x in real_arrays])
            # casting also materializes any proxies to ``common``
            to_concat = [_cast_to_common_type(x, common) for x in to_concat]

        first = to_concat[0]
        if isinstance(first, ExtensionArray):
            return type(first)._concat_same_type(to_concat)
        return np.concatenate(to_concat)

    if dtype_kinds & {"m", "M"}:
        # datetime64/timedelta64 have their own concatenation logic
        return _concat_datetime(to_concat)

    if is_uniform:
        common = real_arrays[0].dtype
    else:
        common = np.find_common_type([x.dtype for x in real_arrays], [])

    result = np.concatenate(_fill_proxies(to_concat, common))

    # TODO(ArrayManager) this is currently inconsistent between Series and DataFrame
    # so we should decide whether to keep the below special case or remove it
    if len(result) == 0 and len(dtype_kinds) != 1 and "b" in dtype_kinds:
        # all empties with mixed kinds incl. bool -> do not coerce to float
        result = result.astype(object)
    return result
274+
275+
135276
def union_categoricals(
136277
to_union, sort_categories: bool = False, ignore_order: bool = False
137278
):
@@ -322,20 +463,35 @@ def _concat_datetime(to_concat, axis=0):
322463
a single array, preserving the combined dtypes
323464
"""
324465
to_concat = [ensure_wrapped_if_datetimelike(x) for x in to_concat]
466+
to_concat_no_proxy = [x for x in to_concat if not isinstance(x, NullArrayProxy)]
325467

326-
single_dtype = len({x.dtype for x in to_concat}) == 1
468+
single_dtype = len({x.dtype for x in to_concat_no_proxy}) == 1
327469

328470
# multiple types, need to coerce to object
329471
if not single_dtype:
330472
# ensure_wrapped_if_datetimelike ensures that astype(object) wraps
331473
# in Timestamp/Timedelta
474+
to_concat = [
475+
_array_from_proxy(arr, dtype=object, fill_value=None)
476+
if isinstance(arr, NullArrayProxy)
477+
else arr
478+
for arr in to_concat
479+
]
480+
332481
return _concatenate_2d([x.astype(object) for x in to_concat], axis=axis)
333482

334483
if axis == 1:
335484
# TODO(EA2D): kludge not necessary with 2D EAs
336485
to_concat = [x.reshape(1, -1) if x.ndim == 1 else x for x in to_concat]
486+
else:
487+
to_concat = [
488+
_array_from_proxy(arr, dtype=to_concat_no_proxy[0].dtype)
489+
if isinstance(arr, NullArrayProxy)
490+
else arr
491+
for arr in to_concat
492+
]
337493

338-
result = type(to_concat[0])._concat_same_type(to_concat, axis=axis)
494+
result = type(to_concat_no_proxy[0])._concat_same_type(to_concat, axis=axis)
339495

340496
if result.ndim == 2 and is_extension_array_dtype(result.dtype):
341497
# TODO(EA2D): kludge not necessary with 2D EAs

pandas/core/internals/array_manager.py

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
is_extension_array_dtype,
1919
is_numeric_dtype,
2020
)
21+
from pandas.core.dtypes.concat import NullArrayProxy
2122
from pandas.core.dtypes.dtypes import ExtensionDtype, PandasDtype
2223
from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries
2324
from pandas.core.dtypes.missing import isna
@@ -725,10 +726,20 @@ def reindex_indexer(
725726
# ignored keywords
726727
consolidate: bool = True,
727728
only_slice: bool = False,
729+
# ArrayManager specific keywords
730+
do_integrity_check=True,
731+
use_na_proxy=False,
728732
) -> T:
729733
axis = self._normalize_axis(axis)
730734
return self._reindex_indexer(
731-
new_axis, indexer, axis, fill_value, allow_dups, copy
735+
new_axis,
736+
indexer,
737+
axis,
738+
fill_value,
739+
allow_dups,
740+
copy,
741+
do_integrity_check,
742+
use_na_proxy,
732743
)
733744

734745
def _reindex_indexer(
@@ -739,6 +750,8 @@ def _reindex_indexer(
739750
fill_value=None,
740751
allow_dups: bool = False,
741752
copy: bool = True,
753+
do_integrity_check=True,
754+
use_na_proxy=False,
742755
) -> T:
743756
"""
744757
Parameters
@@ -773,7 +786,9 @@ def _reindex_indexer(
773786
new_arrays = []
774787
for i in indexer:
775788
if i == -1:
776-
arr = self._make_na_array(fill_value=fill_value)
789+
arr = self._make_na_array(
790+
fill_value=fill_value, use_na_proxy=use_na_proxy
791+
)
777792
else:
778793
arr = self.arrays[i]
779794
new_arrays.append(arr)
@@ -793,7 +808,7 @@ def _reindex_indexer(
793808
new_axes = list(self._axes)
794809
new_axes[axis] = new_axis
795810

796-
return type(self)(new_arrays, new_axes)
811+
return type(self)(new_arrays, new_axes, do_integrity_check=do_integrity_check)
797812

798813
def take(self, indexer, axis: int = 1, verify: bool = True, convert: bool = True):
799814
"""
@@ -820,10 +835,11 @@ def take(self, indexer, axis: int = 1, verify: bool = True, convert: bool = True
820835
new_axis=new_labels, indexer=indexer, axis=axis, allow_dups=True
821836
)
822837

823-
def _make_na_array(self, fill_value=None):
838+
def _make_na_array(self, fill_value=None, use_na_proxy=False):
839+
if use_na_proxy:
840+
return NullArrayProxy(self.shape_proper[0])
824841
if fill_value is None:
825842
fill_value = np.nan
826-
827843
dtype, fill_value = infer_dtype_from_scalar(fill_value)
828844
values = np.empty(self.shape_proper[0], dtype=dtype)
829845
values.fill(fill_value)

pandas/core/internals/concat.py

Lines changed: 41 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
is_sparse,
2424
is_timedelta64_dtype,
2525
)
26-
from pandas.core.dtypes.concat import concat_compat
26+
from pandas.core.dtypes.concat import concat_arrays, concat_compat
2727
from pandas.core.dtypes.missing import isna_all
2828

2929
import pandas.core.algorithms as algos
@@ -37,6 +37,45 @@
3737
from pandas.core.arrays.sparse.dtype import SparseDtype
3838

3939

40+
def concatenate_array_managers(
    mgrs_indexers, axes: List[Index], concat_axis: int, copy: bool
) -> Manager:
    """
    Concatenate array managers into one.

    Parameters
    ----------
    mgrs_indexers : list of (ArrayManager, {axis: indexer,...}) tuples
    axes : list of Index
    concat_axis : int
    copy : bool

    Returns
    -------
    ArrayManager
    """
    # First reindex every manager onto the target axes; rows/columns that do
    # not exist in a manager become NullArrayProxy placeholders instead of
    # materialized all-NA arrays.
    reindexed = []
    for mgr, indexers in mgrs_indexers:
        for ax, indexer in indexers.items():
            mgr = mgr.reindex_indexer(
                axes[ax], indexer, axis=ax, do_integrity_check=False, use_na_proxy=True
            )
        reindexed.append(mgr)

    if concat_axis == 1:
        # concatenating along the rows -> stitch the j-th array of every
        # manager into one new array per column
        n_columns = len(reindexed[0].arrays)
        arrays = [
            concat_arrays([mgr.arrays[j] for mgr in reindexed])
            for j in range(n_columns)
        ]
        return ArrayManager(arrays, [axes[1], axes[0]], do_integrity_check=False)
    elif concat_axis == 0:
        # concatenating along the columns -> simply collect all arrays
        arrays = list(itertools.chain.from_iterable(mgr.arrays for mgr in reindexed))
        return ArrayManager(arrays, [axes[1], axes[0]], do_integrity_check=False)
77+
78+
4079
def concatenate_block_managers(
4180
mgrs_indexers, axes: List[Index], concat_axis: int, copy: bool
4281
) -> Manager:
@@ -55,19 +94,7 @@ def concatenate_block_managers(
5594
BlockManager
5695
"""
5796
if isinstance(mgrs_indexers[0][0], ArrayManager):
58-
59-
if concat_axis == 1:
60-
# TODO for now only fastpath without indexers
61-
mgrs = [t[0] for t in mgrs_indexers]
62-
arrays = [
63-
concat_compat([mgrs[i].arrays[j] for i in range(len(mgrs))], axis=0)
64-
for j in range(len(mgrs[0].arrays))
65-
]
66-
return ArrayManager(arrays, [axes[1], axes[0]])
67-
elif concat_axis == 0:
68-
mgrs = [t[0] for t in mgrs_indexers]
69-
arrays = list(itertools.chain.from_iterable([mgr.arrays for mgr in mgrs]))
70-
return ArrayManager(arrays, [axes[1], axes[0]])
97+
return concatenate_array_managers(mgrs_indexers, axes, concat_axis, copy)
7198

7299
concat_plans = [
73100
_get_mgr_concatenation_plan(mgr, indexers) for mgr, indexers in mgrs_indexers

0 commit comments

Comments
 (0)