|
5 | 5 |
|
6 | 6 | import numpy as np
|
7 | 7 |
|
| 8 | +from pandas._libs import NaT, lib |
8 | 9 | from pandas._typing import ArrayLike, DtypeObj
|
9 | 10 |
|
10 | 11 | from pandas.core.dtypes.cast import find_common_type
|
11 | 12 | from pandas.core.dtypes.common import (
|
| 13 | + is_bool_dtype, |
12 | 14 | is_categorical_dtype,
|
| 15 | + is_datetime64_ns_dtype, |
13 | 16 | is_dtype_equal,
|
14 | 17 | is_extension_array_dtype,
|
| 18 | + is_integer_dtype, |
15 | 19 | is_sparse,
|
| 20 | + is_timedelta64_ns_dtype, |
16 | 21 | )
|
17 | 22 | from pandas.core.dtypes.generic import ABCCategoricalIndex, ABCSeries
|
18 | 23 |
|
|
21 | 26 | from pandas.core.construction import array, ensure_wrapped_if_datetimelike
|
22 | 27 |
|
23 | 28 |
|
class NullArrayProxy:
    """
    Proxy object for an all-NA array.

    Only stores the length of the array, and not the dtype. The dtype
    will only be known when actually concatenating (after determining the
    common dtype, for which this proxy is ignored).
    Using this object avoids that the internals/concat.py needs to determine
    the proper dtype and array type.

    Parameters
    ----------
    n : int
        Number of elements of the all-NA array this proxy stands in for.
    """

    # The proxy always presents itself as a 1-dimensional array.
    ndim = 1

    def __init__(self, n: int):
        self.n = n

    def __repr__(self) -> str:
        # Show the only piece of state the proxy carries, to ease debugging.
        return f"{type(self).__name__}(n={self.n})"

    @property
    def shape(self):
        """Shape of the represented array: a 1-tuple holding the length."""
        return (self.n,)
| 48 | + |
| 49 | + |
def _array_from_proxy(arr, dtype: DtypeObj, fill_value=lib.no_default):
    """
    Helper function to create the actual all-NA array from the NullArrayProxy object.

    Parameters
    ----------
    arr : NullArrayProxy
        Only its ``n`` attribute (the length) is read.
    dtype : the dtype for the resulting array
    fill_value : scalar NA-like value
        By default uses the ExtensionDtype's na_value or np.nan. For numpy
        arrays, this can be overridden to be something else (eg None).
        Ignored for extension, datetime64[ns], timedelta64[ns] and integer
        dtypes.

    Returns
    -------
    np.ndarray or ExtensionArray
        Array of length ``arr.n`` holding only missing values.
    """
    length = arr.n

    if is_extension_array_dtype(dtype):
        # Let the extension type build its own all-NA array.
        return dtype.construct_array_type()._from_sequence(
            [dtype.na_value] * length, dtype=dtype
        )

    if is_datetime64_ns_dtype(dtype):
        from pandas.core.arrays import DatetimeArray

        return DatetimeArray._from_sequence([NaT] * length, dtype=dtype)

    if is_timedelta64_ns_dtype(dtype):
        from pandas.core.arrays import TimedeltaArray

        return TimedeltaArray._from_sequence([NaT] * length, dtype=dtype)

    # Plain numpy dtypes from here on.
    if is_integer_dtype(dtype):
        # Integers cannot hold NaN: upcast to float64 and always fill with NaN,
        # regardless of any fill_value that was passed.
        dtype = "float64"
        fill_value = np.nan
    elif is_bool_dtype(dtype):
        # Booleans cannot hold a missing value either: fall back to object.
        dtype = object

    if fill_value is lib.no_default:
        fill_value = np.nan

    # np.empty + fill (rather than np.full) so that a ``dtype`` of None keeps
    # meaning "numpy's default dtype" instead of being inferred from fill_value.
    result = np.empty(length, dtype=dtype)
    result.fill(fill_value)
    return result
| 91 | + |
| 92 | + |
24 | 93 | def _cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike:
|
25 | 94 | """
|
26 | 95 | Helper function for `arr.astype(common_dtype)` but handling all special
|
27 | 96 | cases.
|
28 | 97 | """
|
| 98 | + if isinstance(arr, NullArrayProxy): |
| 99 | + return _array_from_proxy(arr, dtype) |
| 100 | + |
29 | 101 | if (
|
30 | 102 | is_categorical_dtype(arr.dtype)
|
31 | 103 | and isinstance(dtype, np.dtype)
|
@@ -132,6 +204,75 @@ def is_nonempty(x) -> bool:
|
132 | 204 | return np.concatenate(to_concat, axis=axis)
|
133 | 205 |
|
134 | 206 |
|
def _replace_proxies(arrays, dtype):
    """Return ``arrays`` with every NullArrayProxy swapped for an all-NA array."""
    return [
        _array_from_proxy(arr, dtype) if isinstance(arr, NullArrayProxy) else arr
        for arr in arrays
    ]


def concat_arrays(to_concat):
    """
    Alternative for concat_compat but specialized for use in the ArrayManager.

    Differences: only deals with 1D arrays (no axis keyword) and does not skip
    empty arrays to determine the dtype.
    In addition ensures that all NullArrayProxies get replaced with actual
    arrays.

    Parameters
    ----------
    to_concat : list of arrays
        May contain NullArrayProxy objects in addition to actual arrays.

    Returns
    -------
    np.ndarray or ExtensionArray
    """
    # ignore the all-NA proxies to determine the resulting dtype
    to_concat_no_proxy = [x for x in to_concat if not isinstance(x, NullArrayProxy)]

    dtypes = [x.dtype for x in to_concat_no_proxy]
    kinds = {dtype.kind for dtype in dtypes}
    single_dtype = len(set(dtypes)) == 1
    any_ea = any(is_extension_array_dtype(dtype) for dtype in dtypes)

    if any_ea:
        if single_dtype:
            # All real arrays already agree: only the proxies need materializing.
            to_concat = _replace_proxies(to_concat, dtypes[0])
        else:
            # Cast everything (proxies included) to the common dtype.
            target_dtype = find_common_type(dtypes)
            to_concat = [_cast_to_common_type(arr, target_dtype) for arr in to_concat]

        first = to_concat[0]
        if isinstance(first, ExtensionArray):
            return type(first)._concat_same_type(to_concat)
        return np.concatenate(to_concat)

    if kinds & {"m", "M"}:
        # datetime64/timedelta64 inputs: delegate, proxies are passed through
        # unchanged to _concat_datetime.
        return _concat_datetime(to_concat)

    if single_dtype:
        target_dtype = dtypes[0]
    else:
        # NOTE(review): np.find_common_type is deprecated as of NumPy 1.25;
        # kept here because its promotion rules differ from np.result_type.
        target_dtype = np.find_common_type(dtypes, [])
    to_concat = _replace_proxies(to_concat, target_dtype)

    result = np.concatenate(to_concat)

    # TODO(ArrayManager) this is currently inconsistent between Series and DataFrame
    # so we should decide whether to keep the below special case or remove it
    if len(result) == 0 and len(kinds) != 1 and "b" in kinds:
        # all empties -> check for bool to not coerce to float
        result = result.astype(object)
    return result
| 274 | + |
| 275 | + |
135 | 276 | def union_categoricals(
|
136 | 277 | to_union, sort_categories: bool = False, ignore_order: bool = False
|
137 | 278 | ):
|
@@ -322,20 +463,35 @@ def _concat_datetime(to_concat, axis=0):
|
322 | 463 | a single array, preserving the combined dtypes
|
323 | 464 | """
|
324 | 465 | to_concat = [ensure_wrapped_if_datetimelike(x) for x in to_concat]
|
| 466 | + to_concat_no_proxy = [x for x in to_concat if not isinstance(x, NullArrayProxy)] |
325 | 467 |
|
326 |
| - single_dtype = len({x.dtype for x in to_concat}) == 1 |
| 468 | + single_dtype = len({x.dtype for x in to_concat_no_proxy}) == 1 |
327 | 469 |
|
328 | 470 | # multiple types, need to coerce to object
|
329 | 471 | if not single_dtype:
|
330 | 472 | # ensure_wrapped_if_datetimelike ensures that astype(object) wraps
|
331 | 473 | # in Timestamp/Timedelta
|
| 474 | + to_concat = [ |
| 475 | + _array_from_proxy(arr, dtype=object, fill_value=None) |
| 476 | + if isinstance(arr, NullArrayProxy) |
| 477 | + else arr |
| 478 | + for arr in to_concat |
| 479 | + ] |
| 480 | + |
332 | 481 | return _concatenate_2d([x.astype(object) for x in to_concat], axis=axis)
|
333 | 482 |
|
334 | 483 | if axis == 1:
|
335 | 484 | # TODO(EA2D): kludge not necessary with 2D EAs
|
336 | 485 | to_concat = [x.reshape(1, -1) if x.ndim == 1 else x for x in to_concat]
|
| 486 | + else: |
| 487 | + to_concat = [ |
| 488 | + _array_from_proxy(arr, dtype=to_concat_no_proxy[0].dtype) |
| 489 | + if isinstance(arr, NullArrayProxy) |
| 490 | + else arr |
| 491 | + for arr in to_concat |
| 492 | + ] |
337 | 493 |
|
338 |
| - result = type(to_concat[0])._concat_same_type(to_concat, axis=axis) |
| 494 | + result = type(to_concat_no_proxy[0])._concat_same_type(to_concat, axis=axis) |
339 | 495 |
|
340 | 496 | if result.ndim == 2 and is_extension_array_dtype(result.dtype):
|
341 | 497 | # TODO(EA2D): kludge not necessary with 2D EAs
|
|
0 commit comments