-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
ENH: general concat with ExtensionArrays through find_common_type #33607
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
3464e95
b1d9d68
bb398e7
83fdc91
7f2ac2a
d0f90de
2d5fcb0
a68206b
fc98b65
b072591
91c984a
2a2b9d5
8893165
e19e3ef
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2349,9 +2349,9 @@ def _can_hold_na(self): | |
|
||
@classmethod | ||
def _concat_same_type(self, to_concat): | ||
from pandas.core.dtypes.concat import concat_categorical | ||
from pandas.core.dtypes.concat import union_categoricals | ||
|
||
return concat_categorical(to_concat) | ||
return union_categoricals(to_concat) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. IIRC only a relatively small part of the logic of concat_categorical/union_categoricals is actually needed here. I'd prefer for that to live here and for union_categoricals to call it, rather than the other way around (since union_categoricals handles a lot of cases). Could be considered orthogonally to this PR. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Yes, that's indeed a good idea (union_categoricals does way much more that is all not needed here) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. are you planning to update this, or is that a topic for a separate PR? |
||
|
||
def isin(self, values): | ||
""" | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,11 +1,11 @@ | ||
import numbers | ||
from typing import TYPE_CHECKING, Tuple, Type, Union | ||
from typing import TYPE_CHECKING, List, Optional, Tuple, Type, Union | ||
import warnings | ||
|
||
import numpy as np | ||
|
||
from pandas._libs import lib, missing as libmissing | ||
from pandas._typing import ArrayLike | ||
from pandas._typing import ArrayLike, DtypeObj | ||
from pandas.compat import set_function_name | ||
from pandas.util._decorators import cache_readonly | ||
|
||
|
@@ -95,6 +95,15 @@ def construct_array_type(cls) -> Type["IntegerArray"]: | |
""" | ||
return IntegerArray | ||
|
||
def _get_common_type(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. should this be common_type or common_dtype? we've been loose about this distinction so far and i think it has caused amibiguity There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't care that much. I mainly used "type", because it is meant to be used in (that There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Renamed to "common_dtype" instead of "common_type". The internal function that uses this is still There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for indulging me on this nitpick There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why is this a private method on the Dtype? get_common_type (or get_common_dtype) seems fine |
||
# for now only handle other integer types | ||
if not all(isinstance(t, _IntegerDtype) for t in dtypes): | ||
return None | ||
np_dtype = np.find_common_type([t.numpy_dtype for t in dtypes], []) | ||
if np.issubdtype(np_dtype, np.integer): | ||
return _dtypes[str(np_dtype)] | ||
return None | ||
TomAugspurger marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
def __from_arrow__( | ||
self, array: Union["pyarrow.Array", "pyarrow.ChunkedArray"] | ||
) -> "IntegerArray": | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,6 +6,7 @@ | |
|
||
import numpy as np | ||
|
||
from pandas._typing import DtypeObj | ||
from pandas.errors import AbstractMethodError | ||
|
||
from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries | ||
|
@@ -322,3 +323,33 @@ def _is_boolean(self) -> bool: | |
bool | ||
""" | ||
return False | ||
|
||
def _get_common_type(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Mmm can we keep the return type as Oh... I suppose tz-naive DatetimeArray might break this, since it wants to return a NumPy dtype... There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, that was my first thought as well. But, right now, eg Categorical can end up with any kind of numpy dtype (depending on the dtype of its categories). As long as not yet all dtypes have a EA version, I don't think it is feasible to require ExtensionDtype here |
||
""" | ||
Return the common dtype, if one exists. | ||
|
||
Used in `find_common_type` implementation. This is for example used | ||
to determine the resulting dtype in a concat operation. | ||
|
||
If no common dtype exists, return None. If all dtypes in the list | ||
will return None, then the common dtype will be "object" dtype. | ||
|
||
Parameters | ||
---------- | ||
dtypes : list of dtypes | ||
The dtypes for which to determine a common dtype. This is a list | ||
of np.dtype or ExtensionDtype instances. | ||
|
||
Returns | ||
------- | ||
Common dtype (np.dtype or ExtensionDtype) or None | ||
jorisvandenbossche marked this conversation as resolved.
Show resolved
Hide resolved
|
||
""" | ||
# QUESTIONS: | ||
# - do we guarantee that `dtypes` is already deduplicated? (list of uniques) | ||
# - do we call this method if `len(dtypes) == 1`, or does this method | ||
# need to handle that case | ||
if len(set(dtypes)) == 1: | ||
# only itself | ||
return self | ||
else: | ||
return None |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,6 +4,7 @@ | |
|
||
import numpy as np | ||
|
||
from pandas.core.dtypes.cast import find_common_type | ||
from pandas.core.dtypes.common import ( | ||
is_bool_dtype, | ||
is_categorical_dtype, | ||
|
@@ -17,6 +18,9 @@ | |
) | ||
from pandas.core.dtypes.generic import ABCCategoricalIndex, ABCRangeIndex, ABCSeries | ||
|
||
from pandas.core.arrays import ExtensionArray | ||
from pandas.core.construction import array | ||
|
||
|
||
def get_dtype_kinds(l): | ||
""" | ||
|
@@ -99,14 +103,36 @@ def is_nonempty(x) -> bool: | |
single_dtype = len({x.dtype for x in to_concat}) == 1 | ||
any_ea = any(is_extension_array_dtype(x.dtype) for x in to_concat) | ||
|
||
if any_ea and single_dtype and axis == 0: | ||
cls = type(to_concat[0]) | ||
return cls._concat_same_type(to_concat) | ||
|
||
elif "category" in typs: | ||
# this must be prior to concat_datetime, | ||
# to support Categorical + datetime-like | ||
return concat_categorical(to_concat, axis=axis) | ||
if any_ea and axis == 0: | ||
if not single_dtype: | ||
target_dtype = find_common_type([x.dtype for x in to_concat]) | ||
|
||
def cast(arr, dtype): | ||
if ( | ||
is_categorical_dtype(arr.dtype) | ||
and isinstance(dtype, np.dtype) | ||
and np.issubdtype(dtype, np.integer) | ||
): | ||
# problem case: categorical of int -> gives int as result dtype, | ||
# but categorical can contain NAs -> fall back to object dtype | ||
try: | ||
return arr.astype(dtype, copy=False) | ||
except ValueError: | ||
return arr.astype(object, copy=False) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This complication is to try to preserve some of the value-dependent behaviour of Categorical (in case of integer categories: missing values present or not?) Eg when concatting integer categorical with integer series:
Currently, when concatting, a Categorical with integer categories gets converted to int numpy array if no missing values are present, but object numpy array if missing values are present (to preserve the integers) |
||
|
||
if is_extension_array_dtype(dtype): | ||
if isinstance(arr, np.ndarray): | ||
# numpy's astype cannot handle ExtensionDtypes | ||
return array(arr, dtype=dtype, copy=False) | ||
return arr.astype(dtype, copy=False) | ||
|
||
to_concat = [cast(arr, target_dtype) for arr in to_concat] | ||
|
||
if isinstance(to_concat[0], ExtensionArray): | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
cls = type(to_concat[0]) | ||
return cls._concat_same_type(to_concat) | ||
else: | ||
return np.concatenate(to_concat) | ||
|
||
elif _contains_datetime or "timedelta" in typs or _contains_period: | ||
return concat_datetime(to_concat, axis=axis, typs=typs) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. if we do the DTA/TDA casting above, and do There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think so, because they are not using ExtensionDtype. |
||
|
@@ -136,52 +162,6 @@ def is_nonempty(x) -> bool: | |
return np.concatenate(to_concat, axis=axis) | ||
|
||
|
||
def concat_categorical(to_concat, axis: int = 0): | ||
""" | ||
Concatenate an object/categorical array of arrays, each of which is a | ||
single dtype | ||
|
||
Parameters | ||
---------- | ||
to_concat : array of arrays | ||
axis : int | ||
Axis to provide concatenation in the current implementation this is | ||
always 0, e.g. we only have 1D categoricals | ||
|
||
Returns | ||
------- | ||
Categorical | ||
A single array, preserving the combined dtypes | ||
""" | ||
# we could have object blocks and categoricals here | ||
# if we only have a single categoricals then combine everything | ||
# else its a non-compat categorical | ||
categoricals = [x for x in to_concat if is_categorical_dtype(x.dtype)] | ||
|
||
# validate the categories | ||
if len(categoricals) != len(to_concat): | ||
pass | ||
else: | ||
# when all categories are identical | ||
first = to_concat[0] | ||
if all(first.is_dtype_equal(other) for other in to_concat[1:]): | ||
return union_categoricals(categoricals) | ||
|
||
# extract the categoricals & coerce to object if needed | ||
to_concat = [ | ||
x._internal_get_values() | ||
if is_categorical_dtype(x.dtype) | ||
else np.asarray(x).ravel() | ||
if not is_datetime64tz_dtype(x) | ||
else np.asarray(x.astype(object)) | ||
for x in to_concat | ||
] | ||
result = concat_compat(to_concat) | ||
if axis == 1: | ||
result = result.reshape(1, len(result)) | ||
return result | ||
|
||
|
||
def union_categoricals( | ||
to_union, sort_categories: bool = False, ignore_order: bool = False | ||
): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
import pytest | ||
|
||
import pandas as pd | ||
import pandas._testing as tm | ||
|
||
|
||
@pytest.mark.parametrize( | ||
"to_concat_dtypes, result_dtype", | ||
[ | ||
(["Int64", "Int64"], "Int64"), | ||
jorisvandenbossche marked this conversation as resolved.
Show resolved
Hide resolved
|
||
(["UInt64", "UInt64"], "UInt64"), | ||
(["Int8", "Int8"], "Int8"), | ||
(["Int8", "Int16"], "Int16"), | ||
(["UInt8", "Int8"], "Int16"), | ||
(["Int32", "UInt32"], "Int64"), | ||
# this still gives object (awaiting float extension dtype) | ||
(["Int64", "UInt64"], "object"), | ||
], | ||
) | ||
def test_concat_series(to_concat_dtypes, result_dtype): | ||
|
||
result = pd.concat([pd.Series([1, 2, pd.NA], dtype=t) for t in to_concat_dtypes]) | ||
expected = pd.concat([pd.Series([1, 2, pd.NA], dtype=object)] * 2).astype( | ||
result_dtype | ||
) | ||
tm.assert_series_equal(result, expected) |
Uh oh!
There was an error while loading. Please reload this page.