From 5636ac0b4edcfc2acb0388bd810864b19b57d3a2 Mon Sep 17 00:00:00 2001 From: tp Date: Sun, 23 May 2021 11:09:02 +0100 Subject: [PATCH 1/4] REF: Simplify Index.union --- pandas/core/dtypes/cast.py | 11 ++++++++++- pandas/core/indexes/base.py | 21 +++++++-------------- pandas/tests/indexes/test_setops.py | 26 ++++++++++---------------- 3 files changed, 27 insertions(+), 31 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 5c7211a5d1852..eb0ebc7af3e0a 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -68,6 +68,7 @@ is_numeric_dtype, is_object_dtype, is_scalar, + is_signed_integer_dtype, is_string_dtype, is_timedelta64_dtype, is_unsigned_integer_dtype, @@ -1779,13 +1780,14 @@ def ensure_nanosecond_dtype(dtype: DtypeObj) -> DtypeObj: return dtype -def find_common_type(types: list[DtypeObj]) -> DtypeObj: +def find_common_type(types: list[DtypeObj], *, strict_uint64: bool = False) -> DtypeObj: """ Find a common data type among the given dtypes. Parameters ---------- types : list of dtypes + strict_uint64 : if True, object dtype is returned if uint64 and signed int present. Returns ------- @@ -1831,6 +1833,13 @@ def find_common_type(types: list[DtypeObj]) -> DtypeObj: if is_integer_dtype(t) or is_float_dtype(t) or is_complex_dtype(t): return np.dtype("object") + # Index.union is special: uint64 & signed int -> object + if strict_uint64: + has_uint64 = any(t == "uint64" for t in types) + has_signed_int = any(is_signed_integer_dtype(t) for t in types) + if has_uint64 and has_signed_int: + return np.dtype("object") + # error: Argument 1 to "find_common_type" has incompatible type # "List[Union[dtype, ExtensionDtype]]"; expected "Sequence[Union[dtype, # None, type, _SupportsDtype, str, Tuple[Any, int], Tuple[Any, Union[int, diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 14ec3d6009b61..34e104c537bb1 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -77,7 +77,6 @@ is_float_dtype, is_hashable, is_integer, - is_integer_dtype, is_interval_dtype, is_iterator, is_list_like, @@ -2963,19 +2962,13 @@ def union(self, other, sort=None): stacklevel=2, ) - dtype = find_common_type([self.dtype, other.dtype]) - if self._is_numeric_dtype and other._is_numeric_dtype: - # Right now, we treat union(int, float) a bit special. - # See https://github.com/pandas-dev/pandas/issues/26778 for discussion - # We may change union(int, float) to go to object. - # float | [u]int -> float (the special case) - # | -> T - # | -> object - if not (is_integer_dtype(self.dtype) and is_integer_dtype(other.dtype)): - dtype = np.dtype("float64") - else: - # one is int64 other is uint64 - dtype = np.dtype("object") + dtype = find_common_type([self.dtype, other.dtype], strict_uint64=True) + # Right now, we treat union(float, [u]int) a bit special. + # See https://github.com/pandas-dev/pandas/issues/26778 for discussion + # Now it's: + # * float | [u]int -> float + # * uint64 | signed int -> object + # We may change union(float [u]int) to go to object. left = self.astype(dtype, copy=False) right = other.astype(dtype, copy=False) diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index 62c07f4306a96..087ccbef7b778 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -8,7 +8,7 @@ import numpy as np import pytest -from pandas.core.dtypes.common import is_dtype_equal +from pandas.core.dtypes.cast import find_common_type from pandas import ( CategoricalIndex, @@ -25,6 +25,7 @@ import pandas._testing as tm from pandas.api.types import ( is_datetime64tz_dtype, + is_signed_integer_dtype, pandas_dtype, ) @@ -48,7 +49,11 @@ def test_union_different_types(index_flat, index_flat2): idx1 = index_flat idx2 = index_flat2 - type_pair = tuple(sorted([idx1.dtype.type, idx2.dtype.type], key=lambda x: str(x))) + common_dtype = find_common_type([idx1.dtype, idx2.dtype]) + + any_uint64 = idx1.dtype == np.uint64 or idx2.dtype == np.uint64 + idx1_signed = is_signed_integer_dtype(idx1.dtype) + idx2_signed = is_signed_integer_dtype(idx2.dtype) # Union with a non-unique, non-monotonic index raises error # This applies to the boolean index @@ -58,23 +63,12 @@ def test_union_different_types(index_flat, index_flat2): res1 = idx1.union(idx2) res2 = idx2.union(idx1) - if is_dtype_equal(idx1.dtype, idx2.dtype): - assert res1.dtype == idx1.dtype - assert res2.dtype == idx1.dtype - - elif type_pair not in COMPATIBLE_INCONSISTENT_PAIRS: - # A union with a CategoricalIndex (even as dtype('O')) and a - # non-CategoricalIndex can only be made if both indices are monotonic. - # This is true before this PR as well. + if any_uint64 and (idx1_signed or idx2_signed): assert res1.dtype == np.dtype("O") assert res2.dtype == np.dtype("O") - - elif idx1.dtype.kind in ["f", "i", "u"] and idx2.dtype.kind in ["f", "i", "u"]: - assert res1.dtype == np.dtype("f8") - assert res2.dtype == np.dtype("f8") - else: - raise NotImplementedError + assert res1.dtype == common_dtype + assert res2.dtype == common_dtype @pytest.mark.parametrize( From 75823dd6b7db4ab3dad539dc52106ba0089144a3 Mon Sep 17 00:00:00 2001 From: tp Date: Wed, 2 Jun 2021 01:24:07 +0100 Subject: [PATCH 2/4] small fix --- pandas/core/indexes/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 34e104c537bb1..125613c4714f2 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2968,7 +2968,7 @@ def union(self, other, sort=None): # Now it's: # * float | [u]int -> float # * uint64 | signed int -> object - # We may change union(float [u]int) to go to object. + # We may change union(float | [u]int) to go to object. left = self.astype(dtype, copy=False) right = other.astype(dtype, copy=False) From b1e876a3cde3e6be7f3580c8293825f5e78d6539 Mon Sep 17 00:00:00 2001 From: tp Date: Wed, 2 Jun 2021 09:36:49 +0100 Subject: [PATCH 3/4] Move uint64 check to _find_common_type_compat --- pandas/core/dtypes/cast.py | 11 +---------- pandas/core/indexes/base.py | 22 ++++++++++++++-------- 2 files changed, 15 insertions(+), 18 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index eb0ebc7af3e0a..5c7211a5d1852 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -68,7 +68,6 @@ is_numeric_dtype, is_object_dtype, is_scalar, - is_signed_integer_dtype, is_string_dtype, is_timedelta64_dtype, is_unsigned_integer_dtype, @@ -1780,14 +1779,13 @@ def ensure_nanosecond_dtype(dtype: DtypeObj) -> DtypeObj: return dtype -def find_common_type(types: list[DtypeObj], *, strict_uint64: bool = False) -> DtypeObj: +def find_common_type(types: list[DtypeObj]) -> DtypeObj: """ Find a common data type among the given dtypes. Parameters ---------- types : list of dtypes - strict_uint64 : if True, object dtype is returned if uint64 and signed int present. Returns ------- @@ -1833,13 +1831,6 @@ def find_common_type(types: list[DtypeObj], *, strict_uint64: bool = False) -> D if is_integer_dtype(t) or is_float_dtype(t) or is_complex_dtype(t): return np.dtype("object") - # Index.union is special: uint64 & signed int -> object - if strict_uint64: - has_uint64 = any(t == "uint64" for t in types) - has_signed_int = any(is_signed_integer_dtype(t) for t in types) - if has_uint64 and has_signed_int: - return np.dtype("object") - # error: Argument 1 to "find_common_type" has incompatible type # "List[Union[dtype, ExtensionDtype]]"; expected "Sequence[Union[dtype, # None, type, _SupportsDtype, str, Tuple[Any, int], Tuple[Any, Union[int, diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 125613c4714f2..ad42338b22611 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2962,14 +2962,7 @@ def union(self, other, sort=None): stacklevel=2, ) - dtype = find_common_type([self.dtype, other.dtype], strict_uint64=True) - # Right now, we treat union(float, [u]int) a bit special. - # See https://github.com/pandas-dev/pandas/issues/26778 for discussion - # Now it's: - # * float | [u]int -> float - # * uint64 | signed int -> object - # We may change union(float | [u]int) to go to object. - + dtype = self._find_common_type_compat(other) left = self.astype(dtype, copy=False) right = other.astype(dtype, copy=False) return left.union(right, sort=sort) @@ -5403,6 +5396,19 @@ def _find_common_type_compat(self, target) -> DtypeObj: return IntervalDtype(np.float64, closed=self.closed) target_dtype, _ = infer_dtype_from(target, pandas_dtype=True) + + # special case: if either is uint64 and other dtype is signed int, return object + # See https://github.com/pandas-dev/pandas/issues/26778 for discussion + # Now it's: + # * float | [u]int -> float + # * uint64 | signed int -> object + # We may change union(float | [u]int) to go to object. + if self.dtype == "uint64" or target_dtype == "uint64": + if is_signed_integer_dtype(self.dtype) or is_signed_integer_dtype( + target_dtype + ): + return np.dtype("object") + dtype = find_common_type([self.dtype, target_dtype]) if dtype.kind in ["i", "u"]: # TODO: what about reversed with self being categorical? From 740bb751505c32f889ab5c31ebd6a1b9e16033d1 Mon Sep 17 00:00:00 2001 From: tp Date: Wed, 2 Jun 2021 12:41:15 +0100 Subject: [PATCH 4/4] small fix --- pandas/core/indexes/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index ad42338b22611..124903446220d 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -5397,7 +5397,7 @@ def _find_common_type_compat(self, target) -> DtypeObj: target_dtype, _ = infer_dtype_from(target, pandas_dtype=True) - # special case: if either is uint64 and other dtype is signed int, return object + # special case: if one dtype is uint64 and the other a signed int, return object # See https://github.com/pandas-dev/pandas/issues/26778 for discussion # Now it's: # * float | [u]int -> float