From d28c39c518f387383c1b166f15627c9b4025024f Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Fri, 3 Mar 2023 20:27:38 -0500 Subject: [PATCH 1/6] Series.combine_first to preserve dtype --- doc/source/whatsnew/v2.1.0.rst | 2 +- pandas/core/series.py | 10 +++++++++- pandas/tests/series/methods/test_combine_first.py | 8 ++++++++ 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 2ac20e97409e7..b78c9e889a54e 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -193,7 +193,7 @@ Groupby/resample/rolling Reshaping ^^^^^^^^^ -- +- Bug in :meth:`Series.combine_first` converting ``int64`` dtype to ``float64`` (:issue:`51764`) - Sparse diff --git a/pandas/core/series.py b/pandas/core/series.py index 03d7b25aca49a..b223ff33a4b27 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -93,11 +93,13 @@ from pandas.core.dtypes.cast import ( LossySetitemError, convert_dtypes, + find_common_type, maybe_box_native, maybe_cast_pointwise_result, ) from pandas.core.dtypes.common import ( is_dict_like, + is_dtype_equal, is_extension_array_dtype, is_integer, is_iterator, @@ -3272,7 +3274,13 @@ def combine_first(self, other) -> Series: if this.dtype.kind == "M" and other.dtype.kind != "M": other = to_datetime(other) - return this.where(notna(this), other) + combined = this.where(notna(this), other) + + if not is_dtype_equal(combined.dtype, self.dtype): + dtype = find_common_type([self.dtype, other.dtype]) + combined = combined.astype(dtype, copy=False) + + return combined def update(self, other: Series | Sequence | Mapping) -> None: """ diff --git a/pandas/tests/series/methods/test_combine_first.py b/pandas/tests/series/methods/test_combine_first.py index aac3d4986e8ee..fa35b37773072 100644 --- a/pandas/tests/series/methods/test_combine_first.py +++ b/pandas/tests/series/methods/test_combine_first.py @@ -112,3 +112,11 @@ def test_combine_first_timezone_series_with_empty_series(self): s2 = Series(index=time_index) result = s1.combine_first(s2) tm.assert_series_equal(result, s1) + + def test_combine_first_preserves_dtype(self): + # GH51764 + s1 = Series([4, 5]) + s2 = Series([6, 7, 8]) + result = s1.combine_first(s2) + expected = Series([4, 5, 8]) + tm.assert_series_equal(result, expected) From f60a3d6ad4eeeff2f57a4c627503e4334e257b03 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Sat, 4 Mar 2023 09:51:45 -0500 Subject: [PATCH 2/6] avoid loss of precision --- doc/source/whatsnew/v2.1.0.rst | 2 +- pandas/core/series.py | 24 ++++++++++--------- .../series/methods/test_combine_first.py | 6 ++--- 3 files changed, 17 insertions(+), 15 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index b78c9e889a54e..be669fc80e731 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -193,7 +193,7 @@ Groupby/resample/rolling Reshaping ^^^^^^^^^ -- Bug in :meth:`Series.combine_first` converting ``int64`` dtype to ``float64`` (:issue:`51764`) +- Bug in :meth:`Series.combine_first` converting ``int64`` dtype to ``float64`` and losing precision on very large integers (:issue:`51764`) - Sparse diff --git a/pandas/core/series.py b/pandas/core/series.py index b223ff33a4b27..399120c707397 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -93,13 +93,11 @@ from pandas.core.dtypes.cast import ( LossySetitemError, convert_dtypes, - find_common_type, maybe_box_native, maybe_cast_pointwise_result, ) from pandas.core.dtypes.common import ( is_dict_like, - is_dtype_equal, is_extension_array_dtype, is_integer, is_iterator, @@ -3268,18 +3266,22 @@ def combine_first(self, other) -> Series: falcon NaN dtype: float64 """ - new_index = self.index.union(other.index) - this = self.reindex(new_index, copy=False) - other = other.reindex(new_index, copy=False) - if this.dtype.kind == "M" and other.dtype.kind != "M": - other = to_datetime(other) + from pandas.core.reshape.concat import concat - combined = this.where(notna(this), other) + new_index = self.index.union(other.index) - if not is_dtype_equal(combined.dtype, self.dtype): - dtype = find_common_type([self.dtype, other.dtype]) - combined = combined.astype(dtype, copy=False) + this = self + null_mask = isna(this) + if null_mask.any(): + drop = this.index[null_mask].intersection(other.index[notna(other)]) + if len(drop): + this = this.drop(drop) + other = other.reindex(other.index.difference(this.index), copy=False) + if this.dtype.kind == "M" and other.dtype.kind != "M": + other = to_datetime(other) + combined = concat([this, other]).reindex(new_index, copy=False) + combined.name = self.name return combined def update(self, other: Series | Sequence | Mapping) -> None: diff --git a/pandas/tests/series/methods/test_combine_first.py b/pandas/tests/series/methods/test_combine_first.py index fa35b37773072..81d767426c72d 100644 --- a/pandas/tests/series/methods/test_combine_first.py +++ b/pandas/tests/series/methods/test_combine_first.py @@ -115,8 +115,8 @@ def test_combine_first_timezone_series_with_empty_series(self): def test_combine_first_preserves_dtype(self): # GH51764 - s1 = Series([4, 5]) - s2 = Series([6, 7, 8]) + s1 = Series([np.iinfo(np.int64).min, np.iinfo(np.int64).max]) + s2 = Series([1, 2, 3]) result = s1.combine_first(s2) - expected = Series([4, 5, 8]) + expected = Series([np.iinfo(np.int64).min, np.iinfo(np.int64).max, 3]) tm.assert_series_equal(result, expected) From e10b62b8bded77224ad1589ffe9b83255b8970dc Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Sat, 4 Mar 2023 10:06:33 -0500 Subject: [PATCH 3/6] whatsnew --- doc/source/whatsnew/v2.1.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index be669fc80e731..f2eb3b13dbf96 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -108,6 +108,7 @@ Performance improvements - Performance improvement in :meth:`DataFrame.where` when ``cond`` is backed by an extension dtype (:issue:`51574`) - Performance improvement in :meth:`read_orc` when reading a remote URI file path. (:issue:`51609`) - Performance improvement in :meth:`~arrays.ArrowExtensionArray.isna` when array has zero nulls or is all nulls (:issue:`51630`) +- Performance improvement in :meth:`Series.combine_first` (:issue:`51777`) .. --------------------------------------------------------------------------- .. _whatsnew_210.bug_fixes: From d471b61ffd35e530979d42cb93525a0666cc05cb Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Sat, 4 Mar 2023 10:38:35 -0500 Subject: [PATCH 4/6] fix --- pandas/core/series.py | 6 +++--- pandas/tests/extension/test_sparse.py | 7 ------- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 399120c707397..14d4148dce34f 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3280,9 +3280,9 @@ def combine_first(self, other) -> Series: other = other.reindex(other.index.difference(this.index), copy=False) if this.dtype.kind == "M" and other.dtype.kind != "M": other = to_datetime(other) - combined = concat([this, other]).reindex(new_index, copy=False) - combined.name = self.name - return combined + combined = concat([this, other]) + combined = combined.reindex(new_index, copy=False) + return combined.__finalize__(self, method="combine_first") def update(self, other: Series | Sequence | Mapping) -> None: """ diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 2aeb2af567ea0..836e644affbda 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -327,13 +327,6 @@ def test_where_series(self, data, na_value): self.assert_series_equal(result, expected) def test_combine_first(self, data, request): - if data.dtype.subtype == "int": - # Right now this is upcasted to float, just like combine_first - # for Series[int] - mark = pytest.mark.xfail( - reason="TODO(SparseArray.__setitem__) will preserve dtype." - ) - request.node.add_marker(mark) super().test_combine_first(data) def test_searchsorted(self, data_for_sorting, as_series): From b2e9a888fbfe38f80be705ed87eff07017d183f2 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Tue, 7 Mar 2023 19:19:16 -0500 Subject: [PATCH 5/6] simplify --- pandas/core/series.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 441eabf8bb703..f5cde055e792d 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3269,13 +3269,13 @@ def combine_first(self, other) -> Series: new_index = self.index.union(other.index) this = self - null_mask = isna(this) - if null_mask.any(): - drop = this.index[null_mask].intersection(other.index[notna(other)]) - if len(drop): - this = this.drop(drop) + # identify the index subset to keep for each series + keep_other = other.index.difference(this.index[notna(this)]) + keep_this = this.index.difference(keep_other) + + this = this.reindex(keep_this, copy=False) + other = other.reindex(keep_other, copy=False) - other = other.reindex(other.index.difference(this.index), copy=False) if this.dtype.kind == "M" and other.dtype.kind != "M": other = to_datetime(other) combined = concat([this, other]) From 3e60a3682d4e9b2c67bd8f8ae7bd3e9bf5c57607 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Thu, 9 Mar 2023 06:44:42 -0500 Subject: [PATCH 6/6] update test --- pandas/tests/series/methods/test_combine_first.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/series/methods/test_combine_first.py b/pandas/tests/series/methods/test_combine_first.py index 81d767426c72d..25a3dfe42e843 100644 --- a/pandas/tests/series/methods/test_combine_first.py +++ b/pandas/tests/series/methods/test_combine_first.py @@ -115,8 +115,8 @@ def test_combine_first_timezone_series_with_empty_series(self): def test_combine_first_preserves_dtype(self): # GH51764 - s1 = Series([np.iinfo(np.int64).min, np.iinfo(np.int64).max]) + s1 = Series([1666880195890293744, 1666880195890293837]) s2 = Series([1, 2, 3]) result = s1.combine_first(s2) - expected = Series([np.iinfo(np.int64).min, np.iinfo(np.int64).max, 3]) + expected = Series([1666880195890293744, 1666880195890293837, 3]) tm.assert_series_equal(result, expected)