From e130060b381ee836f47b06d21bcc6101a86814fc Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Sat, 23 May 2020 09:00:32 +0200
Subject: [PATCH 1/8] BUG: Fix concat of frames with extension types (no reindexed columns)

---
 pandas/core/arrays/integer.py       |  8 ++++++--
 pandas/core/dtypes/concat.py        |  2 +-
 pandas/core/internals/concat.py     |  9 +++++++++
 pandas/tests/reshape/test_concat.py | 14 ++++++++++++++
 4 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py
index 59954f548fd33..4eec3bda11b39 100644
--- a/pandas/core/arrays/integer.py
+++ b/pandas/core/arrays/integer.py
@@ -94,10 +94,14 @@ def construct_array_type(cls) -> Type["IntegerArray"]:
 
     def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]:
         # for now only handle other integer types
-        if not all(isinstance(t, _IntegerDtype) for t in dtypes):
+        if not all(
+            isinstance(t, _IntegerDtype)
+            or (isinstance(t, np.dtype) and np.issubdtype(t, np.integer))
+            for t in dtypes
+        ):
             return None
         np_dtype = np.find_common_type(
-            [t.numpy_dtype for t in dtypes], []  # type: ignore
+            [t.numpy_dtype if isinstance(t, BaseMaskedDtype) else t for t in dtypes], []  # type: ignore
         )
         if np.issubdtype(np_dtype, np.integer):
             return _dtypes[str(np_dtype)]
diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py
index ca3a41813f3d3..eb4ae832a07de 100644
--- a/pandas/core/dtypes/concat.py
+++ b/pandas/core/dtypes/concat.py
@@ -138,7 +138,7 @@ def is_nonempty(x) -> bool:
     single_dtype = len({x.dtype for x in to_concat}) == 1
     any_ea = any(is_extension_array_dtype(x.dtype) for x in to_concat)
 
-    if any_ea and axis == 0:
+    if any_ea:
         if not single_dtype:
             target_dtype = find_common_type([x.dtype for x in to_concat])
             to_concat = [_cast_to_common_type(arr, target_dtype) for arr in to_concat]
diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py
index e25c4c2341217..4138251b39344 100644
--- a/pandas/core/internals/concat.py
+++ b/pandas/core/internals/concat.py
@@ -319,6 +319,15 @@ def _concatenate_join_units(join_units, concat_axis, copy):
                     concat_values = concat_values.copy()
             else:
                 concat_values = concat_values.copy()
+    elif any(isinstance(t, ExtensionArray) for t in to_concat):
+        # concatting with at least one EA means we are concatting a single column
+        # the non-EA values are 2D arrays with shape (1, n)
+        to_concat = [t if isinstance(t, ExtensionArray) else t[0, :] for t in to_concat]
+        concat_values = concat_compat(to_concat, axis=concat_axis)
+        if not isinstance(concat_values, ExtensionArray):
+            # if the result of concat is not an EA but an ndarray, reshape to
+            # 2D to put it a non-EA Block
+            concat_values = np.atleast_2d(concat_values)
     else:
         concat_values = concat_compat(to_concat, axis=concat_axis)
 
diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py
index 8d6d93df6f776..9ba85cb48b476 100644
--- a/pandas/tests/reshape/test_concat.py
+++ b/pandas/tests/reshape/test_concat.py
@@ -2832,3 +2832,17 @@ def test_concat_preserves_subclass(obj):
 
     result = concat([obj, obj])
     assert isinstance(result, type(obj))
+
+
+def test_concat_frame_axis0_extension_dtypes():
+    # preserve extension dtype (through common_dtype mechanism)
+    df1 = pd.DataFrame({"a": pd.array([1, 2, 3], dtype="Int64")})
+    df2 = pd.DataFrame({"a": np.array([4, 5, 6])})
+
+    result = pd.concat([df1, df2], ignore_index=True)
+    expected = pd.DataFrame({"a": [1, 2, 3, 4, 5, 6]}, dtype="Int64")
+    tm.assert_frame_equal(result, expected)
+
+    result = pd.concat([df2, df1], ignore_index=True)
+    expected = pd.DataFrame({"a": [4, 5, 6, 1, 2, 3]}, dtype="Int64")
+    tm.assert_frame_equal(result, expected)
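A quick, illustrative sketch of the user-facing behavior this first patch targets (it mirrors the new test above and uses only public pandas API):

    import numpy as np
    import pandas as pd

    df1 = pd.DataFrame({"a": pd.array([1, 2, 3], dtype="Int64")})  # nullable extension dtype
    df2 = pd.DataFrame({"a": np.array([4, 5, 6])})                 # plain numpy int64 column

    result = pd.concat([df1, df2], ignore_index=True)
    print(result["a"].dtype)  # Int64 with this fix; object dtype before it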
From f83338a2b2204a47cebe7972c37e5ad63069adec Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Sat, 23 May 2020 09:25:20 +0200
Subject: [PATCH 2/8] clean-up

---
 pandas/core/dtypes/concat.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py
index eb4ae832a07de..6274905afaadc 100644
--- a/pandas/core/dtypes/concat.py
+++ b/pandas/core/dtypes/concat.py
@@ -152,10 +152,6 @@ def is_nonempty(x) -> bool:
     elif _contains_datetime or "timedelta" in typs:
         return concat_datetime(to_concat, axis=axis, typs=typs)
 
-    elif any_ea and axis == 1:
-        to_concat = [np.atleast_2d(x.astype("object")) for x in to_concat]
-        return np.concatenate(to_concat, axis=axis)
-
     elif all_empty:
         # we have all empties, but may need to coerce the result dtype to
        # object if we have non-numeric type operands (numpy would otherwise

From 58226948ec6550dc021c1b06a81f8157e7c8d696 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Sat, 23 May 2020 13:48:21 +0200
Subject: [PATCH 3/8] fix concat of DatetimeBlock / DatetimeTZBlock to not take 'uniform' path

---
 pandas/core/internals/concat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py
index 4138251b39344..fd8c5f5e27c02 100644
--- a/pandas/core/internals/concat.py
+++ b/pandas/core/internals/concat.py
@@ -452,7 +452,7 @@ def _is_uniform_join_units(join_units: List[JoinUnit]) -> bool:
     # cannot necessarily join
     return (
         # all blocks need to have the same type
-        all(isinstance(ju.block, type(join_units[0].block)) for ju in join_units)
+        all(type(ju.block) is type(join_units[0].block) for ju in join_units)
         and  # noqa
         # no blocks that would get missing values (can lead to type upcasts)
         # unless we're an extension dtype.
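The one-line change in patch 3 is about subclass semantics: isinstance() also accepts subclasses, so a DatetimeTZBlock (which subclasses DatetimeBlock in this era of the internals) could wrongly take the "uniform join units" fast path. A minimal sketch with stand-in classes (not the real pandas block types):

    class Block: ...
    class DatetimeBlock(Block): ...
    class DatetimeTZBlock(DatetimeBlock): ...

    blocks = [DatetimeBlock(), DatetimeTZBlock()]
    print(all(isinstance(b, type(blocks[0])) for b in blocks))  # True  -- too permissive
    print(all(type(b) is type(blocks[0]) for b in blocks))      # False -- the stricter check used above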
From 3d32bc813d498349f6c7490281a86d175f36f33e Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Sat, 23 May 2020 13:48:50 +0200
Subject: [PATCH 4/8] fix base extension tests

---
 pandas/tests/extension/base/reshaping.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py
index c9445ceec2c77..cd932e842e00c 100644
--- a/pandas/tests/extension/base/reshaping.py
+++ b/pandas/tests/extension/base/reshaping.py
@@ -62,11 +62,11 @@ def test_concat_mixed_dtypes(self, data):
         self.assert_series_equal(result, expected)
 
         # simple test for just EA and one other
-        result = pd.concat([df1, df2])
+        result = pd.concat([df1, df2.astype(object)])
         expected = pd.concat([df1.astype("object"), df2.astype("object")])
         self.assert_frame_equal(result, expected)
 
-        result = pd.concat([df1["A"], df2["A"]])
+        result = pd.concat([df1["A"], df2["A"].astype(object)])
         expected = pd.concat([df1["A"].astype("object"), df2["A"].astype("object")])
         self.assert_series_equal(result, expected)
 

From d269229486bd6748b5c4e2f6e8dd67de39ee3e4d Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Sat, 6 Jun 2020 09:55:46 +0200
Subject: [PATCH 5/8] fix test

---
 pandas/core/arrays/integer.py          |  2 +-
 pandas/tests/indexing/test_indexing.py | 16 ++++++++++++++--
 2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py
index 3dc1849fc586a..df43b5d6115ba 100644
--- a/pandas/core/arrays/integer.py
+++ b/pandas/core/arrays/integer.py
@@ -100,7 +100,7 @@ def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]:
         ):
             return None
         np_dtype = np.find_common_type(
-            [t.numpy_dtype if isinstance(t, BaseMaskedDtype) else t for t in dtypes], []  # type: ignore
+            [t.numpy_dtype if isinstance(t, BaseMaskedDtype) else t for t in dtypes], []
         )
         if np.issubdtype(np_dtype, np.integer):
             return _dtypes[str(np_dtype)]
diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py
index 51a7aa9bb586b..66b61bda74ef0 100644
--- a/pandas/tests/indexing/test_indexing.py
+++ b/pandas/tests/indexing/test_indexing.py
@@ -1006,12 +1006,24 @@ def test_extension_array_cross_section():
 
 def test_extension_array_cross_section_converts():
+    # all numeric columns -> numeric series
     df = pd.DataFrame(
-        {"A": pd.core.arrays.integer_array([1, 2]), "B": np.array([1, 2])},
+        {"A": pd.array([1, 2], dtype="Int64"), "B": np.array([1, 2])},
         index=["a", "b"],
+    )
+    result = df.loc["a"]
+    expected = pd.Series([1, 1], dtype="Int64", index=["A", "B"], name="a")
+    tm.assert_series_equal(result, expected)
+
+    result = df.iloc[0]
+    tm.assert_series_equal(result, expected)
+
+    # mixed columns -> object series
+    df = pd.DataFrame(
+        {"A": pd.array([1, 2], dtype="Int64"), "B": np.array(["a", "b"])},
         index=["a", "b"],
     )
     result = df.loc["a"]
-    expected = pd.Series([1, 1], dtype=object, index=["A", "B"], name="a")
+    expected = pd.Series([1, "a"], dtype=object, index=["A", "B"], name="a")
     tm.assert_series_equal(result, expected)
 
     result = df.iloc[0]
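Patch 5 reworks the cross-section test because row selection now goes through the same common-dtype machinery; roughly, under this branch (illustrative session, not part of the patch):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(
        {"A": pd.array([1, 2], dtype="Int64"), "B": np.array([1, 2])},
        index=["a", "b"],
    )
    print(df.loc["a"].dtype)  # Int64: an all-integer row keeps the nullable dtype

    df["B"] = np.array(["a", "b"])
    print(df.loc["a"].dtype)  # object: mixing Int64 with strings still falls back to object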
From 63a6570462c1826763e6dc13ab6f21e30c426d2e Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Sat, 6 Jun 2020 10:56:11 +0200
Subject: [PATCH 6/8] add whatsnew

---
 doc/source/whatsnew/v1.1.0.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index 2243790a663df..954ccd4752697 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -291,6 +291,7 @@ Other enhancements
 - :meth:`groupby.transform` now allows ``func`` to be ``pad``, ``backfill`` and ``cumcount`` (:issue:`31269`).
 - :meth:`~pandas.io.json.read_json` now accepts `nrows` parameter. (:issue:`33916`).
 - :meth `~pandas.io.gbq.read_gbq` now allows to disable progress bar (:issue:`33360`).
+- :func:`concat` and `~DataFrame.append` now preserve extension dtypes (:issue:`33607`, :issue:`34339`).
 
 .. ---------------------------------------------------------------------------
 

From d8faef53dbd6320d75d5db217d1481862a25119b Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Mon, 15 Jun 2020 09:02:45 +0200
Subject: [PATCH 7/8] update whatsnew

---
 doc/source/whatsnew/v1.1.0.rst | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index b486c6b242dcf..108b8c01e8e36 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -293,7 +293,9 @@ Other enhancements
 - :meth:`groupby.transform` now allows ``func`` to be ``pad``, ``backfill`` and ``cumcount`` (:issue:`31269`).
 - :meth:`~pandas.io.json.read_json` now accepts `nrows` parameter. (:issue:`33916`).
 - :meth `~pandas.io.gbq.read_gbq` now allows to disable progress bar (:issue:`33360`).
-- :func:`concat` and `~DataFrame.append` now preserve extension dtypes (:issue:`33607`, :issue:`34339`).
+- :func:`concat` and `~DataFrame.append` now preserve extension dtypes, for example
+  combining a nullable integer column with a numpy integer column will no longer
+  result in object dtype but preserve the integer dtype (:issue:`33607`, :issue:`34339`).
 - :meth:`~pandas.io.gbq.read_gbq` now supports the ``max_results`` kwarg from ``pandas-gbq`` (:issue:`34639`).
 
 .. ---------------------------------------------------------------------------

From 2e1f1a6bb7c0d265843dc8034fa56b7838c09f52 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Thu, 18 Jun 2020 08:51:53 +0200
Subject: [PATCH 8/8] Update doc/source/whatsnew/v1.1.0.rst

Co-authored-by: Tom Augspurger
---
 doc/source/whatsnew/v1.1.0.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index 108b8c01e8e36..d8d6ef6fb33e8 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -293,7 +293,7 @@ Other enhancements
 - :meth:`groupby.transform` now allows ``func`` to be ``pad``, ``backfill`` and ``cumcount`` (:issue:`31269`).
 - :meth:`~pandas.io.json.read_json` now accepts `nrows` parameter. (:issue:`33916`).
 - :meth `~pandas.io.gbq.read_gbq` now allows to disable progress bar (:issue:`33360`).
-- :func:`concat` and `~DataFrame.append` now preserve extension dtypes, for example
+- :func:`concat` and :meth:`~DataFrame.append` now preserve extension dtypes, for example
   combining a nullable integer column with a numpy integer column will no longer
   result in object dtype but preserve the integer dtype (:issue:`33607`, :issue:`34339`).
 - :meth:`~pandas.io.gbq.read_gbq` now supports the ``max_results`` kwarg from ``pandas-gbq`` (:issue:`34639`).
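Since DataFrame.append is implemented on top of concat, the whatsnew wording covers it as well; a sketch of the documented behavior (df1/df2 are illustrative, and append is assumed to still be available in the 1.1 line):

    import numpy as np
    import pandas as pd

    df1 = pd.DataFrame({"a": pd.array([1, 2], dtype="Int64")})
    df2 = pd.DataFrame({"a": np.array([3, 4], dtype="int64")})

    print(df1.append(df2, ignore_index=True)["a"].dtype)  # Int64, no longer object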