From ba493c1213d86471a05119af0616ba00c35a99f2 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 28 Mar 2020 15:29:13 -0700 Subject: [PATCH 1/3] REF: move 1D concat logic out of SingleBlockManager into concat_compat --- pandas/core/dtypes/concat.py | 15 +++++++++++++-- pandas/core/internals/managers.py | 17 ++++++++--------- pandas/tests/extension/test_external_block.py | 13 +------------ 3 files changed, 22 insertions(+), 23 deletions(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 49034616b374a..51bde243a036a 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -114,10 +114,21 @@ def is_nonempty(x) -> bool: elif "sparse" in typs: return _concat_sparse(to_concat, axis=axis, typs=typs) - all_empty = all(not is_nonempty(x) for x in to_concat) - if any(is_extension_array_dtype(x) for x in to_concat) and axis == 1: + non_empties = [x for x in to_concat if is_nonempty(x)] + all_empty = not len(non_empties) + if non_empties and axis == 0: + to_concat = non_empties + + single_dtype = len({x.dtype for x in to_concat}) == 1 + any_ea = any(is_extension_array_dtype(x.dtype) for x in to_concat) + + if any_ea and axis == 1: to_concat = [np.atleast_2d(x.astype("object")) for x in to_concat] + elif any_ea and single_dtype and axis == 0: + cls = type(to_concat[0]) + return cls._concat_same_type(to_concat) + if all_empty: # we have all empties, but may need to coerce the result dtype to # object if we have non-numeric type operands (numpy would otherwise diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index dda932cafe73b..30a90db3ae52c 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1653,17 +1653,16 @@ def concat(self, to_concat, new_axis: Index) -> "SingleBlockManager": # check if all series are of the same block type: if len(non_empties) > 0: + # Note: doing this here instead of in concat_compat fixes a + # test in test for concating two category-dtype Series + # one of which is empty (and with no categories) blocks = [obj.blocks[0] for obj in non_empties] - if len({b.dtype for b in blocks}) == 1: - new_block = blocks[0].concat_same_type(blocks) - else: - values = [x.values for x in blocks] - values = concat_compat(values) - new_block = make_block(values, placement=slice(0, len(values), 1)) else: - values = [x._block.values for x in to_concat] - values = concat_compat(values) - new_block = make_block(values, placement=slice(0, len(values), 1)) + blocks = [obj.blocks[0] for obj in to_concat] + + values = concat_compat([x.values for x in blocks]) + + new_block = make_block(values, placement=slice(0, len(values), 1)) mgr = SingleBlockManager(new_block, new_axis) return mgr diff --git a/pandas/tests/extension/test_external_block.py b/pandas/tests/extension/test_external_block.py index 26606d7e799e8..1960e3d09245c 100644 --- a/pandas/tests/extension/test_external_block.py +++ b/pandas/tests/extension/test_external_block.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -from pandas.core.internals import BlockManager, SingleBlockManager +from pandas.core.internals import BlockManager from pandas.core.internals.blocks import ExtensionBlock @@ -33,17 +33,6 @@ def df(): return pd.DataFrame(block_manager) -def test_concat_series(): - # GH17728 - values = np.arange(3, dtype="int64") - block = CustomBlock(values, placement=slice(0, 3)) - mgr = SingleBlockManager(block, pd.RangeIndex(3)) - s = pd.Series(mgr, pd.RangeIndex(3), fastpath=True) - - res = pd.concat([s, s]) - assert isinstance(res._data.blocks[0], CustomBlock) - - def test_concat_dataframe(df): # GH17728 res = pd.concat([df, df]) From 0d802ef7c9b3256d112bbd3a463d86bde3ff7b2b Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 28 Mar 2020 15:48:21 -0700 Subject: [PATCH 2/3] move more logic into concat_compat --- pandas/core/dtypes/concat.py | 7 +++---- pandas/core/internals/managers.py | 11 +---------- 2 files changed, 4 insertions(+), 14 deletions(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 51bde243a036a..ecfaac2210807 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -97,6 +97,9 @@ def is_nonempty(x) -> bool: # Creating an empty array directly is tempting, but the winnings would be # marginal given that it would still require shape & dtype calculation and # np.concatenate which has them both implemented is compiled. + non_empties = [x for x in to_concat if is_nonempty(x)] + if non_empties and axis == 0: + to_concat = non_empties typs = get_dtype_kinds(to_concat) _contains_datetime = any(typ.startswith("datetime") for typ in typs) @@ -114,11 +117,7 @@ def is_nonempty(x) -> bool: elif "sparse" in typs: return _concat_sparse(to_concat, axis=axis, typs=typs) - non_empties = [x for x in to_concat if is_nonempty(x)] all_empty = not len(non_empties) - if non_empties and axis == 0: - to_concat = non_empties - single_dtype = len({x.dtype for x in to_concat}) == 1 any_ea = any(is_extension_array_dtype(x.dtype) for x in to_concat) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 30a90db3ae52c..9630abf61f692 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1649,17 +1649,8 @@ def concat(self, to_concat, new_axis: Index) -> "SingleBlockManager": ------- SingleBlockManager """ - non_empties = [x for x in to_concat if len(x) > 0] - - # check if all series are of the same block type: - if len(non_empties) > 0: - # Note: doing this here instead of in concat_compat fixes a - # test in test for concating two category-dtype Series - # one of which is empty (and with no categories) - blocks = [obj.blocks[0] for obj in non_empties] - else: - blocks = [obj.blocks[0] for obj in to_concat] + blocks = [obj.blocks[0] for obj in to_concat] values = concat_compat([x.values for x in blocks]) new_block = make_block(values, placement=slice(0, len(values), 1)) From 39a778884f0bf00a43a9c5dbacab4dc602f8b0f4 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 28 Mar 2020 17:36:54 -0700 Subject: [PATCH 3/3] test --- pandas/tests/dtypes/test_concat.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pandas/tests/dtypes/test_concat.py b/pandas/tests/dtypes/test_concat.py index 02daa185b1cdb..1fbbd3356ae13 100644 --- a/pandas/tests/dtypes/test_concat.py +++ b/pandas/tests/dtypes/test_concat.py @@ -2,7 +2,9 @@ import pandas.core.dtypes.concat as _concat +import pandas as pd from pandas import DatetimeIndex, Period, PeriodIndex, Series, TimedeltaIndex +import pandas._testing as tm @pytest.mark.parametrize( @@ -76,3 +78,13 @@ def test_get_dtype_kinds(index_or_series, to_concat, expected): def test_get_dtype_kinds_period(to_concat, expected): result = _concat.get_dtype_kinds(to_concat) assert result == set(expected) + + +def test_concat_mismatched_categoricals_with_empty(): + # concat_compat behavior on series._values should match pd.concat on series + ser1 = Series(["a", "b", "c"], dtype="category") + ser2 = Series([], dtype="category") + + result = _concat.concat_compat([ser1._values, ser2._values]) + expected = pd.concat([ser1, ser2])._values + tm.assert_categorical_equal(result, expected)