From 37558780071785e1f6d2dd2a4a0ae90c3f88e9e9 Mon Sep 17 00:00:00 2001 From: phofl Date: Thu, 22 Jul 2021 22:43:00 +0200 Subject: [PATCH 1/2] Bug in concat creating invalid MultiIndex --- doc/source/whatsnew/v1.4.0.rst | 2 +- pandas/core/reshape/concat.py | 4 ++-- pandas/tests/reshape/concat/test_dataframe.py | 12 ++++++++++++ 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index ae1844b0a913c..3fc29441c2c3b 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -261,7 +261,7 @@ Groupby/resample/rolling Reshaping ^^^^^^^^^ -- +- Bug in :func:`concat` creating :class:`MultiIndex` with duplicate level entries when concatenating a :class:`DataFrame` with duplicates in :class:`Index` and multiple keys (:issue:`42651`) - Sparse diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index d908638c4706b..b4d455ba3e1af 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -717,8 +717,8 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiInde new_levels.extend(new_index.levels) new_codes.extend([np.tile(lab, kpieces) for lab in new_index.codes]) else: - new_levels.append(new_index) - new_codes.append(np.tile(np.arange(n), kpieces)) + new_levels.append(new_index.unique()) + new_codes.append(np.tile(new_index.unique().get_indexer(new_index), kpieces)) if len(new_names) < len(new_levels): new_names.extend(new_index.names) diff --git a/pandas/tests/reshape/concat/test_dataframe.py b/pandas/tests/reshape/concat/test_dataframe.py index 91c246fc9ee2d..460546f4b478a 100644 --- a/pandas/tests/reshape/concat/test_dataframe.py +++ b/pandas/tests/reshape/concat/test_dataframe.py @@ -180,3 +180,15 @@ def test_concat_bool_with_int(self): result = concat([df1, df2]) expected = concat([df1.astype("int64"), df2]) tm.assert_frame_equal(result, expected) + + def test_concat_duplicates_in_index_with_keys(self): + # 
GH#42651 + index = [1, 1, 3] + data = [1, 2, 3] + + df = DataFrame(data=data, index=index) + result = concat([df], keys=["A"], names=["ID", "date"]) + mi = pd.MultiIndex.from_product([["A"], index], names=["ID", "date"]) + expected = DataFrame(data=data, index=mi) + tm.assert_frame_equal(result, expected) + tm.assert_index_equal(result.index.levels[1], Index([1, 3], name="date")) From a75ba6f48a632c6d70760200b4fedfdd0e709c81 Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 25 Jul 2021 01:42:01 +0200 Subject: [PATCH 2/2] Modify code --- pandas/core/reshape/concat.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index b4d455ba3e1af..560735b593cd1 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -718,7 +718,8 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiInde new_codes.extend([np.tile(lab, kpieces) for lab in new_index.codes]) else: new_levels.append(new_index.unique()) - new_codes.append(np.tile(new_index.unique().get_indexer(new_index), kpieces)) + single_codes = new_index.unique().get_indexer(new_index) + new_codes.append(np.tile(single_codes, kpieces)) if len(new_names) < len(new_levels): new_names.extend(new_index.names)