From 361c064c46ffdee486e5affb5937a9a1ece4e982 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Wed, 15 Sep 2021 21:38:36 -0400 Subject: [PATCH 1/5] BUG: concat fails if keys aren't unique --- pandas/core/indexes/base.py | 3 ++- pandas/core/reshape/concat.py | 2 +- pandas/tests/indexes/test_indexing.py | 2 ++ 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 6887b919cc7d6..c26b6448d0122 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3533,13 +3533,14 @@ def get_indexer( method: str_t | None = None, limit: int | None = None, tolerance=None, + enforce_unique=True, ) -> npt.NDArray[np.intp]: method = missing.clean_reindex_fill_method(method) target = self._maybe_cast_listlike_indexer(target) self._check_indexing_method(method, limit, tolerance) - if not self._index_as_unique: + if enforce_unique and not self._index_as_unique: raise InvalidIndexError(self._requires_unique_msg) if len(target) == 0: diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index fa6263b70c101..65cd63cec6fca 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -755,7 +755,7 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiInde for hlevel, level in zip(zipped, levels): hlevel = ensure_index(hlevel) - mapped = level.get_indexer(hlevel) + mapped = level.get_indexer(hlevel, enforce_unique=False) mask = mapped == -1 if mask.any(): diff --git a/pandas/tests/indexes/test_indexing.py b/pandas/tests/indexes/test_indexing.py index ff2cd76ab6377..c2d4e6a266a31 100644 --- a/pandas/tests/indexes/test_indexing.py +++ b/pandas/tests/indexes/test_indexing.py @@ -210,6 +210,8 @@ def test_get_loc_generator(self, index): class TestGetIndexer: def test_get_indexer_base(self, index): + print(index) + if index._index_as_unique: expected = np.arange(index.size, dtype=np.intp) actual = index.get_indexer(index) From 
47abc872410d18b8a7fbd5002e3e4f9d13fad50c Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Wed, 15 Sep 2021 22:43:53 -0400 Subject: [PATCH 2/5] BUG: concat raises when all indexes are the same and keys are duplicated --- doc/source/whatsnew/v1.4.0.rst | 1 + pandas/core/indexes/base.py | 3 +-- pandas/core/reshape/concat.py | 4 ++-- pandas/tests/indexes/test_indexing.py | 2 -- pandas/tests/reshape/concat/test_concat.py | 13 +++++++++++++ 5 files changed, 17 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index eb1bc270a7d1b..ff143e59ec597 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -486,6 +486,7 @@ Reshaping - Bug in :meth:`DataFrame.append` failing to retain dtypes when appended columns do not match (:issue:`43392`) - Bug in :func:`concat` of ``bool`` and ``boolean`` dtypes resulting in ``object`` dtype instead of ``boolean`` dtype (:issue:`42800`) - Bug in :func:`crosstab` when inputs are are categorical Series, there are categories that are not present in one or both of the Series, and ``margins=True``. Previously the margin value for missing categories was ``NaN``. 
It is now correctly reported as 0 (:issue:`43505`) +- Bug in :func:`concat` would fail when the ``objs`` argument all had the same index and the ``keys`` argument contained duplicates (:issue: Sparse ^^^^^^ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index c26b6448d0122..6887b919cc7d6 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3533,14 +3533,13 @@ def get_indexer( method: str_t | None = None, limit: int | None = None, tolerance=None, - enforce_unique=True, ) -> npt.NDArray[np.intp]: method = missing.clean_reindex_fill_method(method) target = self._maybe_cast_listlike_indexer(target) self._check_indexing_method(method, limit, tolerance) - if enforce_unique and not self._index_as_unique: + if not self._index_as_unique: raise InvalidIndexError(self._requires_unique_msg) if len(target) == 0: diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 65cd63cec6fca..d965a9ca5d685 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -695,7 +695,7 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiInde else: levels = [ensure_index(x) for x in levels] - if not all_indexes_same(indexes): + if not all_indexes_same(indexes) or not all(level.is_unique for level in levels): codes_list = [] # things are potentially different sizes, so compute the exact codes @@ -755,7 +755,7 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiInde for hlevel, level in zip(zipped, levels): hlevel = ensure_index(hlevel) - mapped = level.get_indexer(hlevel, enforce_unique=False) + mapped = level.get_indexer(hlevel) mask = mapped == -1 if mask.any(): diff --git a/pandas/tests/indexes/test_indexing.py b/pandas/tests/indexes/test_indexing.py index c2d4e6a266a31..ff2cd76ab6377 100644 --- a/pandas/tests/indexes/test_indexing.py +++ b/pandas/tests/indexes/test_indexing.py @@ -210,8 +210,6 @@ def test_get_loc_generator(self, index): class 
TestGetIndexer: def test_get_indexer_base(self, index): - print(index) - if index._index_as_unique: expected = np.arange(index.size, dtype=np.intp) actual = index.get_indexer(index) diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index 7c78ff5a71de3..871f4c5ce2106 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -560,6 +560,19 @@ def test_duplicate_keys(keys): tm.assert_frame_equal(result, expected) +def test_duplicate_keys_same_frame(): + # GH 33654 + keys = ["e", "e"] + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + result = concat([df, df], axis=1, keys=keys) + expected_values = [[1, 4, 1, 4], [2, 5, 2, 5], [3, 6, 3, 6]] + expected_columns = MultiIndex.from_tuples( + [(keys[0], "a"), (keys[0], "b"), (keys[1], "a"), (keys[1], "b")] + ) + expected = DataFrame(expected_values, columns=expected_columns) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( "obj", [ From e66bdb7e262adb9015f58f9442872e288d8a45b9 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Wed, 15 Sep 2021 22:45:18 -0400 Subject: [PATCH 3/5] issue # --- pandas/tests/reshape/concat/test_concat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index 871f4c5ce2106..cbec23769f87f 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -561,7 +561,7 @@ def test_duplicate_keys(keys): def test_duplicate_keys_same_frame(): - # GH 33654 + # GH 43595 keys = ["e", "e"] df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) result = concat([df, df], axis=1, keys=keys) From 02d2a4136dc595a2ca88ee724272c04ccfcf8274 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Wed, 15 Sep 2021 22:51:40 -0400 Subject: [PATCH 4/5] doc fixup --- doc/source/whatsnew/v1.4.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index ff143e59ec597..93df734589588 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -486,7 +486,7 @@ Reshaping - Bug in :meth:`DataFrame.append` failing to retain dtypes when appended columns do not match (:issue:`43392`) - Bug in :func:`concat` of ``bool`` and ``boolean`` dtypes resulting in ``object`` dtype instead of ``boolean`` dtype (:issue:`42800`) - Bug in :func:`crosstab` when inputs are are categorical Series, there are categories that are not present in one or both of the Series, and ``margins=True``. Previously the margin value for missing categories was ``NaN``. It is now correctly reported as 0 (:issue:`43505`) -- Bug in :func:`concat` would fail when the ``objs`` argument all had the same index and the ``keys`` argument contained duplicates (:issue: +- Bug in :func:`concat` failing when all objects in ``objs`` had the same index and the ``keys`` argument contained duplicates (:issue:`43595`) Sparse ^^^^^^ From 5b860141beff0426f18b559b8d5980359759ff6a Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 19 Sep 2021 22:50:43 -0400 Subject: [PATCH 5/5] catch warning --- pandas/tests/reshape/concat/test_concat.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index cbec23769f87f..676571e419a1a 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -3,11 +3,16 @@ deque, ) from decimal import Decimal -from warnings import catch_warnings +from warnings import ( + catch_warnings, + simplefilter, +) import numpy as np import pytest +from pandas.errors import PerformanceWarning + import pandas as pd from pandas import ( DataFrame, @@ -570,7 +575,10 @@ def test_duplicate_keys_same_frame(): [(keys[0], "a"), (keys[0], "b"), (keys[1], "a"), (keys[1], "b")] ) expected = 
DataFrame(expected_values, columns=expected_columns) - tm.assert_frame_equal(result, expected) + with catch_warnings(): + # result.columns not sorted, resulting in performance warning + simplefilter("ignore", PerformanceWarning) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize(