diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 5b9e3a7dbad06..f00393e2e1ea1 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -169,6 +169,7 @@ Indexing ^^^^^^^^ - Bug in assignment using a reverse slicer (:issue:`26939`) +- Bug in :meth:`DataFrame.explode` would duplicate frame in the presence of duplicates in the index (:issue:`28010`) - Bug in reindexing a :meth:`PeriodIndex` with another type of index that contained a `Period` (:issue:`28323`) (:issue:`28337`) - Fix assignment of column via `.loc` with numpy non-ns datetime type (:issue:`27395`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f1ed3a125f60c..d3656f881a886 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6267,12 +6267,13 @@ def explode(self, column: Union[str, Tuple]) -> "DataFrame": if not self.columns.is_unique: raise ValueError("columns must be unique") - result = self[column].explode() - return ( - self.drop([column], axis=1) - .join(result) - .reindex(columns=self.columns, copy=False) - ) + df = self.reset_index(drop=True) + result = df[column].explode() + result = df.drop([column], axis=1).join(result) + result.index = self.index.take(result.index) + result = result.reindex(columns=self.columns, copy=False) + + return result def unstack(self, level=-1, fill_value=None): """ diff --git a/pandas/tests/frame/test_explode.py b/pandas/tests/frame/test_explode.py index b4330aadbfba3..c07de35f8bf34 100644 --- a/pandas/tests/frame/test_explode.py +++ b/pandas/tests/frame/test_explode.py @@ -118,3 +118,47 @@ def test_usecase(): index=[0, 0, 1, 1], ) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "input_dict, input_index, expected_dict, expected_index", + [ + ( + {"col1": [[1, 2], [3, 4]], "col2": ["foo", "bar"]}, + [0, 0], + {"col1": [1, 2, 3, 4], "col2": ["foo", "foo", "bar", "bar"]}, + [0, 0, 0, 0], + ), + ( + {"col1": [[1, 2], [3, 4]], "col2": ["foo", "bar"]}, + pd.Index([0, 0], name="my_index"), + {"col1": [1, 2, 3, 4], "col2": ["foo", "foo", "bar", "bar"]}, + pd.Index([0, 0, 0, 0], name="my_index"), + ), + ( + {"col1": [[1, 2], [3, 4]], "col2": ["foo", "bar"]}, + pd.MultiIndex.from_arrays( + [[0, 0], [1, 1]], names=["my_first_index", "my_second_index"] + ), + {"col1": [1, 2, 3, 4], "col2": ["foo", "foo", "bar", "bar"]}, + pd.MultiIndex.from_arrays( + [[0, 0, 0, 0], [1, 1, 1, 1]], + names=["my_first_index", "my_second_index"], + ), + ), + ( + {"col1": [[1, 2], [3, 4]], "col2": ["foo", "bar"]}, + pd.MultiIndex.from_arrays([[0, 0], [1, 1]], names=["my_index", None]), + {"col1": [1, 2, 3, 4], "col2": ["foo", "foo", "bar", "bar"]}, + pd.MultiIndex.from_arrays( + [[0, 0, 0, 0], [1, 1, 1, 1]], names=["my_index", None] + ), + ), + ], +) +def test_duplicate_index(input_dict, input_index, expected_dict, expected_index): + # GH 28005 + df = pd.DataFrame(input_dict, index=input_index) + result = df.explode("col1") + expected = pd.DataFrame(expected_dict, index=expected_index, dtype=object) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/series/test_explode.py b/pandas/tests/series/test_explode.py index 331546f7dc73d..e4974bd0af145 100644 --- a/pandas/tests/series/test_explode.py +++ b/pandas/tests/series/test_explode.py @@ -111,3 +111,11 @@ def test_nested_EA(): pd.date_range("20170101", periods=6, tz="UTC"), index=[0, 0, 0, 1, 1, 1] ) tm.assert_series_equal(result, expected) + + +def test_duplicate_index(): + # GH 28005 + s = pd.Series([[1, 2], [3, 4]], index=[0, 0]) + result = s.explode() + expected = pd.Series([1, 2, 3, 4], index=[0, 0, 0, 0], dtype=object) + tm.assert_series_equal(result, expected)