Avoid duplicating the entire exploded column when DataFrame.explode joins it back to the original frame

Marco Gorelli · Marco Gorelli · commit 986fbbaac732 · 2019-08-19T13:43:07.000+01:00
diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst
@@ -84,7 +84,7 @@ Indexing
 - Bug in partial-string indexing returning a NumPy array rather than a ``Series`` when indexing with a scalar like ``.loc['2015']`` (:issue:`27516`)
 - Break reference cycle involving :class:`Index` and other index classes to allow garbage collection of index objects without running the GC. (:issue:`27585`, :issue:`27840`)
 - Fix regression in assigning values to a single column of a DataFrame with a ``MultiIndex`` columns (:issue:`27841`).
--
+- Bug in :meth:`DataFrame.explode` where the entire exploded column was duplicated when joined back to the original frame if the index contained duplicate labels (:issue:`28005`).
 
 Missing
 ^^^^^^^
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -6240,8 +6240,10 @@ def explode(self, column: Union[str, Tuple]) -> "DataFrame":
         result = self[column].explode()
         return (
             self.drop([column], axis=1)
-            .join(result)
+            .reset_index(drop=True)  # GH 28005
+            .join(self[column].reset_index(drop=True).explode())
             .reindex(columns=self.columns, copy=False)
+            .set_index(result.index)  # GH 28005
         )
 
     def unstack(self, level=-1, fill_value=None):
diff --git a/pandas/tests/frame/test_explode.py b/pandas/tests/frame/test_explode.py
@@ -118,3 +118,28 @@ def test_usecase():
         index=[0, 0, 1, 1],
     )
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "df, expected",
+    [
+        (
+            pd.DataFrame({"col": [[1, 2], [3, 4]]}, index=[0, 0]),
+            pd.DataFrame({"col": [1, 2, 3, 4]}, index=[0, 0, 0, 0], dtype=object),
+        ),
+        (
+            pd.DataFrame(
+                {"col": [[1, 2], [3, 4]], "other_col": ["a", "b"]}, index=[0, 0]
+            ),
+            pd.DataFrame(
+                {"col": [1, 2, 3, 4], "other_col": ["a", "a", "b", "b"]},
+                index=[0, 0, 0, 0],
+                dtype=object,
+            ),
+        ),
+    ],
+)
+def test_duplicate_index(df, expected):
+    # GH 28005
+    result = df.explode("col")
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/series/test_explode.py b/pandas/tests/series/test_explode.py
@@ -111,3 +111,11 @@ def test_nested_EA():
         pd.date_range("20170101", periods=6, tz="UTC"), index=[0, 0, 0, 1, 1, 1]
     )
     tm.assert_series_equal(result, expected)
+
+
+def test_duplicate_index():
+    # GH 28005
+    s = pd.Series([[1, 2], [3, 4]], index=[0, 0])
+    result = s.explode()
+    expected = pd.Series([1, 2, 3, 4], index=[0, 0, 0, 0], dtype=object)
+    tm.assert_series_equal(result, expected)