Skip to content

Commit 986fbba

Browse files
author
Marco Gorelli
committed
Avoid duplicating entire exploded column when joining back with original frame
1 parent 5b0a2a6 commit 986fbba

File tree

4 files changed

+37
-2
lines changed

4 files changed

+37
-2
lines changed

doc/source/whatsnew/v0.25.1.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ Indexing
8484
- Bug in partial-string indexing returning a NumPy array rather than a ``Series`` when indexing with a scalar like ``.loc['2015']`` (:issue:`27516`)
8585
- Break reference cycle involving :class:`Index` and other index classes to allow garbage collection of index objects without running the GC. (:issue:`27585`, :issue:`27840`)
8686
- Fix regression in assigning values to a single column of a DataFrame with a ``MultiIndex`` columns (:issue:`27841`).
87-
-
87+
- Bug in :meth:`DataFrame.explode` where the entire exploded column was duplicated when joining it back with the original frame if the frame had a non-unique index (:issue:`28005`).
8888

8989
Missing
9090
^^^^^^^

pandas/core/frame.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6240,8 +6240,10 @@ def explode(self, column: Union[str, Tuple]) -> "DataFrame":
62406240
result = self[column].explode()
62416241
return (
62426242
self.drop([column], axis=1)
6243-
.join(result)
6243+
.reset_index(drop=True) # GH 28005
6244+
.join(self[column].reset_index(drop=True).explode())
62446245
.reindex(columns=self.columns, copy=False)
6246+
.set_index(result.index) # GH 28005
62456247
)
62466248

62476249
def unstack(self, level=-1, fill_value=None):

pandas/tests/frame/test_explode.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,3 +118,28 @@ def test_usecase():
118118
index=[0, 0, 1, 1],
119119
)
120120
tm.assert_frame_equal(result, expected)
121+
122+
123+
@pytest.mark.parametrize(
124+
"df, expected",
125+
[
126+
(
127+
pd.DataFrame({"col": [[1, 2], [3, 4]]}, index=[0, 0]),
128+
pd.DataFrame({"col": [1, 2, 3, 4]}, index=[0, 0, 0, 0], dtype=object),
129+
),
130+
(
131+
pd.DataFrame(
132+
{"col": [[1, 2], [3, 4]], "other_col": ["a", "b"]}, index=[0, 0]
133+
),
134+
pd.DataFrame(
135+
{"col": [1, 2, 3, 4], "other_col": ["a", "a", "b", "b"]},
136+
index=[0, 0, 0, 0],
137+
dtype=object,
138+
),
139+
),
140+
],
141+
)
142+
def test_duplicate_index(df, expected):
143+
# GH 28005
144+
result = df.explode("col")
145+
tm.assert_frame_equal(result, expected)

pandas/tests/series/test_explode.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,3 +111,11 @@ def test_nested_EA():
111111
pd.date_range("20170101", periods=6, tz="UTC"), index=[0, 0, 0, 1, 1, 1]
112112
)
113113
tm.assert_series_equal(result, expected)
114+
115+
116+
def test_duplicate_index():
117+
# GH 28005
118+
s = pd.Series([[1, 2], [3, 4]], index=[0, 0])
119+
result = s.explode()
120+
expected = pd.Series([1, 2, 3, 4], index=[0, 0, 0, 0], dtype=object)
121+
tm.assert_series_equal(result, expected)

0 commit comments

Comments
 (0)