Skip to content

Commit 1f797c6

Browse files
committed
Merge pull request #10142 from mortada/groupby_apply_cat
ENH: groupby.apply for Categorical should preserve categories (closes #10138)
2 parents 93150ba + c8bf1c4 commit 1f797c6

File tree

3 files changed

+32
-2
lines changed

3 files changed

+32
-2
lines changed

doc/source/whatsnew/v0.16.2.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ Bug Fixes
5858
multi-indexed (:issue:`7212`)
5959
- Bug in ``Categorical`` repr with ``display.width`` of ``None`` in Python 3 (:issue:`10087`)
6060

61-
61+
- Bug in ``groupby.apply`` aggregation for ``Categorical`` not preserving categories (:issue:`10138`)
6262
- Bug in ``mean()`` where integer dtypes can overflow (:issue:`10172`)
6363
- Bug where Panel.from_dict does not set dtype when specified (:issue:`10058`)
6464
- Bug in ``Index.union`` raises ``AttributeError`` when passing array-likes. (:issue:`10149`)

pandas/core/groupby.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2944,7 +2944,8 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
29442944
cd = 'coerce'
29452945
else:
29462946
cd = True
2947-
return result.convert_objects(convert_dates=cd)
2947+
result = result.convert_objects(convert_dates=cd)
2948+
return self._reindex_output(result)
29482949

29492950
else:
29502951
# only coerce dates if we find at least 1 datetime

pandas/tests/test_groupby.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2595,6 +2595,35 @@ def get_stats(group):
25952595
result = self.df.groupby(cats).D.apply(get_stats)
25962596
self.assertEqual(result.index.names[0], 'C')
25972597

2598+
def test_apply_categorical_data(self):
2599+
# GH 10138
2600+
for ordered in [True, False]:
2601+
dense = Categorical(list('abc'), ordered=ordered)
2602+
# 'b' is in the categories but not in the list
2603+
missing = Categorical(list('aaa'), categories=['a', 'b'], ordered=ordered)
2604+
values = np.arange(len(dense))
2605+
df = DataFrame({'missing': missing,
2606+
'dense': dense,
2607+
'values': values})
2608+
grouped = df.groupby(['missing', 'dense'])
2609+
2610+
# missing category 'b' should still exist in the output index
2611+
idx = MultiIndex.from_product([['a', 'b'], ['a', 'b', 'c']],
2612+
names=['missing', 'dense'])
2613+
expected = DataFrame([0, 1, 2, np.nan, np.nan, np.nan],
2614+
index=idx,
2615+
columns=['values'])
2616+
2617+
assert_frame_equal(grouped.apply(lambda x: np.mean(x)), expected)
2618+
assert_frame_equal(grouped.mean(), expected)
2619+
assert_frame_equal(grouped.agg(np.mean), expected)
2620+
2621+
# but for a transform-like apply (here: a constant per group) we should still get back the original index, i.e. only the observed groups
2622+
idx = MultiIndex.from_product([['a'], ['a', 'b', 'c']],
2623+
names=['missing', 'dense'])
2624+
expected = Series(1, index=idx)
2625+
assert_series_equal(grouped.apply(lambda x: 1), expected)
2626+
25982627
def test_apply_corner_cases(self):
25992628
# #535, can't use sliding iterator
26002629

0 commit comments

Comments
 (0)