From 8de31ab60b9420d1cc92d756c0e9932ab402db65 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Sat, 24 Aug 2024 20:00:19 -0700 Subject: [PATCH] REF: Minimize operations in recode_for_groupby --- pandas/core/groupby/categorical.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/core/groupby/categorical.py b/pandas/core/groupby/categorical.py index 49130d91a0126..90cd8e3ffa1c7 100644 --- a/pandas/core/groupby/categorical.py +++ b/pandas/core/groupby/categorical.py @@ -46,9 +46,8 @@ def recode_for_groupby(c: Categorical, sort: bool, observed: bool) -> Categorica # In cases with c.ordered, this is equivalent to # return c.remove_unused_categories(), c - unique_codes = unique1d(c.codes) # type: ignore[no-untyped-call] + take_codes = unique1d(c.codes[c.codes != -1]) # type: ignore[no-untyped-call] - take_codes = unique_codes[unique_codes != -1] if sort: take_codes = np.sort(take_codes) @@ -67,17 +66,18 @@ def recode_for_groupby(c: Categorical, sort: bool, observed: bool) -> Categorica # sort=False should order groups in as-encountered order (GH-8868) - # xref GH:46909: Re-ordering codes faster than using (set|add|reorder)_categories - all_codes = np.arange(c.categories.nunique()) + # GH:46909: Re-ordering codes faster than using (set|add|reorder)_categories # GH 38140: exclude nan from indexer for categories unique_notnan_codes = unique1d(c.codes[c.codes != -1]) # type: ignore[no-untyped-call] if sort: unique_notnan_codes = np.sort(unique_notnan_codes) - if len(all_codes) > len(unique_notnan_codes): + if (num_cat := len(c.categories)) > len(unique_notnan_codes): # GH 13179: All categories need to be present, even if missing from the data - missing_codes = np.setdiff1d(all_codes, unique_notnan_codes, assume_unique=True) + missing_codes = np.setdiff1d( + np.arange(num_cat), unique_notnan_codes, assume_unique=True + ) take_codes = np.concatenate((unique_notnan_codes, missing_codes)) else: take_codes = unique_notnan_codes - return Categorical(c, c.unique().categories.take(take_codes)) + return Categorical(c, c.categories.take(take_codes))