From 9e51ab09a347dbbc9a0734dc6d76387a20abd091 Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Mon, 7 Mar 2022 19:19:15 +0000 Subject: [PATCH 1/4] REF: Unify use of `factorize()` in groupby Grouping._codes_and_uniques and BaseGrouper._get_compressed_codes were both semantically doing the same thing as core.algorithms.factorize(), so rename them so that it's just a bit easier to follow. Per a review comment in https://github.com/pandas-dev/pandas/pull/46207 --- pandas/core/groupby/grouper.py | 9 +++++---- pandas/core/groupby/ops.py | 6 +++--- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 3bffe59905a69..5afc2668307ed 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -619,7 +619,7 @@ def codes(self) -> npt.NDArray[np.signedinteger]: # _codes is set in __init__ for MultiIndex cases return self._codes - return self._codes_and_uniques[0] + return self._factorize[0] @cache_readonly def group_arraylike(self) -> ArrayLike: @@ -635,7 +635,7 @@ def group_arraylike(self) -> ArrayLike: # retain dtype for categories, including unobserved ones return self.result_index._values - return self._codes_and_uniques[1] + return self._factorize[1] @cache_readonly def result_index(self) -> Index: @@ -653,11 +653,12 @@ def group_index(self) -> Index: # _group_index is set in __init__ for MultiIndex cases return self._group_index - uniques = self._codes_and_uniques[1] + uniques = self._factorize[1] return Index._with_infer(uniques, name=self.name) @cache_readonly - def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]: + def _factorize(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]: + """Analogous to core.algorithms.factorize""" if self._passed_categorical: # we make a CategoricalIndex out of the cat grouper # preserving the categories / ordered attributes diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index e4e42e7a1178e..c9381652da970 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -878,7 +878,7 @@ def has_dropped_na(self) -> bool: @cache_readonly def group_info(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: - comp_ids, obs_group_ids = self._get_compressed_codes() + comp_ids, obs_group_ids = self._factorize() ngroups = len(obs_group_ids) comp_ids = ensure_platform_int(comp_ids) @@ -899,10 +899,10 @@ def codes_info(self) -> npt.NDArray[np.intp]: return ids @final - def _get_compressed_codes( + def _factorize( self, ) -> tuple[npt.NDArray[np.signedinteger], npt.NDArray[np.intp]]: - # The first returned ndarray may have any signed integer dtype + """Analogous to core.algorithms.factorize""" if len(self.groupings) > 1: group_index = get_group_index(self.codes, self.shape, sort=True, xnull=True) return compress_group_index(group_index, sort=self._sort) From 3d80d3de78350ae0a7656709a5423ba798d426ba Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Mon, 7 Mar 2022 20:53:47 +0000 Subject: [PATCH 2/4] TYP: Add Literal annotations to TimeGrouper --- pandas/core/resample.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 53d75255da536..b6a0c988fa69a 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1466,14 +1466,14 @@ def __init__( self, freq="Min", closed: Literal["left", "right"] | None = None, - label: str | None = None, + label: Literal["left", "right"] | None = None, how="mean", axis=0, fill_method=None, limit=None, loffset=None, kind: str | None = None, - convention: str | None = None, + convention: Literal["start", "end", "e", "s"] | None = None, base: int | None = None, origin: str | TimestampConvertibleTypes = "start_day", offset: TimedeltaConvertibleTypes | None = None, @@ -1519,8 +1519,7 @@ def __init__( self.label = label self.kind = kind - self.convention = convention or "E" - self.convention = self.convention.lower() + self.convention = convention or "e" self.how = how self.fill_method = fill_method From 28a797388dcfa2a1d13ee0e7e604d5ac0d655a36 Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Mon, 7 Mar 2022 21:02:08 +0000 Subject: [PATCH 3/4] TYP: Fix annotation of Grouper._set_grouper The return value is never used. Don't pretend that it is. --- pandas/core/groupby/grouper.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 5afc2668307ed..260acaf954218 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -339,7 +339,7 @@ def _get_grouper( return self.binner, self.grouper, self.obj # type: ignore[return-value] @final - def _set_grouper(self, obj: NDFrame, sort: bool = False): + def _set_grouper(self, obj: NDFrame, sort: bool = False) -> None: """ given an object and the specifications, setup the internal grouper for this particular specification @@ -413,7 +413,6 @@ def _set_grouper(self, obj: NDFrame, sort: bool = False): # "NDFrameT", variable has type "None") self.obj = obj # type: ignore[assignment] self._gpr_index = ax - return self._gpr_index @final @property From 8086c31a42323061c0209d479dc583d82610bab8 Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Mon, 7 Mar 2022 22:36:05 +0000 Subject: [PATCH 4/4] BUG: Include "dropna" in Grouper._attributes and move dropna up out of the "generated" attribute section into the "source" attribute section --- pandas/core/groupby/grouper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 260acaf954218..3869c30374bcf 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -263,7 +263,7 @@ class Grouper: _gpr_index: Index | None _grouper: Index | None - _attributes: tuple[str, ...] = ("key", "level", "freq", "axis", "sort") + _attributes: tuple[str, ...] = ("key", "level", "freq", "axis", "sort", "dropna") def __new__(cls, *args, **kwargs): if kwargs.get("freq") is not None: @@ -287,6 +287,7 @@ def __init__( self.freq = freq self.axis = axis self.sort = sort + self.dropna = dropna self.grouper = None self._gpr_index = None @@ -295,7 +296,6 @@ def __init__( self.binner = None self._grouper = None self._indexer = None - self.dropna = dropna @final @property