From b7c87418867c7d149c9699cbce4ae9a2ccdafa96 Mon Sep 17 00:00:00 2001 From: tp Date: Mon, 4 Nov 2019 22:04:52 +0000 Subject: [PATCH 1/3] rename labels - codes in core/groupby/ --- pandas/core/groupby/generic.py | 20 +++++----- pandas/core/groupby/groupby.py | 2 +- pandas/core/groupby/grouper.py | 38 +++++++++--------- pandas/core/groupby/ops.py | 56 +++++++++++++-------------- pandas/tests/groupby/test_grouping.py | 4 +- pandas/util/testing.py | 4 +- 6 files changed, 61 insertions(+), 63 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 8512b6c3ae530..9009d1e33d496 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -655,16 +655,16 @@ def value_counts( rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx)) # multi-index components - labels = list(map(rep, self.grouper.recons_labels)) + [llab(lab, inc)] + codes = list(map(rep, self.grouper.recons_codes)) + [llab(lab, inc)] levels = [ping.group_index for ping in self.grouper.groupings] + [lev] names = self.grouper.names + [self._selection_name] if dropna: - mask = labels[-1] != -1 + mask = codes[-1] != -1 if mask.all(): dropna = False else: - out, labels = out[mask], [label[mask] for label in labels] + out, codes = out[mask], [level_codes[mask] for level_codes in codes] if normalize: out = out.astype("float") @@ -680,11 +680,11 @@ def value_counts( if sort and bins is None: cat = ids[inc][mask] if dropna else ids[inc] sorter = np.lexsort((out if ascending else -out, cat)) - out, labels[-1] = out[sorter], labels[-1][sorter] + out, codes[-1] = out[sorter], codes[-1][sorter] if bins is None: mi = MultiIndex( - levels=levels, codes=labels, names=names, verify_integrity=False + levels=levels, codes=codes, names=names, verify_integrity=False ) if is_integer_dtype(out): @@ -694,14 +694,14 @@ def value_counts( # for compat. with libgroupby.value_counts need to ensure every # bin is present at every index level, null filled with zeros diff = np.zeros(len(out), dtype="bool") - for lab in labels[:-1]: - diff |= np.r_[True, lab[1:] != lab[:-1]] + for codes_ in codes[:-1]: + diff |= np.r_[True, codes_[1:] != codes_[:-1]] ncat, nbin = diff.sum(), len(levels[-1]) left = [np.repeat(np.arange(ncat), nbin), np.tile(np.arange(nbin), ncat)] - right = [diff.cumsum() - 1, labels[-1]] + right = [diff.cumsum() - 1, codes[-1]] _, idx = _get_join_indexers(left, right, sort=False, how="left") out = np.where(idx != -1, out[idx], 0) @@ -711,7 +711,7 @@ def value_counts( out, left[-1] = out[sorter], left[-1][sorter] # build the multi-index w/ full levels - codes = list(map(lambda lab: np.repeat(lab[diff], nbin), labels[:-1])) + codes = list(map(lambda codes: np.repeat(codes[diff], nbin), codes[:-1])) codes.append(left[-1]) mi = MultiIndex(levels=levels, codes=codes, names=names, verify_integrity=False) @@ -758,7 +758,7 @@ def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None): ) ) filled = getattr(self, fill_method)(limit=limit) - fill_grp = filled.groupby(self.grouper.labels) + fill_grp = filled.groupby(self.grouper.codes) shifted = fill_grp.shift(periods=periods, freq=freq) return (filled / shifted) - 1 diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index fa4a184e8f7a4..81ba594c97391 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2349,7 +2349,7 @@ def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None, axis=0 ) ) filled = getattr(self, fill_method)(limit=limit) - fill_grp = filled.groupby(self.grouper.labels) + fill_grp = filled.groupby(self.grouper.codes) shifted = fill_grp.shift(periods=periods, freq=freq) return (filled / shifted) - 1 diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 45d2a819ae5ad..b7222fe97b3ca 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -59,7 +59,7 @@ class Grouper: `_. axis : number/name of the axis, defaults to 0 sort : bool, default to False - whether to sort the resulting labels + whether to sort the resulting codes closed : {'left' or 'right'} Closed end of interval. Only when `freq` parameter is passed. label : {'left' or 'right'} @@ -231,7 +231,7 @@ class Grouping: obj : name : level : - observed : boolean, default False + observed : bool, default False If we are a Categorical, use the observed values in_axis : if the Grouping is a column in self.obj and hence among Groupby.exclusions list @@ -240,9 +240,7 @@ class Grouping: ------- **Attributes**: * indices : dict of {group -> index_list} - * labels : ndarray, group labels - * ids : mapping of label -> group - * counts : array of group counts + * codes : ndarray, group codes * group_index : unique groups * groups : dict of {group -> label_list} """ @@ -290,12 +288,12 @@ def __init__( if self.name is None: self.name = index.names[level] - self.grouper, self._labels, self._group_index = index._get_grouper_for_level( # noqa: E501 + self.grouper, self._codes, self._group_index = index._get_grouper_for_level( # noqa: E501 self.grouper, level ) # a passed Grouper like, directly get the grouper in the same way - # as single grouper groupby, use the group_info to get labels + # as single grouper groupby, use the group_info to get codes elif isinstance(self.grouper, Grouper): # get the new grouper; we already have disambiguated # what key/level refer to exactly, don't need to @@ -324,7 +322,7 @@ def __init__( # we make a CategoricalIndex out of the cat grouper # preserving the categories / ordered attributes - self._labels = self.grouper.codes + self._codes = self.grouper.codes if observed: codes = algorithms.unique1d(self.grouper.codes) codes = codes[codes != -1] @@ -380,7 +378,7 @@ def __repr__(self): def __iter__(self): return iter(self.indices) - _labels = None + _codes = None _group_index = None @property @@ -397,10 +395,10 @@ def indices(self): return values._reverse_indexer() @property - def labels(self): - if self._labels is None: - self._make_labels() - return self._labels + def codes(self): + if self._codes is None: + self._make_codes() + return self._codes @cache_readonly def result_index(self): @@ -411,24 +409,24 @@ def result_index(self): @property def group_index(self): if self._group_index is None: - self._make_labels() + self._make_codes() return self._group_index - def _make_labels(self): - if self._labels is None or self._group_index is None: + def _make_codes(self): + if self._codes is None or self._group_index is None: # we have a list of groupers if isinstance(self.grouper, BaseGrouper): - labels = self.grouper.label_info + codes = self.grouper.codes_info uniques = self.grouper.result_index else: - labels, uniques = algorithms.factorize(self.grouper, sort=self.sort) + codes, uniques = algorithms.factorize(self.grouper, sort=self.sort) uniques = Index(uniques, name=self.name) - self._labels = labels + self._codes = codes self._group_index = uniques @cache_readonly def groups(self): - return self.index.groupby(Categorical.from_codes(self.labels, self.group_index)) + return self.index.groupby(Categorical.from_codes(self.codes, self.group_index)) def _get_grouper( diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 5bad73bf40ff5..1ad7771c732d5 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -139,7 +139,7 @@ def _get_group_keys(self): comp_ids, _, ngroups = self.group_info # provide "flattened" iterator for multi-group setting - return get_flattened_iterator(comp_ids, ngroups, self.levels, self.labels) + return get_flattened_iterator(comp_ids, ngroups, self.levels, self.codes) def apply(self, f, data, axis: int = 0): mutated = self.mutated @@ -210,13 +210,13 @@ def indices(self): if len(self.groupings) == 1: return self.groupings[0].indices else: - label_list = [ping.labels for ping in self.groupings] + codes_list = [ping.codes for ping in self.groupings] keys = [com.values_from_object(ping.group_index) for ping in self.groupings] - return get_indexer_dict(label_list, keys) + return get_indexer_dict(codes_list, keys) @property - def labels(self): - return [ping.labels for ping in self.groupings] + def codes(self): + return [ping.codes for ping in self.groupings] @property def levels(self): @@ -256,46 +256,46 @@ def is_monotonic(self) -> bool: @cache_readonly def group_info(self): - comp_ids, obs_group_ids = self._get_compressed_labels() + comp_ids, obs_group_ids = self._get_compressed_codes() ngroups = len(obs_group_ids) comp_ids = ensure_int64(comp_ids) return comp_ids, obs_group_ids, ngroups @cache_readonly - def label_info(self): - # return the labels of items in original grouped axis - labels, _, _ = self.group_info + def codes_info(self): + # return the codes of items in original grouped axis + codes, _, _ = self.group_info if self.indexer is not None: - sorter = np.lexsort((labels, self.indexer)) - labels = labels[sorter] - return labels - - def _get_compressed_labels(self): - all_labels = [ping.labels for ping in self.groupings] - if len(all_labels) > 1: - group_index = get_group_index(all_labels, self.shape, sort=True, xnull=True) + sorter = np.lexsort((codes, self.indexer)) + codes = codes[sorter] + return codes + + def _get_compressed_codes(self): + all_codes = [ping.codes for ping in self.groupings] + if len(all_codes) > 1: + group_index = get_group_index(all_codes, self.shape, sort=True, xnull=True) return compress_group_index(group_index, sort=self.sort) ping = self.groupings[0] - return ping.labels, np.arange(len(ping.group_index)) + return ping.codes, np.arange(len(ping.group_index)) @cache_readonly def ngroups(self) -> int: return len(self.result_index) @property - def recons_labels(self): + def recons_codes(self): comp_ids, obs_ids, _ = self.group_info - labels = (ping.labels for ping in self.groupings) - return decons_obs_group_ids(comp_ids, obs_ids, self.shape, labels, xnull=True) + codes = (ping.codes for ping in self.groupings) + return decons_obs_group_ids(comp_ids, obs_ids, self.shape, codes, xnull=True) @cache_readonly def result_index(self): if not self.compressed and len(self.groupings) == 1: return self.groupings[0].result_index.rename(self.names[0]) - codes = self.recons_labels + codes = self.recons_codes levels = [ping.result_index for ping in self.groupings] result = MultiIndex( levels=levels, codes=codes, verify_integrity=False, names=self.names @@ -307,9 +307,9 @@ def get_group_levels(self): return [self.groupings[0].result_index] name_list = [] - for ping, labels in zip(self.groupings, self.recons_labels): - labels = ensure_platform_int(labels) - levels = ping.result_index.take(labels) + for ping, codes in zip(self.groupings, self.recons_codes): + codes = ensure_platform_int(codes) + levels = ping.result_index.take(codes) name_list.append(levels) @@ -490,7 +490,7 @@ def _cython_operation( else: out_dtype = "object" - labels, _, _ = self.group_info + codes, _, _ = self.group_info if kind == "aggregate": result = _maybe_fill( @@ -498,7 +498,7 @@ def _cython_operation( ) counts = np.zeros(self.ngroups, dtype=np.int64) result = self._aggregate( - result, counts, values, labels, func, is_datetimelike, min_count + result, counts, values, codes, func, is_datetimelike, min_count ) elif kind == "transform": result = _maybe_fill( @@ -507,7 +507,7 @@ def _cython_operation( # TODO: min_count result = self._transform( - result, values, labels, func, is_datetimelike, **kwargs + result, values, codes, func, is_datetimelike, **kwargs ) if is_integer_dtype(result) and not is_datetimelike: diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index e1fd8d7da6833..e4edc64016567 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -559,12 +559,12 @@ def test_level_preserve_order(self, sort, labels, mframe): # GH 17537 grouped = mframe.groupby(level=0, sort=sort) exp_labels = np.array(labels, np.intp) - tm.assert_almost_equal(grouped.grouper.labels[0], exp_labels) + tm.assert_almost_equal(grouped.grouper.codes[0], exp_labels) def test_grouping_labels(self, mframe): grouped = mframe.groupby(mframe.index.get_level_values(0)) exp_labels = np.array([2, 2, 2, 0, 0, 1, 1, 3, 3, 3], dtype=np.intp) - tm.assert_almost_equal(grouped.grouper.labels[0], exp_labels) + tm.assert_almost_equal(grouped.grouper.codes[0], exp_labels) def test_list_grouper_with_nat(self): # GH 14715 diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 5a2f189ad8d10..4ba32c377a345 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -621,8 +621,8 @@ def _check_types(l, r, obj="Index"): def _get_ilevel_values(index, level): # accept level number only unique = index.levels[level] - labels = index.codes[level] - filled = take_1d(unique.values, labels, fill_value=unique._na_value) + level_codes = index.codes[level] + filled = take_1d(unique.values, level_codes, fill_value=unique._na_value) values = unique._shallow_copy(filled, name=index.names[level]) return values From 2c1a71eec2882a8867c2618560eaa4b63b4e9771 Mon Sep 17 00:00:00 2001 From: tp Date: Tue, 5 Nov 2019 00:44:12 +0000 Subject: [PATCH 2/3] Various changes --- pandas/core/groupby/generic.py | 12 ++++++++---- pandas/core/groupby/grouper.py | 18 +++++++++--------- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 9009d1e33d496..511b87dab087e 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -655,7 +655,8 @@ def value_counts( rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx)) # multi-index components - codes = list(map(rep, self.grouper.recons_codes)) + [llab(lab, inc)] + codes = self.grouper.recons_codes + codes = [rep(level_codes) for level_codes in codes] + [llab(lab, inc)] levels = [ping.group_index for ping in self.grouper.groupings] + [lev] names = self.grouper.names + [self._selection_name] @@ -694,8 +695,8 @@ def value_counts( # for compat. with libgroupby.value_counts need to ensure every # bin is present at every index level, null filled with zeros diff = np.zeros(len(out), dtype="bool") - for codes_ in codes[:-1]: - diff |= np.r_[True, codes_[1:] != codes_[:-1]] + for level_codes in codes[:-1]: + diff |= np.r_[True, level_codes[1:] != level_codes[:-1]] ncat, nbin = diff.sum(), len(levels[-1]) @@ -711,7 +712,10 @@ def value_counts( out, left[-1] = out[sorter], left[-1][sorter] # build the multi-index w/ full levels - codes = list(map(lambda codes: np.repeat(codes[diff], nbin), codes[:-1])) + def build_codes(lev_codes: np.ndarray) -> np.ndarray: + return np.repeat(lev_codes[diff], nbin) + + codes = [build_codes(lev_codes) for lev_codes in codes[:-1]] codes.append(left[-1]) mi = MultiIndex(levels=levels, codes=codes, names=names, verify_integrity=False) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index b7222fe97b3ca..6ccdacb077d32 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -59,7 +59,7 @@ class Grouper: `_. axis : number/name of the axis, defaults to 0 sort : bool, default to False - whether to sort the resulting codes + whether to sort the resulting labels closed : {'left' or 'right'} Closed end of interval. Only when `freq` parameter is passed. label : {'left' or 'right'} @@ -378,11 +378,11 @@ def __repr__(self): def __iter__(self): return iter(self.indices) - _codes = None - _group_index = None + _codes = None # type: np.ndarray + _group_index = None # type: Index @property - def ngroups(self): + def ngroups(self) -> int: return len(self.group_index) @cache_readonly @@ -395,24 +395,24 @@ def indices(self): return values._reverse_indexer() @property - def codes(self): + def codes(self) -> np.ndarray: if self._codes is None: self._make_codes() return self._codes @cache_readonly - def result_index(self): + def result_index(self) -> Index: if self.all_grouper is not None: return recode_from_groupby(self.all_grouper, self.sort, self.group_index) return self.group_index @property - def group_index(self): + def group_index(self) -> Index: if self._group_index is None: self._make_codes() return self._group_index - def _make_codes(self): + def _make_codes(self) -> None: if self._codes is None or self._group_index is None: # we have a list of groupers if isinstance(self.grouper, BaseGrouper): @@ -425,7 +425,7 @@ def _make_codes(self): self._group_index = uniques @cache_readonly - def groups(self): + def groups(self) -> dict: return self.index.groupby(Categorical.from_codes(self.codes, self.group_index)) From 785e2f9abd837169c5e047c926fc820ac494e40c Mon Sep 17 00:00:00 2001 From: tp Date: Wed, 6 Nov 2019 21:17:50 +0000 Subject: [PATCH 3/3] more changes --- pandas/core/groupby/grouper.py | 20 ++++++++++---------- pandas/core/groupby/ops.py | 25 +++++++++++++------------ 2 files changed, 23 insertions(+), 22 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 6ccdacb077d32..dc6336b17ac1e 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -3,7 +3,7 @@ split-apply-combine paradigm. """ -from typing import Tuple +from typing import Optional, Tuple import warnings import numpy as np @@ -21,6 +21,7 @@ ) from pandas.core.dtypes.generic import ABCSeries +from pandas._typing import FrameOrSeries import pandas.core.algorithms as algorithms from pandas.core.arrays import Categorical, ExtensionArray import pandas.core.common as com @@ -228,7 +229,7 @@ class Grouping: ---------- index : Index grouper : - obj : + obj Union[DataFrame, Series]: name : level : observed : bool, default False @@ -247,16 +248,15 @@ class Grouping: def __init__( self, - index, + index: Index, grouper=None, - obj=None, + obj: Optional[FrameOrSeries] = None, name=None, level=None, - sort=True, - observed=False, - in_axis=False, + sort: bool = True, + observed: bool = False, + in_axis: bool = False, ): - self.name = name self.level = level self.grouper = _convert_grouper(index, grouper) @@ -306,7 +306,7 @@ def __init__( self.grouper = grouper._get_grouper() else: - if self.grouper is None and self.name is not None: + if self.grouper is None and self.name is not None and self.obj is not None: self.grouper = self.obj[self.name] elif isinstance(self.grouper, (list, tuple)): @@ -676,7 +676,7 @@ def _is_label_like(val): return isinstance(val, (str, tuple)) or (val is not None and is_scalar(val)) -def _convert_grouper(axis, grouper): +def _convert_grouper(axis: Index, grouper): if isinstance(grouper, dict): return grouper.get elif isinstance(grouper, Series): diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 1ad7771c732d5..2c8aa1294451d 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -7,7 +7,7 @@ """ import collections -from typing import List, Optional, Type +from typing import List, Optional, Sequence, Type import numpy as np @@ -41,7 +41,7 @@ import pandas.core.common as com from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame -from pandas.core.groupby import base +from pandas.core.groupby import base, grouper from pandas.core.index import Index, MultiIndex, ensure_index from pandas.core.series import Series from pandas.core.sorting import ( @@ -62,13 +62,13 @@ class BaseGrouper: Parameters ---------- axis : Index - groupings : array of grouping + groupings : Sequence[Grouping] all the grouping instances to handle in this grouper for example for grouper list to groupby, need to pass the list - sort : boolean, default True + sort : bool, default True whether this grouper will give sorted result or not - group_keys : boolean, default True - mutated : boolean, default False + group_keys : bool, default True + mutated : bool, default False indexer : intp array, optional the indexer created by Grouper some groupers (TimeGrouper) will sort its axis and its @@ -79,16 +79,17 @@ class BaseGrouper: def __init__( self, axis: Index, - groupings, - sort=True, - group_keys=True, - mutated=False, - indexer=None, + groupings: "Sequence[grouper.Grouping]", + sort: bool = True, + group_keys: bool = True, + mutated: bool = False, + indexer: Optional[np.ndarray] = None, ): assert isinstance(axis, Index), axis + self._filter_empty_groups = self.compressed = len(groupings) != 1 self.axis = axis - self.groupings = groupings + self.groupings = groupings # type: Sequence[grouper.Grouping] self.sort = sort self.group_keys = group_keys self.mutated = mutated