Skip to content

API: rename labels to codes in core/groupby #29402

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Nov 7, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 14 additions & 10 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -655,16 +655,17 @@ def value_counts(
rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx))

# multi-index components
labels = list(map(rep, self.grouper.recons_labels)) + [llab(lab, inc)]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you use a list comprehension here?

codes = self.grouper.recons_codes
codes = [rep(level_codes) for level_codes in codes] + [llab(lab, inc)]
levels = [ping.group_index for ping in self.grouper.groupings] + [lev]
names = self.grouper.names + [self._selection_name]

if dropna:
mask = labels[-1] != -1
mask = codes[-1] != -1
if mask.all():
dropna = False
else:
out, labels = out[mask], [label[mask] for label in labels]
out, codes = out[mask], [level_codes[mask] for level_codes in codes]

if normalize:
out = out.astype("float")
Expand All @@ -680,11 +681,11 @@ def value_counts(
if sort and bins is None:
cat = ids[inc][mask] if dropna else ids[inc]
sorter = np.lexsort((out if ascending else -out, cat))
out, labels[-1] = out[sorter], labels[-1][sorter]
out, codes[-1] = out[sorter], codes[-1][sorter]

if bins is None:
mi = MultiIndex(
levels=levels, codes=labels, names=names, verify_integrity=False
levels=levels, codes=codes, names=names, verify_integrity=False
)

if is_integer_dtype(out):
Expand All @@ -694,14 +695,14 @@ def value_counts(
# for compat. with libgroupby.value_counts need to ensure every
# bin is present at every index level, null filled with zeros
diff = np.zeros(len(out), dtype="bool")
for lab in labels[:-1]:
diff |= np.r_[True, lab[1:] != lab[:-1]]
for level_codes in codes[:-1]:
diff |= np.r_[True, level_codes[1:] != level_codes[:-1]]

ncat, nbin = diff.sum(), len(levels[-1])

left = [np.repeat(np.arange(ncat), nbin), np.tile(np.arange(nbin), ncat)]

right = [diff.cumsum() - 1, labels[-1]]
right = [diff.cumsum() - 1, codes[-1]]

_, idx = _get_join_indexers(left, right, sort=False, how="left")
out = np.where(idx != -1, out[idx], 0)
Expand All @@ -711,7 +712,10 @@ def value_counts(
out, left[-1] = out[sorter], left[-1][sorter]

# build the multi-index w/ full levels
codes = list(map(lambda lab: np.repeat(lab[diff], nbin), labels[:-1]))
def build_codes(lev_codes: np.ndarray) -> np.ndarray:
return np.repeat(lev_codes[diff], nbin)

codes = [build_codes(lev_codes) for lev_codes in codes[:-1]]
codes.append(left[-1])

mi = MultiIndex(levels=levels, codes=codes, names=names, verify_integrity=False)
Expand Down Expand Up @@ -758,7 +762,7 @@ def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None):
)
)
filled = getattr(self, fill_method)(limit=limit)
fill_grp = filled.groupby(self.grouper.labels)
fill_grp = filled.groupby(self.grouper.codes)
shifted = fill_grp.shift(periods=periods, freq=freq)

return (filled / shifted) - 1
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2349,7 +2349,7 @@ def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None, axis=0
)
)
filled = getattr(self, fill_method)(limit=limit)
fill_grp = filled.groupby(self.grouper.labels)
fill_grp = filled.groupby(self.grouper.codes)
shifted = fill_grp.shift(periods=periods, freq=freq)
return (filled / shifted) - 1

Expand Down
66 changes: 32 additions & 34 deletions pandas/core/groupby/grouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
split-apply-combine paradigm.
"""

from typing import Tuple
from typing import Optional, Tuple
import warnings

import numpy as np
Expand All @@ -21,6 +21,7 @@
)
from pandas.core.dtypes.generic import ABCSeries

from pandas._typing import FrameOrSeries
import pandas.core.algorithms as algorithms
from pandas.core.arrays import Categorical, ExtensionArray
import pandas.core.common as com
Expand Down Expand Up @@ -228,10 +229,10 @@ class Grouping:
----------
index : Index
grouper :
obj :
obj : Union[DataFrame, Series]
name :
level :
observed : boolean, default False
observed : bool, default False
If we are a Categorical, use the observed values
in_axis : if the Grouping is a column in self.obj and hence among
Groupby.exclusions list
Expand All @@ -240,25 +241,22 @@ class Grouping:
-------
**Attributes**:
* indices : dict of {group -> index_list}
* labels : ndarray, group labels
* ids : mapping of label -> group
* counts : array of group counts
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No attributes on Grouping are actually named ids and counts.

* codes : ndarray, group codes
* group_index : unique groups
* groups : dict of {group -> label_list}
"""

def __init__(
self,
index,
index: Index,
grouper=None,
obj=None,
obj: Optional[FrameOrSeries] = None,
name=None,
level=None,
sort=True,
observed=False,
in_axis=False,
sort: bool = True,
observed: bool = False,
in_axis: bool = False,
):

self.name = name
self.level = level
self.grouper = _convert_grouper(index, grouper)
Expand Down Expand Up @@ -290,12 +288,12 @@ def __init__(
if self.name is None:
self.name = index.names[level]

self.grouper, self._labels, self._group_index = index._get_grouper_for_level( # noqa: E501
self.grouper, self._codes, self._group_index = index._get_grouper_for_level( # noqa: E501
self.grouper, level
)

# a passed Grouper like, directly get the grouper in the same way
# as single grouper groupby, use the group_info to get labels
# as single grouper groupby, use the group_info to get codes
elif isinstance(self.grouper, Grouper):
# get the new grouper; we already have disambiguated
# what key/level refer to exactly, don't need to
Expand All @@ -308,7 +306,7 @@ def __init__(
self.grouper = grouper._get_grouper()

else:
if self.grouper is None and self.name is not None:
if self.grouper is None and self.name is not None and self.obj is not None:
self.grouper = self.obj[self.name]

elif isinstance(self.grouper, (list, tuple)):
Expand All @@ -324,7 +322,7 @@ def __init__(

# we make a CategoricalIndex out of the cat grouper
# preserving the categories / ordered attributes
self._labels = self.grouper.codes
self._codes = self.grouper.codes
if observed:
codes = algorithms.unique1d(self.grouper.codes)
codes = codes[codes != -1]
Expand Down Expand Up @@ -380,11 +378,11 @@ def __repr__(self):
def __iter__(self):
return iter(self.indices)

_labels = None
_group_index = None
_codes = None # type: np.ndarray
_group_index = None # type: Index

@property
def ngroups(self):
def ngroups(self) -> int:
return len(self.group_index)

@cache_readonly
Expand All @@ -397,38 +395,38 @@ def indices(self):
return values._reverse_indexer()

@property
def labels(self):
if self._labels is None:
self._make_labels()
return self._labels
def codes(self) -> np.ndarray:
if self._codes is None:
self._make_codes()
return self._codes

@cache_readonly
def result_index(self):
def result_index(self) -> Index:
if self.all_grouper is not None:
return recode_from_groupby(self.all_grouper, self.sort, self.group_index)
return self.group_index

@property
def group_index(self):
def group_index(self) -> Index:
if self._group_index is None:
self._make_labels()
self._make_codes()
return self._group_index

def _make_labels(self):
if self._labels is None or self._group_index is None:
def _make_codes(self) -> None:
if self._codes is None or self._group_index is None:
# we have a list of groupers
if isinstance(self.grouper, BaseGrouper):
labels = self.grouper.label_info
codes = self.grouper.codes_info
uniques = self.grouper.result_index
else:
labels, uniques = algorithms.factorize(self.grouper, sort=self.sort)
codes, uniques = algorithms.factorize(self.grouper, sort=self.sort)
uniques = Index(uniques, name=self.name)
self._labels = labels
self._codes = codes
self._group_index = uniques

@cache_readonly
def groups(self):
return self.index.groupby(Categorical.from_codes(self.labels, self.group_index))
def groups(self) -> dict:
return self.index.groupby(Categorical.from_codes(self.codes, self.group_index))


def _get_grouper(
Expand Down Expand Up @@ -678,7 +676,7 @@ def _is_label_like(val):
return isinstance(val, (str, tuple)) or (val is not None and is_scalar(val))


def _convert_grouper(axis, grouper):
def _convert_grouper(axis: Index, grouper):
if isinstance(grouper, dict):
return grouper.get
elif isinstance(grouper, Series):
Expand Down
Loading