Skip to content

Commit fce6998

Browse files
committed
passing groupby value_counts tests
1 parent 797f668 commit fce6998

File tree

4 files changed

+206
-106
lines changed

4 files changed

+206
-106
lines changed

pandas/core/algorithms.py

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -723,15 +723,15 @@ def value_counts(
723723

724724
# count, remove nulls (from the index), and use the bins
725725
result = ii.value_counts(dropna=dropna)
726+
print(f"{result=}")
726727
result.index = result.index.astype("interval")
727728
result = result.sort_index()
728729

730+
"""
729731
# if we are dropna and we have NO values
730732
if dropna and (result._values == 0).all():
731733
result = result.iloc[0:0]
732-
733-
# normalizing is by len of what gets included in the bins
734-
counts = result._values
734+
"""
735735

736736
else:
737737

@@ -740,19 +740,18 @@ def value_counts(
740740
# handle Categorical and sparse,
741741
result = Series(values)._values.value_counts(dropna=dropna)
742742
result.name = name
743-
counts = result._values
744743

745744
else:
746745
keys, counts = _value_counts_arraylike(values, dropna)
747746

748747
result = Series(counts, index=keys, name=name)
749748

750-
if sort:
751-
result = result.sort_values(ascending=ascending)
752-
753749
if normalize:
754-
result = result / float(counts.sum())
750+
counts = result._values
751+
result = result / float(max(counts.sum(), 1))
755752

753+
if sort:
754+
result = result.sort_values(ascending=ascending)
756755
return result
757756

758757

pandas/core/groupby/generic.py

Lines changed: 151 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@
5151
is_scalar,
5252
needs_i8_conversion,
5353
)
54+
from pandas.core.dtypes.dtypes import CategoricalDtype
5455
from pandas.core.dtypes.missing import isna, notna
5556

5657
from pandas.core.aggregation import (
@@ -664,16 +665,7 @@ def describe(self, **kwargs):
664665
def value_counts(
665666
self, normalize=False, sort=True, ascending=False, bins=None, dropna=True
666667
):
667-
return self.apply(
668-
Series.value_counts,
669-
normalize=normalize,
670-
sort=sort,
671-
ascending=ascending,
672-
bins=bins,
673-
dropna=dropna,
674-
)
675668

676-
"""
677669
from pandas.core.reshape.tile import cut
678670
from pandas.core.reshape.merge import _get_join_indexers
679671

@@ -687,115 +679,184 @@ def value_counts(
687679
ascending=ascending,
688680
bins=bins,
689681
)
690-
682+
keys = [k for k in self.groups]
683+
# print(f'{self.groups=}')
691684
ids, _, _ = self.grouper.group_info
685+
# print(f'{ids=}')
692686
val = self.obj._values
687+
print(f"{keys=}")
688+
codes = self.grouper.reconstructed_codes # this will track the groups
689+
print("codes: ", codes)
693690

694691
# groupby removes null keys from groupings
695692
mask = ids != -1
696693
ids, val = ids[mask], val[mask]
694+
if dropna:
695+
mask = ~np.isnan(val)
696+
if not mask.all():
697+
ids, val = ids[mask], val[mask]
698+
# codes = [code[mask] for code in codes]
697699

700+
print(f"{ids=}")
701+
print(f"{val=}")
702+
703+
print(f"{bins=}")
698704
if bins is None:
699-
lab, lev = algorithms.factorize(val, sort=True)
700-
llab = lambda lab, inc: lab[inc]
705+
val_lab, val_lev = algorithms.factorize(val, sort=True, dropna=dropna)
706+
print(f"{val_lab=}")
701707
else:
708+
# val_lab is a Categorical with categories an IntervalIndex
709+
print(f"{Series(val)=}")
710+
val_lab = cut(Series(val), bins, include_lowest=True)
711+
# cut excludes NaN from its categories, so need to manually add
712+
print(f"{val_lab=}")
713+
print((not dropna) and (val_lab.hasnans))
714+
"""if (not dropna) and (val_lab.hasnans):
715+
# val_lab =
716+
cat_nan = CategoricalDtype(val_lab.cat.add_categories('NaN').cat.categories)
717+
print(cat_nan)
718+
val_lab = val_lab.astype(cat_nan).fillna('NaN')
719+
"""
720+
print(f"{val_lab=}")
721+
val_lev = val_lab.cat.categories
722+
val_lab = val_lab.cat.codes.values
723+
print(f"{val_lab=}")
724+
if dropna:
725+
included = val_lab != -1
726+
ids, val_lab = ids[included], val_lab[included]
702727

703-
# lab is a Categorical with categories an IntervalIndex
704-
lab = cut(Series(val), bins, include_lowest=True)
705-
lev = lab.cat.categories
706-
lab = lev.take(lab.cat.codes)
707-
llab = lambda lab, inc: lab[inc]._multiindex.codes[-1]
708-
709-
if is_interval_dtype(lab.dtype):
728+
# print('1st val_lab: ', val_lab.cat.codes)
729+
# llab = lambda val_lab, inc: val_lab[inc]._multiindex.codes[-1]
730+
print(f"{val_lev=}")
731+
if is_interval_dtype(val_lab.dtype):
710732
# TODO: should we do this inside II?
711-
sorter = np.lexsort((lab.left, lab.right, ids))
733+
sorter = np.lexsort((val_lab.right, val_lab.left, ids))
712734
else:
713-
sorter = np.lexsort((lab, ids))
714-
715-
ids, lab = ids[sorter], lab[sorter]
735+
sorter = np.lexsort((val_lab, ids))
736+
ids, val_lab = ids[sorter], val_lab[sorter]
716737

738+
print("ids: ", ids)
739+
print(f"{val_lab=}")
740+
# val_lab = val_lab.values
741+
# print(f'{val_lab=}')
717742
# group boundaries are where group ids change
718-
idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]]
719-
720743
# new values are where sorted labels change
721-
lchanges = llab(lab, slice(1, None)) != llab(lab, slice(None, -1))
722-
inc = np.r_[True, lchanges]
723-
inc[idx] = True # group boundaries are also new values
724-
out = np.diff(np.nonzero(np.r_[inc, True])[0]) # value counts
725-
726-
# num. of times each group should be repeated
727-
rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx))
728-
729-
# multi-index components
730-
codes = self.grouper.reconstructed_codes
731-
codes = [rep(level_codes) for level_codes in codes] + [llab(lab, inc)]
732-
levels = [ping.group_index for ping in self.grouper.groupings] + [lev]
744+
change_ids = ids[1:] != ids[:-1]
745+
print((val_lab[1:] != val_lab[:-1]))
746+
changes = np.logical_or(change_ids, (val_lab[1:] != val_lab[:-1]))
747+
"""
748+
changes = [(ids[i] != ids[i+1]) or (val_lab[i] != val_lab[i+1])
749+
for i in range(len(ids)-1)] #((ids[1:] != ids[:-1]) or (val_lab[1:] != val_lab[:-1]))
750+
"""
751+
print(f"{changes=}")
752+
print(np.diff(np.nonzero(changes), append=len(changes))[0])
753+
changes = np.r_[True, changes]
754+
cts = np.diff(np.nonzero(np.r_[changes, True]))[0] # , append=len(changes))[0]
755+
print(f"{cts=}")
756+
val_lab = val_lab[changes]
757+
ids = ids[changes]
758+
print("ids: ", ids)
759+
760+
change_ids = (
761+
ids[1:] != ids[:-1]
762+
) # need to update now that we removed full repeats
763+
# num_id_rep = np.diff(np.nonzero(np.r_[True, chan]))
764+
print(f"{change_ids=}")
765+
print(f"{val_lab=}")
766+
767+
num_repeats = np.diff(np.nonzero(np.r_[True, change_ids, True]))[0]
768+
rep = partial(np.repeat, repeats=num_repeats)
769+
print(f"{rep=}")
770+
if (not dropna) and (-1 in val_lab):
771+
val_lev = np.r_[Index([np.nan]), val_lev]
772+
val_lab += 1
773+
levels = [ping.group_index for ping in self.grouper.groupings] + [
774+
Index(val_lev)
775+
]
776+
print(f"{levels=}")
733777
names = self.grouper.names + [self._selection_name]
734-
735-
if dropna:
736-
mask = codes[-1] != -1
737-
if mask.all():
738-
dropna = False
739-
else:
740-
out, codes = out[mask], [level_codes[mask] for level_codes in codes]
778+
print(f"{names=}")
741779

742780
if normalize:
743-
out = out.astype("float")
744-
d = np.diff(np.r_[idx, len(ids)])
745-
if dropna:
746-
m = ids[lab == -1]
747-
np.add.at(d, m, -1)
748-
acc = rep(d)[mask]
749-
else:
750-
acc = rep(d)
751-
out /= acc
752-
753-
if sort and bins is None:
754-
cat = ids[inc][mask] if dropna else ids[inc]
755-
sorter = np.lexsort((out if ascending else -out, cat))
756-
out, codes[-1] = out[sorter], codes[-1][sorter]
781+
num_vals = []
782+
ix = 0
783+
print(f"{num_repeats=}")
784+
for i, r in enumerate(num_repeats):
785+
num_vals.append(np.sum(cts[ix : ix + r]))
786+
# print(out[ix:ix+r])
787+
ix += r
788+
# print(f'{ix=}')
789+
# [np.sum(out[i:i+r]) ]
790+
print(f"{num_vals=}")
791+
print(f"{cts=}")
792+
cts = cts.astype("float")
793+
cts /= rep(num_vals) # each divisor is the number of repeats for that index
794+
print(f"{cts=}")
757795

758796
if bins is None:
797+
print("codes: ", codes)
798+
# codes = [code[changes] for code in codes]
799+
used_ids = np.unique(ids)
800+
codes = [code[used_ids] for code in codes]
801+
codes = [rep(level_codes) for level_codes in codes] + [val_lab]
802+
print(f"{codes=}")
803+
804+
if sort:
805+
indices = tuple(reversed(codes[:-1]))
806+
sorter = np.lexsort(
807+
np.r_[[val_lab], [cts if ascending else -cts], indices]
808+
) # sorts using right columns first
809+
cts = cts[sorter]
810+
codes = [code[sorter] for code in codes]
811+
print(f"{cts=}")
759812
mi = MultiIndex(
760813
levels=levels, codes=codes, names=names, verify_integrity=False
761814
)
815+
# print(f'{mi=}')
816+
if is_integer_dtype(cts):
817+
cts = ensure_int64(cts)
818+
return self.obj._constructor(cts, index=mi, name=self._selection_name)
762819

763-
if is_integer_dtype(out):
764-
out = ensure_int64(out)
765-
return self.obj._constructor(out, index=mi, name=self._selection_name)
820+
nbin = len(levels[-1])
821+
# print(f'{codes=}')
822+
print(len(cts), len(codes[0]), len(sorter))
766823

767824
# for compat. with libgroupby.value_counts need to ensure every
768825
# bin is present at every index level, null filled with zeros
769-
diff = np.zeros(len(out), dtype="bool")
770-
for level_codes in codes[:-1]:
771-
diff |= np.r_[True, level_codes[1:] != level_codes[:-1]]
772-
773-
ncat, nbin = diff.sum(), len(levels[-1])
774-
775-
left = [np.repeat(np.arange(ncat), nbin), np.tile(np.arange(nbin), ncat)]
776-
777-
right = [diff.cumsum() - 1, codes[-1]]
778-
779-
_, idx = _get_join_indexers(left, right, sort=False, how="left")
780-
out = np.where(idx != -1, out[idx], 0)
781-
826+
print(f"{ids=}")
827+
ncat = len(codes[0])
828+
# ncat = len(ids)
829+
print(f"{nbin=}")
830+
fout = np.zeros((ncat * nbin), dtype=float if normalize else np.int64)
831+
for i, ct in enumerate(cts):
832+
fout[ids[i] * nbin + val_lab[i]] = ct
833+
print(f"{fout=}", len(fout))
834+
835+
ncodes = [np.repeat(code, nbin) for code in codes]
836+
print(f"{ncodes=}")
837+
ncodes.append(np.tile(range(nbin), len(codes[0])))
838+
"""
839+
fout = cts
840+
ncodes = [rep(level_codes) for level_codes in codes] + [val_lab]
841+
"""
842+
print(f"{ncodes=}")
782843
if sort:
783-
sorter = np.lexsort((out if ascending else -out, left[0]))
784-
out, left[-1] = out[sorter], left[-1][sorter]
785-
786-
# build the multi-index w/ full levels
787-
def build_codes(lev_codes: np.ndarray) -> np.ndarray:
788-
return np.repeat(lev_codes[diff], nbin)
789-
790-
codes = [build_codes(lev_codes) for lev_codes in codes[:-1]]
791-
codes.append(left[-1])
792-
793-
mi = MultiIndex(levels=levels, codes=codes, names=names, verify_integrity=False)
794-
795-
if is_integer_dtype(out):
796-
out = ensure_int64(out)
797-
return self.obj._constructor(out, index=mi, name=self._selection_name)
798-
"""
844+
indices = tuple(reversed(ncodes[:-1]))
845+
print(f"{indices=}")
846+
# print(np.r_[[fout if ascending else -fout], indices])
847+
sorter = np.lexsort(
848+
np.r_[[fout if ascending else -fout], indices]
849+
) # sorts using right columns first
850+
# print(sorter)
851+
fout = fout[sorter]
852+
ncodes = [code[sorter] for code in ncodes]
853+
mi = MultiIndex(
854+
levels=levels, codes=ncodes, names=names, verify_integrity=False
855+
)
856+
print(f"{mi=}")
857+
if is_integer_dtype(fout):
858+
fout = ensure_int64(fout)
859+
return self.obj._constructor(fout, index=mi, name=self._selection_name)
799860

800861
def count(self) -> Series:
801862
"""

0 commit comments

Comments
 (0)