Skip to content

Commit c9a4383

Browse files
committed
passing all value count tests
1 parent 637a609 commit c9a4383

File tree

3 files changed

+52
-129
lines changed

3 files changed

+52
-129
lines changed

pandas/core/groupby/generic.py

Lines changed: 34 additions & 104 deletions
Original file line number | Diff line number | Diff line change
@@ -45,13 +45,11 @@
4545
ensure_platform_int,
4646
is_bool,
4747
is_integer_dtype,
48-
is_interval_dtype,
4948
is_numeric_dtype,
5049
is_object_dtype,
5150
is_scalar,
5251
needs_i8_conversion,
5352
)
54-
from pandas.core.dtypes.dtypes import CategoricalDtype
5553
from pandas.core.dtypes.missing import isna, notna
5654

5755
from pandas.core.aggregation import (
@@ -61,6 +59,7 @@
6159
validate_func_kwargs,
6260
)
6361
import pandas.core.algorithms as algorithms
62+
from pandas.core.algorithms import unique
6463
from pandas.core.base import DataError, SpecificationError
6564
import pandas.core.common as com
6665
from pandas.core.construction import create_series_with_explicit_dtype
@@ -78,6 +77,7 @@
7877
import pandas.core.indexes.base as ibase
7978
from pandas.core.internals import BlockManager, make_block
8079
from pandas.core.series import Series
80+
from pandas.core.sorting import compress_group_index
8181
from pandas.core.util.numba_ import (
8282
NUMBA_FUNC_CACHE,
8383
generate_numba_func,
@@ -667,7 +667,6 @@ def value_counts(
667667
):
668668

669669
from pandas.core.reshape.tile import cut
670-
from pandas.core.reshape.merge import _get_join_indexers
671670

672671
if bins is not None and not np.iterable(bins):
673672
# scalar bins cannot be done at top level
@@ -679,14 +678,9 @@ def value_counts(
679678
ascending=ascending,
680679
bins=bins,
681680
)
682-
keys = [k for k in self.groups]
683-
# print(f'{self.groups=}')
684681
ids, _, _ = self.grouper.group_info
685-
# print(f'{ids=}')
686682
val = self.obj._values
687-
print(f"{keys=}")
688683
codes = self.grouper.reconstructed_codes # this will track the groups
689-
print("codes: ", codes)
690684

691685
# groupby removes null keys from groupings
692686
mask = ids != -1
@@ -695,111 +689,63 @@ def value_counts(
695689
mask = ~isna(val)
696690
if not mask.all():
697691
ids, val = ids[mask], val[mask]
698-
# codes = [code[mask] for code in codes]
699692

700-
print(f"{ids=}")
701-
print(f"{val=}")
702-
703-
print(f"{bins=}")
704693
if bins is None:
705694
val_lab, val_lev = algorithms.factorize(val, sort=True, dropna=dropna)
706-
print(f"{val_lab=}")
707695
else:
708696
# val_lab is a Categorical with categories an IntervalIndex
709-
print(f"{Series(val)=}")
710697
val_lab = cut(Series(val), bins, include_lowest=True)
711-
# cut excludes NaN from its categories, so need to manually add
712-
print(f"{val_lab=}")
713-
print((not dropna) and (val_lab.hasnans))
714-
"""if (not dropna) and (val_lab.hasnans):
715-
# val_lab =
716-
cat_nan = CategoricalDtype(val_lab.cat.add_categories('NaN').cat.categories)
717-
print(cat_nan)
718-
val_lab = val_lab.astype(cat_nan).fillna('NaN')
719-
"""
720-
print(f"{val_lab=}")
721698
val_lev = val_lab.cat.categories
722699
val_lab = val_lab.cat.codes.values
723-
print(f"{val_lab=}")
724-
if dropna:
725-
included = val_lab != -1
726-
ids, val_lab = ids[included], val_lab[included]
727-
728-
# print('1st val_lab: ', val_lab.cat.codes)
729-
# llab = lambda val_lab, inc: val_lab[inc]._multiindex.codes[-1]
730-
print(f"{val_lev=}")
731-
if is_interval_dtype(val_lab.dtype):
732-
# TODO: should we do this inside II?
733-
sorter = np.lexsort((val_lab.right, val_lab.left, ids))
734-
else:
735-
sorter = np.lexsort((val_lab, ids))
700+
701+
if dropna:
702+
included = val_lab != -1
703+
ids, val_lab = ids[included], val_lab[included]
704+
705+
sorter = np.lexsort((val_lab, ids))
736706
ids, val_lab = ids[sorter], val_lab[sorter]
707+
used_ids = unique(ids)
708+
if max(used_ids) >= len(
709+
codes[0]
710+
): # this means we had something skipped from the start
711+
used_ids = compress_group_index(used_ids)[0]
712+
codes = [code[used_ids] for code in codes] # drop what was taken out for n/a
737713

738-
print("ids: ", ids)
739-
print(f"{val_lab=}")
740-
# val_lab = val_lab.values
741-
# print(f'{val_lab=}')
742714
# group boundaries are where group ids change
743715
# new values are where sorted labels change
744716
change_ids = ids[1:] != ids[:-1]
745-
print((val_lab[1:] != val_lab[:-1]))
746717
changes = np.logical_or(change_ids, (val_lab[1:] != val_lab[:-1]))
747-
"""
748-
changes = [(ids[i] != ids[i+1]) or (val_lab[i] != val_lab[i+1])
749-
for i in range(len(ids)-1)] #((ids[1:] != ids[:-1]) or (val_lab[1:] != val_lab[:-1]))
750-
"""
751-
print(f"{changes=}")
752-
print(np.diff(np.nonzero(changes), append=len(changes))[0])
753718
changes = np.r_[True, changes]
754-
cts = np.diff(np.nonzero(np.r_[changes, True]))[0] # , append=len(changes))[0]
755-
print(f"{cts=}")
756719
val_lab = val_lab[changes]
757720
ids = ids[changes]
758-
print("ids: ", ids)
759-
760-
change_ids = (
761-
ids[1:] != ids[:-1]
762-
) # need to update now that we removed full repeats
763-
# num_id_rep = np.diff(np.nonzero(np.r_[True, chan]))
764-
print(f"{change_ids=}")
765-
print(f"{val_lab=}")
721+
cts = np.diff(np.nonzero(np.r_[changes, True]))[0]
766722

723+
idx = np.r_[0, 1 + np.nonzero(change_ids)[0]]
724+
rep = partial(np.repeat, repeats=np.add.reduceat(changes, idx))
767725
num_repeats = np.diff(np.nonzero(np.r_[True, change_ids, True]))[0]
768-
rep = partial(np.repeat, repeats=num_repeats)
769-
print(f"{rep=}")
726+
727+
change_ids = np.r_[ # need to update now that we removed full repeats
728+
ids[1:] != ids[:-1], True
729+
]
730+
770731
if (not dropna) and (-1 in val_lab):
732+
# in this case we need to explicitly add NaN as a level
771733
val_lev = np.r_[Index([np.nan]), val_lev]
772734
val_lab += 1
735+
773736
levels = [ping.group_index for ping in self.grouper.groupings] + [
774737
Index(val_lev)
775738
]
776-
print(f"{levels=}")
777739
names = self.grouper.names + [self._selection_name]
778-
print(f"{names=}")
779740

780741
if normalize:
781-
num_vals = []
782-
ix = 0
783-
print(f"{num_repeats=}")
784-
for i, r in enumerate(num_repeats):
785-
num_vals.append(np.sum(cts[ix : ix + r]))
786-
# print(out[ix:ix+r])
787-
ix += r
788-
# print(f'{ix=}')
789-
# [np.sum(out[i:i+r]) ]
790-
print(f"{num_vals=}")
791-
print(f"{cts=}")
792742
cts = cts.astype("float")
793-
cts /= rep(num_vals) # each divisor is the number of repeats for that index
794-
print(f"{cts=}")
743+
cts /= rep(
744+
num_repeats
745+
) # each divisor is the number of repeats for that index
795746

796747
if bins is None:
797-
print("codes: ", codes)
798-
# codes = [code[changes] for code in codes]
799-
used_ids = np.unique(ids)
800-
# codes = [code[used_ids] for code in codes]
801748
codes = [rep(level_codes) for level_codes in codes] + [val_lab]
802-
print(f"{codes=}")
803749

804750
if sort:
805751
indices = tuple(reversed(codes[:-1]))
@@ -808,52 +754,36 @@ def value_counts(
808754
) # sorts using right columns first
809755
cts = cts[sorter]
810756
codes = [code[sorter] for code in codes]
811-
print(f"{cts=}")
757+
812758
mi = MultiIndex(
813759
levels=levels, codes=codes, names=names, verify_integrity=False
814760
)
815-
# print(f'{mi=}')
816761
if is_integer_dtype(cts):
817762
cts = ensure_int64(cts)
818763
return self.obj._constructor(cts, index=mi, name=self._selection_name)
819764

820-
nbin = len(levels[-1])
821-
# print(f'{codes=}')
822-
print(len(cts), len(codes[0]), len(sorter))
823-
824765
# for compat. with libgroupby.value_counts need to ensure every
825766
# bin is present at every index level, null filled with zeros
826-
print(f"{ids=}")
767+
nbin = len(levels[-1])
827768
ncat = len(codes[0])
828-
# ncat = len(ids)
829-
print(f"{nbin=}")
830769
fout = np.zeros((ncat * nbin), dtype=float if normalize else np.int64)
831-
for i, ct in enumerate(cts):
832-
fout[ids[i] * nbin + val_lab[i]] = ct
833-
print(f"{fout=}", len(fout))
834-
770+
id = 0
771+
for i, ct in enumerate(cts): # fill in nonzero values of fout
772+
fout[id * nbin + val_lab[i]] = cts[i]
773+
id += change_ids[i]
835774
ncodes = [np.repeat(code, nbin) for code in codes]
836-
print(f"{ncodes=}")
837775
ncodes.append(np.tile(range(nbin), len(codes[0])))
838-
"""
839-
fout = cts
840-
ncodes = [rep(level_codes) for level_codes in codes] + [val_lab]
841-
"""
842-
print(f"{ncodes=}")
776+
843777
if sort:
844778
indices = tuple(reversed(ncodes[:-1]))
845-
print(f"{indices=}")
846-
# print(np.r_[[fout if ascending else -fout], indices])
847779
sorter = np.lexsort(
848780
np.r_[[fout if ascending else -fout], indices]
849781
) # sorts using right columns first
850-
# print(sorter)
851782
fout = fout[sorter]
852783
ncodes = [code[sorter] for code in ncodes]
853784
mi = MultiIndex(
854785
levels=levels, codes=ncodes, names=names, verify_integrity=False
855786
)
856-
print(f"{mi=}")
857787
if is_integer_dtype(fout):
858788
fout = ensure_int64(fout)
859789
return self.obj._constructor(fout, index=mi, name=self._selection_name)

pandas/tests/base/test_value_counts.py

Lines changed: 16 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -191,31 +191,34 @@ def test_value_counts_bins(index_or_series):
191191
assert s.nunique() == 0
192192

193193

194-
def test_value_counts_bins_nas():
194+
@pytest.mark.parametrize("dropna", [True, False])
195+
@pytest.mark.parametrize("bins", [None, 3, [0, 1, 3, 6]])
196+
def test_value_counts_bins_nas(dropna, bins):
195197
# GH25970, handle normalizing bins with NA's properly
196198
# First test that NA's are included appropriately
197199
rand_data = np.append(
198200
np.random.randint(1, 5, 50), [np.nan] * np.random.randint(1, 20)
199201
)
200202
s = Series(rand_data)
201-
assert s.value_counts(dropna=False).index.hasnans
202-
assert not s.value_counts(dropna=True).index.hasnans
203-
assert s.value_counts(dropna=False, bins=3).index.hasnans
204-
assert not s.value_counts(dropna=True, bins=3).index.hasnans
205-
assert s.value_counts(dropna=False, bins=[0, 1, 3, 6]).index.hasnans
206-
assert not s.value_counts(dropna=True, bins=[0, 1, 3, 6]).index.hasnans
207-
208-
# then verify specific example
203+
if dropna:
204+
assert not s.value_counts(dropna=dropna, bins=bins).index.hasnans
205+
else:
206+
assert s.value_counts(dropna=dropna, bins=bins).index.hasnans
207+
208+
209+
def test_value_counts_bins_specific_na():
210+
# verify specific NA example
209211
s2 = Series([1, 2, 2, 3, 3, 3, np.nan, np.nan, 4, 5])
210212
intervals = IntervalIndex.from_breaks([0.995, 2.333, 3.667, 5.0])
211213
expected_dropna = Series([0.375, 0.375, 0.25], intervals.take([1, 0, 2]))
212-
expected_keepna_vals = np.array([0.3, 0.3, 0.2, 0.2])
213214
tm.assert_series_equal(
214215
s2.value_counts(dropna=True, normalize=True, bins=3), expected_dropna
215216
)
216-
tm.assert_numpy_array_equal(
217-
s2.value_counts(dropna=False, normalize=True, bins=3).values,
218-
expected_keepna_vals,
217+
keys = list(intervals.take([1, 0, 2]))
218+
keys.insert(2, np.nan)
219+
expected_keepna = Series([0.3, 0.3, 0.2, 0.2], keys)
220+
tm.assert_series_equal(
221+
s2.value_counts(dropna=False, normalize=True, bins=3), expected_keepna
219222
)
220223

221224

pandas/tests/groupby/test_value_counts.py

Lines changed: 2 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@
99
import numpy as np
1010
import pytest
1111

12-
from pandas import DataFrame, Grouper, MultiIndex, Series, cut, date_range, to_datetime
12+
from pandas import DataFrame, Grouper, MultiIndex, Series, date_range, to_datetime
1313
import pandas._testing as tm
1414

1515

@@ -40,7 +40,7 @@ def seed_df(seed_nans, n, m):
4040
binned = []
4141
ids = []
4242
for seed_nans in [True, False]:
43-
for n, m in product((10, 1000), (5, 20)):
43+
for n, m in product((100, 1000), (5, 20)):
4444
df = seed_df(seed_nans, n, m)
4545
bins = None, np.arange(0, max(5, df["3rd"].max()) + 1, 2)
4646
keys = "1st", "2nd", ["1st", "2nd"]
@@ -114,19 +114,9 @@ def test_groubpy_value_counts_bins():
114114
)
115115

116116
result.sort_index(inplace=True)
117-
intervals = cut(Series([0]), bins=BINS, include_lowest=True).cat.categories
118-
# groups = [(0,5), (1,5), (2,5), (3,5), (3,6)]
119-
groups = set((v[1], v[2], i) for v in values for i in intervals)
120-
# {val[:-1]: 0 for val in values}
121-
index = product([], intervals)
122-
123-
"""index = MultiIndex.from_product(
124-
[groups, sorted(intervals)], names=("key1", "key2", "score")
125-
)"""
126117
expected = Series(
127118
[1, 0, 1, 0, 0, 2, 1, 0, 0, 0, 0, 1, 0, 0, 1], result.index, name="score"
128119
)
129-
# expected = [2, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1]
130120
tm.assert_series_equal(result, expected)
131121

132122

0 commit comments

Comments
 (0)