     is_scalar,
     needs_i8_conversion,
 )
+from pandas.core.dtypes.dtypes import CategoricalDtype
 from pandas.core.dtypes.missing import isna, notna

 from pandas.core.aggregation import (
@@ -664,16 +665,7 @@ def describe(self, **kwargs):
     def value_counts(
         self, normalize=False, sort=True, ascending=False, bins=None, dropna=True
     ):
-        return self.apply(
-            Series.value_counts,
-            normalize=normalize,
-            sort=sort,
-            ascending=ascending,
-            bins=bins,
-            dropna=dropna,
-        )

-        """
         from pandas.core.reshape.tile import cut
         from pandas.core.reshape.merge import _get_join_indexers
@@ -687,115 +679,184 @@ def value_counts(
                 ascending=ascending,
                 bins=bins,
             )
-
+        keys = [k for k in self.groups]
+        # print(f'{self.groups=}')
         ids, _, _ = self.grouper.group_info
+        # print(f'{ids=}')
         val = self.obj._values
+        print(f"{keys=}")
+        codes = self.grouper.reconstructed_codes  # this will track the groups
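+        # ids holds each row's flattened group number; reconstructed_codes has one
+        # integer code array per grouping level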
+        print("codes: ", codes)

         # groupby removes null keys from groupings
         mask = ids != -1
         ids, val = ids[mask], val[mask]
+        if dropna:
+            mask = ~np.isnan(val)
+            if not mask.all():
+                ids, val = ids[mask], val[mask]
+                # codes = [code[mask] for code in codes]

+        print(f"{ids=}")
+        print(f"{val=}")
+
+        print(f"{bins=}")
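+        # without bins, factorize the raw values; with bins, cut them into intervals first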
         if bins is None:
-            lab, lev = algorithms.factorize(val, sort=True)
-            llab = lambda lab, inc: lab[inc]
+            val_lab, val_lev = algorithms.factorize(val, sort=True, dropna=dropna)
+            print(f"{val_lab=}")
         else:
+            # val_lab is a Categorical with categories an IntervalIndex
+            print(f"{Series(val)=}")
+            val_lab = cut(Series(val), bins, include_lowest=True)
+            # cut excludes NaN from its categories, so it has to be added manually
+            print(f"{val_lab=}")
+            print((not dropna) and (val_lab.hasnans))
+            """if (not dropna) and (val_lab.hasnans):
+                # val_lab =
+                cat_nan = CategoricalDtype(val_lab.cat.add_categories('NaN').cat.categories)
+                print(cat_nan)
+                val_lab = val_lab.astype(cat_nan).fillna('NaN')
+            """
+            print(f"{val_lab=}")
+            val_lev = val_lab.cat.categories
+            val_lab = val_lab.cat.codes.values
+            print(f"{val_lab=}")
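+            # cut encodes NaN and out-of-bins values with code -1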
+            if dropna:
+                included = val_lab != -1
+                ids, val_lab = ids[included], val_lab[included]

-            # lab is a Categorical with categories an IntervalIndex
-            lab = cut(Series(val), bins, include_lowest=True)
-            lev = lab.cat.categories
-            lab = lev.take(lab.cat.codes)
-            llab = lambda lab, inc: lab[inc]._multiindex.codes[-1]
-
-        if is_interval_dtype(lab.dtype):
+        # print('1st val_lab: ', val_lab.cat.codes)
+        # llab = lambda val_lab, inc: val_lab[inc]._multiindex.codes[-1]
+        print(f"{val_lev=}")
+        if is_interval_dtype(val_lab.dtype):
             # TODO: should we do this inside II?
-            sorter = np.lexsort((lab.left, lab.right, ids))
+            sorter = np.lexsort((val_lab.right, val_lab.left, ids))
         else:
-            sorter = np.lexsort((lab, ids))
-
-        ids, lab = ids[sorter], lab[sorter]
+            sorter = np.lexsort((val_lab, ids))
+        ids, val_lab = ids[sorter], val_lab[sorter]
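+        # rows are now ordered by (group id, value code), so equal pairs form consecutive runs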

+        print("ids: ", ids)
+        print(f"{val_lab=}")
+        # val_lab = val_lab.values
+        # print(f'{val_lab=}')
         # group boundaries are where group ids change
-        idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]]
-
         # new values are where sorted labels change
-        lchanges = llab(lab, slice(1, None)) != llab(lab, slice(None, -1))
-        inc = np.r_[True, lchanges]
-        inc[idx] = True  # group boundaries are also new values
-        out = np.diff(np.nonzero(np.r_[inc, True])[0])  # value counts
-
-        # num. of times each group should be repeated
-        rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx))
-
-        # multi-index components
-        codes = self.grouper.reconstructed_codes
-        codes = [rep(level_codes) for level_codes in codes] + [llab(lab, inc)]
-        levels = [ping.group_index for ping in self.grouper.groupings] + [lev]
+        change_ids = ids[1:] != ids[:-1]
+        print((val_lab[1:] != val_lab[:-1]))
+        changes = np.logical_or(change_ids, (val_lab[1:] != val_lab[:-1]))
+        """
+        changes = [(ids[i] != ids[i+1]) or (val_lab[i] != val_lab[i+1])
+                   for i in range(len(ids)-1)]  # ((ids[1:] != ids[:-1]) or (val_lab[1:] != val_lab[:-1]))
+        """
+        print(f"{changes=}")
+        print(np.diff(np.nonzero(changes), append=len(changes))[0])
+        changes = np.r_[True, changes]
+        cts = np.diff(np.nonzero(np.r_[changes, True]))[0]  # , append=len(changes))[0]
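+        # cts[k] is the length of the k-th run, i.e. the count of one (group, value) pair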
+        print(f"{cts=}")
+        val_lab = val_lab[changes]
+        ids = ids[changes]
+        print("ids: ", ids)
+
+        change_ids = (
+            ids[1:] != ids[:-1]
+        )  # need to update now that we removed full repeats
+        # num_id_rep = np.diff(np.nonzero(np.r_[True, chan]))
+        print(f"{change_ids=}")
+        print(f"{val_lab=}")
+
+        num_repeats = np.diff(np.nonzero(np.r_[True, change_ids, True]))[0]
+        rep = partial(np.repeat, repeats=num_repeats)
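+        # num_repeats[g] is the number of distinct values observed in group g,
+        # i.e. how many result rows that group's key codes must be repeated for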
+        print(f"{rep=}")
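+        # factorize/cut encode missing values as -1; when keeping NaN, add it as an
+        # explicit level and shift the value codes up by one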
+        if (not dropna) and (-1 in val_lab):
+            val_lev = np.r_[Index([np.nan]), val_lev]
+            val_lab += 1
+        levels = [ping.group_index for ping in self.grouper.groupings] + [
+            Index(val_lev)
+        ]
+        print(f"{levels=}")
         names = self.grouper.names + [self._selection_name]
-
-        if dropna:
-            mask = codes[-1] != -1
-            if mask.all():
-                dropna = False
-            else:
-                out, codes = out[mask], [level_codes[mask] for level_codes in codes]
+        print(f"{names=}")

         if normalize:
-            out = out.astype("float")
-            d = np.diff(np.r_[idx, len(ids)])
-            if dropna:
-                m = ids[lab == -1]
-                np.add.at(d, m, -1)
-                acc = rep(d)[mask]
-            else:
-                acc = rep(d)
-            out /= acc
-
-        if sort and bins is None:
-            cat = ids[inc][mask] if dropna else ids[inc]
-            sorter = np.lexsort((out if ascending else -out, cat))
-            out, codes[-1] = out[sorter], codes[-1][sorter]
+            num_vals = []
+            ix = 0
+            print(f"{num_repeats=}")
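+            # num_vals[g] will hold group g's total count, the denominator for its frequencies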
+            for i, r in enumerate(num_repeats):
+                num_vals.append(np.sum(cts[ix : ix + r]))
+                # print(out[ix:ix+r])
+                ix += r
+                # print(f'{ix=}')
+            # [np.sum(out[i:i+r])]
+            print(f"{num_vals=}")
+            print(f"{cts=}")
+            cts = cts.astype("float")
+            cts /= rep(num_vals)  # divide each count by its group's total
+            print(f"{cts=}")

         if bins is None:
+            print("codes: ", codes)
+            # codes = [code[changes] for code in codes]
+            used_ids = np.unique(ids)
+            codes = [code[used_ids] for code in codes]
+            codes = [rep(level_codes) for level_codes in codes] + [val_lab]
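+            # keep only the codes of groups that actually appear, repeat each group's
+            # key codes once per distinct value, and append the value codes as the last level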
+            print(f"{codes=}")
+
+            if sort:
+                indices = tuple(reversed(codes[:-1]))
+                sorter = np.lexsort(
+                    np.r_[[val_lab], [cts if ascending else -cts], indices]
+                )  # np.lexsort uses the last key as the primary sort key
+                cts = cts[sorter]
+                codes = [code[sorter] for code in codes]
+            print(f"{cts=}")
         mi = MultiIndex(
             levels=levels, codes=codes, names=names, verify_integrity=False
         )
+        # print(f'{mi=}')
+        if is_integer_dtype(cts):
+            cts = ensure_int64(cts)
+        return self.obj._constructor(cts, index=mi, name=self._selection_name)

-        if is_integer_dtype(out):
-            out = ensure_int64(out)
-        return self.obj._constructor(out, index=mi, name=self._selection_name)
+        nbin = len(levels[-1])
+        # print(f'{codes=}')
+        print(len(cts), len(codes[0]), len(sorter))

         # for compat. with libgroupby.value_counts need to ensure every
         # bin is present at every index level, null filled with zeros
-        diff = np.zeros(len(out), dtype="bool")
-        for level_codes in codes[:-1]:
-            diff |= np.r_[True, level_codes[1:] != level_codes[:-1]]
-
-        ncat, nbin = diff.sum(), len(levels[-1])
-
-        left = [np.repeat(np.arange(ncat), nbin), np.tile(np.arange(nbin), ncat)]
-
-        right = [diff.cumsum() - 1, codes[-1]]
-
-        _, idx = _get_join_indexers(left, right, sort=False, how="left")
-        out = np.where(idx != -1, out[idx], 0)
-
+        print(f"{ids=}")
+        ncat = len(codes[0])
+        # ncat = len(ids)
+        print(f"{nbin=}")
+        fout = np.zeros((ncat * nbin), dtype=float if normalize else np.int64)
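+        # fout is a dense (ngroups x nbin) count table, flattened row-major;
+        # (group, bin) cells that were never observed keep a zero count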
+        for i, ct in enumerate(cts):
+            fout[ids[i] * nbin + val_lab[i]] = ct
+        print(f"{fout=}", len(fout))
+
+        ncodes = [np.repeat(code, nbin) for code in codes]
+        print(f"{ncodes=}")
+        ncodes.append(np.tile(range(nbin), len(codes[0])))
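+        # repeating each group's codes nbin times and tiling the bin index yields one
+        # row of codes per cell of fout, in the same flattened order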
+        """
+        fout = cts
+        ncodes = [rep(level_codes) for level_codes in codes] + [val_lab]
+        """
+        print(f"{ncodes=}")
         if sort:
-            sorter = np.lexsort((out if ascending else -out, left[0]))
-            out, left[-1] = out[sorter], left[-1][sorter]
-
-            # build the multi-index w/ full levels
-            def build_codes(lev_codes: np.ndarray) -> np.ndarray:
-                return np.repeat(lev_codes[diff], nbin)
-
-            codes = [build_codes(lev_codes) for lev_codes in codes[:-1]]
-            codes.append(left[-1])
-
-        mi = MultiIndex(levels=levels, codes=codes, names=names, verify_integrity=False)
-
-        if is_integer_dtype(out):
-            out = ensure_int64(out)
-        return self.obj._constructor(out, index=mi, name=self._selection_name)
-        """
+            indices = tuple(reversed(ncodes[:-1]))
+            print(f"{indices=}")
+            # print(np.r_[[fout if ascending else -fout], indices])
+            sorter = np.lexsort(
+                np.r_[[fout if ascending else -fout], indices]
+            )  # np.lexsort uses the last key as the primary sort key
+            # print(sorter)
+            fout = fout[sorter]
+            ncodes = [code[sorter] for code in ncodes]
+        mi = MultiIndex(
+            levels=levels, codes=ncodes, names=names, verify_integrity=False
+        )
+        print(f"{mi=}")
+        if is_integer_dtype(fout):
+            fout = ensure_int64(fout)
+        return self.obj._constructor(fout, index=mi, name=self._selection_name)

     def count(self) -> Series:
         """
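
For reference, a minimal standalone sketch (editor's illustration, not part of the patch) of the run-length counting trick the new implementation uses; the toy ids/vals arrays are hypothetical and assumed to be already lexsorted by (id, value), as the real arrays are after the np.lexsort step above:

import numpy as np

ids = np.array([0, 0, 0, 1, 1])   # group id per row, sorted
vals = np.array([2, 2, 5, 2, 2])  # value code per row, sorted within each group

# a run starts at row 0 and wherever the (id, value) pair changes
changes = np.r_[True, (ids[1:] != ids[:-1]) | (vals[1:] != vals[:-1])]
# distance between consecutive run starts (plus an end sentinel) is the run length
counts = np.diff(np.nonzero(np.r_[changes, True]))[0]
print(counts)  # [2 1 2] -> (0, 2) twice, (0, 5) once, (1, 2) twice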