45
45
ensure_platform_int ,
46
46
is_bool ,
47
47
is_integer_dtype ,
48
- is_interval_dtype ,
49
48
is_numeric_dtype ,
50
49
is_object_dtype ,
51
50
is_scalar ,
52
51
needs_i8_conversion ,
53
52
)
54
- from pandas .core .dtypes .dtypes import CategoricalDtype
55
53
from pandas .core .dtypes .missing import isna , notna
56
54
57
55
from pandas .core .aggregation import (
61
59
validate_func_kwargs ,
62
60
)
63
61
import pandas .core .algorithms as algorithms
62
+ from pandas .core .algorithms import unique
64
63
from pandas .core .base import DataError , SpecificationError
65
64
import pandas .core .common as com
66
65
from pandas .core .construction import create_series_with_explicit_dtype
78
77
import pandas .core .indexes .base as ibase
79
78
from pandas .core .internals import BlockManager , make_block
80
79
from pandas .core .series import Series
80
+ from pandas .core .sorting import compress_group_index
81
81
from pandas .core .util .numba_ import (
82
82
NUMBA_FUNC_CACHE ,
83
83
generate_numba_func ,
@@ -667,7 +667,6 @@ def value_counts(
667
667
):
668
668
669
669
from pandas .core .reshape .tile import cut
670
- from pandas .core .reshape .merge import _get_join_indexers
671
670
672
671
if bins is not None and not np .iterable (bins ):
673
672
# scalar bins cannot be done at top level
@@ -679,14 +678,9 @@ def value_counts(
679
678
ascending = ascending ,
680
679
bins = bins ,
681
680
)
682
- keys = [k for k in self .groups ]
683
- # print(f'{self.groups=}')
684
681
ids , _ , _ = self .grouper .group_info
685
- # print(f'{ids=}')
686
682
val = self .obj ._values
687
- print (f"{ keys = } " )
688
683
codes = self .grouper .reconstructed_codes # this will track the groups
689
- print ("codes: " , codes )
690
684
691
685
# groupby removes null keys from groupings
692
686
mask = ids != - 1
@@ -695,111 +689,63 @@ def value_counts(
695
689
mask = ~ isna (val )
696
690
if not mask .all ():
697
691
ids , val = ids [mask ], val [mask ]
698
- # codes = [code[mask] for code in codes]
699
692
700
- print (f"{ ids = } " )
701
- print (f"{ val = } " )
702
-
703
- print (f"{ bins = } " )
704
693
if bins is None :
705
694
val_lab , val_lev = algorithms .factorize (val , sort = True , dropna = dropna )
706
- print (f"{ val_lab = } " )
707
695
else :
708
696
# val_lab is a Categorical with categories an IntervalIndex
709
- print (f"{ Series (val )= } " )
710
697
val_lab = cut (Series (val ), bins , include_lowest = True )
711
- # cut excludes NaN from its categories, so need to manually add
712
- print (f"{ val_lab = } " )
713
- print ((not dropna ) and (val_lab .hasnans ))
714
- """if (not dropna) and (val_lab.hasnans):
715
- # val_lab =
716
- cat_nan = CategoricalDtype(val_lab.cat.add_categories('NaN').cat.categories)
717
- print(cat_nan)
718
- val_lab = val_lab.astype(cat_nan).fillna('NaN')
719
- """
720
- print (f"{ val_lab = } " )
721
698
val_lev = val_lab .cat .categories
722
699
val_lab = val_lab .cat .codes .values
723
- print (f"{ val_lab = } " )
724
- if dropna :
725
- included = val_lab != - 1
726
- ids , val_lab = ids [included ], val_lab [included ]
727
-
728
- # print('1st val_lab: ', val_lab.cat.codes)
729
- # llab = lambda val_lab, inc: val_lab[inc]._multiindex.codes[-1]
730
- print (f"{ val_lev = } " )
731
- if is_interval_dtype (val_lab .dtype ):
732
- # TODO: should we do this inside II?
733
- sorter = np .lexsort ((val_lab .right , val_lab .left , ids ))
734
- else :
735
- sorter = np .lexsort ((val_lab , ids ))
700
+
701
+ if dropna :
702
+ included = val_lab != - 1
703
+ ids , val_lab = ids [included ], val_lab [included ]
704
+
705
+ sorter = np .lexsort ((val_lab , ids ))
736
706
ids , val_lab = ids [sorter ], val_lab [sorter ]
707
+ used_ids = unique (ids )
708
+ if max (used_ids ) >= len (
709
+ codes [0 ]
710
+ ): # this means we had something skipped from the start
711
+ used_ids = compress_group_index (used_ids )[0 ]
712
+ codes = [code [used_ids ] for code in codes ] # drop what was taken out for n/a
737
713
738
- print ("ids: " , ids )
739
- print (f"{ val_lab = } " )
740
- # val_lab = val_lab.values
741
- # print(f'{val_lab=}')
742
714
# group boundaries are where group ids change
743
715
# new values are where sorted labels change
744
716
change_ids = ids [1 :] != ids [:- 1 ]
745
- print ((val_lab [1 :] != val_lab [:- 1 ]))
746
717
changes = np .logical_or (change_ids , (val_lab [1 :] != val_lab [:- 1 ]))
747
- """
748
- changes = [(ids[i] != ids[i+1]) or (val_lab[i] != val_lab[i+1])
749
- for i in range(len(ids)-1)] #((ids[1:] != ids[:-1]) or (val_lab[1:] != val_lab[:-1]))
750
- """
751
- print (f"{ changes = } " )
752
- print (np .diff (np .nonzero (changes ), append = len (changes ))[0 ])
753
718
changes = np .r_ [True , changes ]
754
- cts = np .diff (np .nonzero (np .r_ [changes , True ]))[0 ] # , append=len(changes))[0]
755
- print (f"{ cts = } " )
756
719
val_lab = val_lab [changes ]
757
720
ids = ids [changes ]
758
- print ("ids: " , ids )
759
-
760
- change_ids = (
761
- ids [1 :] != ids [:- 1 ]
762
- ) # need to update now that we removed full repeats
763
- # num_id_rep = np.diff(np.nonzero(np.r_[True, chan]))
764
- print (f"{ change_ids = } " )
765
- print (f"{ val_lab = } " )
721
+ cts = np .diff (np .nonzero (np .r_ [changes , True ]))[0 ]
766
722
723
+ idx = np .r_ [0 , 1 + np .nonzero (change_ids )[0 ]]
724
+ rep = partial (np .repeat , repeats = np .add .reduceat (changes , idx ))
767
725
num_repeats = np .diff (np .nonzero (np .r_ [True , change_ids , True ]))[0 ]
768
- rep = partial (np .repeat , repeats = num_repeats )
769
- print (f"{ rep = } " )
726
+
727
+ change_ids = np .r_ [ # need to update now that we removed full repeats
728
+ ids [1 :] != ids [:- 1 ], True
729
+ ]
730
+
770
731
if (not dropna ) and (- 1 in val_lab ):
732
+ # in this case we need to explicitly add NaN as a level
771
733
val_lev = np .r_ [Index ([np .nan ]), val_lev ]
772
734
val_lab += 1
735
+
773
736
levels = [ping .group_index for ping in self .grouper .groupings ] + [
774
737
Index (val_lev )
775
738
]
776
- print (f"{ levels = } " )
777
739
names = self .grouper .names + [self ._selection_name ]
778
- print (f"{ names = } " )
779
740
780
741
if normalize :
781
- num_vals = []
782
- ix = 0
783
- print (f"{ num_repeats = } " )
784
- for i , r in enumerate (num_repeats ):
785
- num_vals .append (np .sum (cts [ix : ix + r ]))
786
- # print(out[ix:ix+r])
787
- ix += r
788
- # print(f'{ix=}')
789
- # [np.sum(out[i:i+r]) ]
790
- print (f"{ num_vals = } " )
791
- print (f"{ cts = } " )
792
742
cts = cts .astype ("float" )
793
- cts /= rep (num_vals ) # each divisor is the number of repeats for that index
794
- print (f"{ cts = } " )
743
+ cts /= rep (
744
+ num_repeats
745
+ ) # each divisor is the number of repeats for that index
795
746
796
747
if bins is None :
797
- print ("codes: " , codes )
798
- # codes = [code[changes] for code in codes]
799
- used_ids = np .unique (ids )
800
- # codes = [code[used_ids] for code in codes]
801
748
codes = [rep (level_codes ) for level_codes in codes ] + [val_lab ]
802
- print (f"{ codes = } " )
803
749
804
750
if sort :
805
751
indices = tuple (reversed (codes [:- 1 ]))
@@ -808,52 +754,36 @@ def value_counts(
808
754
) # sorts using right columns first
809
755
cts = cts [sorter ]
810
756
codes = [code [sorter ] for code in codes ]
811
- print ( f" { cts = } " )
757
+
812
758
mi = MultiIndex (
813
759
levels = levels , codes = codes , names = names , verify_integrity = False
814
760
)
815
- # print(f'{mi=}')
816
761
if is_integer_dtype (cts ):
817
762
cts = ensure_int64 (cts )
818
763
return self .obj ._constructor (cts , index = mi , name = self ._selection_name )
819
764
820
- nbin = len (levels [- 1 ])
821
- # print(f'{codes=}')
822
- print (len (cts ), len (codes [0 ]), len (sorter ))
823
-
824
765
# for compat. with libgroupby.value_counts need to ensure every
825
766
# bin is present at every index level, null filled with zeros
826
- print ( f" { ids = } " )
767
+ nbin = len ( levels [ - 1 ] )
827
768
ncat = len (codes [0 ])
828
- # ncat = len(ids)
829
- print (f"{ nbin = } " )
830
769
fout = np .zeros ((ncat * nbin ), dtype = float if normalize else np .int64 )
831
- for i , ct in enumerate ( cts ):
832
- fout [ ids [ i ] * nbin + val_lab [ i ]] = ct
833
- print ( f" { fout = } " , len ( fout ))
834
-
770
+ id = 0
771
+ for i , ct in enumerate ( cts ): # fill in nonzero values of fout
772
+ fout [ id * nbin + val_lab [ i ]] = cts [ i ]
773
+ id += change_ids [ i ]
835
774
ncodes = [np .repeat (code , nbin ) for code in codes ]
836
- print (f"{ ncodes = } " )
837
775
ncodes .append (np .tile (range (nbin ), len (codes [0 ])))
838
- """
839
- fout = cts
840
- ncodes = [rep(level_codes) for level_codes in codes] + [val_lab]
841
- """
842
- print (f"{ ncodes = } " )
776
+
843
777
if sort :
844
778
indices = tuple (reversed (ncodes [:- 1 ]))
845
- print (f"{ indices = } " )
846
- # print(np.r_[[fout if ascending else -fout], indices])
847
779
sorter = np .lexsort (
848
780
np .r_ [[fout if ascending else - fout ], indices ]
849
781
) # sorts using right columns first
850
- # print(sorter)
851
782
fout = fout [sorter ]
852
783
ncodes = [code [sorter ] for code in ncodes ]
853
784
mi = MultiIndex (
854
785
levels = levels , codes = ncodes , names = names , verify_integrity = False
855
786
)
856
- print (f"{ mi = } " )
857
787
if is_integer_dtype (fout ):
858
788
fout = ensure_int64 (fout )
859
789
return self .obj ._constructor (fout , index = mi , name = self ._selection_name )
0 commit comments