Commit 0ae70b7

Fix sorting and non-sorting
1 parent dbdec9f

4 files changed (+62, −69 lines)

pandas/core/groupby/groupby.py

Lines changed: 14 additions & 21 deletions
```diff
@@ -101,7 +101,6 @@ class providing the base-class of operations.
 from pandas.core.arrays import (
     ArrowExtensionArray,
     BaseMaskedArray,
-    Categorical,
     ExtensionArray,
     FloatingArray,
     IntegerArray,
@@ -130,7 +129,6 @@ class providing the base-class of operations.
     GroupByNthSelector,
 )
 from pandas.core.indexes.api import (
-    CategoricalIndex,
     Index,
     MultiIndex,
     RangeIndex,
@@ -2806,18 +2804,20 @@ def _value_counts(
         result_series = cast(Series, gb.size())
         result_series.name = name
 
-        # GH-46357 Include non-observed categories
-        # of non-grouping columns regardless of `observed`
-        if any(
-            isinstance(grouping.grouping_vector, (Categorical, CategoricalIndex))
-            and not grouping._observed
-            for grouping in groupings
-        ):
-            levels_list = gb.grouper.levels
-            multi_index, _ = MultiIndex.from_product(
-                levels_list, names=[ping.name for ping in groupings]
-            ).sortlevel()
-            result_series = result_series.reindex(multi_index, fill_value=0)
+        if sort:
+            # Sort the values and then resort by the main grouping
+            # TODO: HACK - sort_index gets confused if index names are integers
+            names = result_series.index.names
+            result_series.index.names = range(len(names))
+            index_level = list(range(len(self.grouper.groupings)))
+            result_series = result_series.sort_values(
+                ascending=ascending, kind="stable"
+            )
+            if self.sort:
+                result_series = result_series.sort_index(
+                    level=index_level, sort_remaining=False
+                )
+            result_series.index.names = names
 
         if normalize:
             # Normalize the results by dividing by the original group sizes.
@@ -2838,13 +2838,6 @@ def _value_counts(
             # Handle groups of non-observed categories
             result_series = result_series.fillna(0.0)
 
-        if sort:
-            # Sort the values and then resort by the main grouping
-            index_level = range(len(self.grouper.groupings))
-            result_series = result_series.sort_values(ascending=ascending).sort_index(
-                level=index_level, sort_remaining=False
-            )
-
         result: Series | DataFrame
         if self.as_index:
             result = result_series
```
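The upshot of these hunks: value sorting and group re-sorting are now decoupled, with `sort` (the `value_counts` argument) controlling the by-count sort and `self.sort` (the `groupby` argument) controlling whether rows are regrouped under the original key order. A minimal sketch of the intended behavior using only the public API (toy data, not the fixtures from this commit):

```python
import pandas as pd

df = pd.DataFrame(
    {
        "key": ["a", "a", "a", "b", "b"],
        "val": ["x", "y", "y", "x", "z"],
    }
)

# value_counts(sort=True) orders rows by count; because the groupby was
# created with sort=True, rows are then regrouped by "key" without
# re-sorting the remaining levels (the sort_remaining=False idea above).
counts = df.groupby("key", sort=True).value_counts(sort=True, ascending=False)
print(counts)
# Within "a", ("a", "y") (count 2) should precede ("a", "x") (count 1),
# and the "b" rows should follow as their own block.
```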

pandas/core/groupby/grouper.py

Lines changed: 4 additions & 0 deletions
```diff
@@ -690,6 +690,10 @@ def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]:
     def codes(self) -> npt.NDArray[np.signedinteger]:
         return self._codes_and_uniques[0]
 
+    @property
+    def uniques(self) -> ArrayLike:
+        return self._codes_and_uniques[1]
+
     @cache_readonly
     def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
         uniques: ArrayLike
```
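The new `uniques` property just exposes the second element of the cached `(codes, uniques)` pair, mirroring the existing `codes` property so callers (like `result_index_and_ids` below) no longer reach into `_codes_and_uniques` directly. The pair is conceptually what `pd.factorize` returns; a small public-API illustration with assumed toy data:

```python
import pandas as pd

# codes: one integer label per row; uniques: the distinct values,
# in order of first appearance when no sort is requested.
codes, uniques = pd.factorize(pd.Series(["b", "a", "b", "c"]))
print(codes)    # [0 1 0 2]
print(uniques)  # Index(['b', 'a', 'c'], dtype='object')
```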

pandas/core/groupby/ops.py

Lines changed: 33 additions & 37 deletions
```diff
@@ -61,6 +61,9 @@
 )
 from pandas.core.series import Series
 from pandas.core.sorting import (
+    compress_group_index,
+    decons_obs_group_ids,
+    get_group_index,
     get_group_index_sorter,
     get_indexer_dict,
 )
@@ -758,51 +761,42 @@ def ids(self) -> np.ndarray:
 
     @cache_readonly
     def result_index_and_ids(self) -> tuple[Index, np.ndarray]:
-        from pandas.core.sorting import (
-            compress_group_index,
-            decons_obs_group_ids,
-            get_group_index,
-        )
-
-        codes_and_uniques = [ping._codes_and_uniques for ping in self.groupings]
-
-        codes = [e[0] for e in codes_and_uniques]
-        levels = [Index._with_infer(e[1]) for e in codes_and_uniques]
-        for k, (ping, level) in enumerate(zip(self.groupings, levels)):
-            if ping._passed_categorical:
-                # TODO: Modify in Grouping.groups instead?
-                levels[k] = level.set_categories(ping._orig_cats)
         names = self.names
-
+        codes = [ping.codes for ping in self.groupings]
+        levels = [Index._with_infer(ping.uniques) for ping in self.groupings]
         obs = [
             ping._observed or not ping._passed_categorical for ping in self.groupings
         ]
+        # When passed a categorical grouping, keep all categories
+        for k, (ping, level) in enumerate(zip(self.groupings, levels)):
+            if ping._passed_categorical:
+                levels[k] = level.set_categories(ping._orig_cats)
 
         if len(self.groupings) == 1:
             result_index = levels[0]
             result_index.name = names[0]
-            ids = codes[0]
+            ids = codes[0].astype("intp", copy=False)
             return result_index, ids
-        elif any(obs):
-            ob_codes = [e for e, o in zip(codes, obs) if o]
-            ob_levels = [e for e, o in zip(levels, obs) if o]
-            ob_names = [e for e, o in zip(names, obs) if o]
+
+        if any(obs):
+            ob_codes = [code for code, ob in zip(codes, obs) if ob]
+            ob_levels = [level for level, ob in zip(levels, obs) if ob]
+            ob_names = [name for name, ob in zip(names, obs) if ob]
 
             shape = tuple(len(level) for level in ob_levels)
             group_index = get_group_index(ob_codes, shape, sort=True, xnull=True)
             ob_ids, obs_group_ids = compress_group_index(group_index, sort=self._sort)
             ob_ids = ensure_platform_int(ob_ids)
-            ids, obs_ids = ob_ids, obs_group_ids
             ob_index_codes = decons_obs_group_ids(
-                ids, obs_ids, shape, ob_codes, xnull=True
+                ob_ids, obs_group_ids, shape, ob_codes, xnull=True
             )
-
             ob_index = MultiIndex(
                 levels=ob_levels,
                 codes=ob_index_codes,
                 names=ob_names,
                 verify_integrity=False,
             )
+
         if not all(obs):
             unob_codes = [e for e, o in zip(codes, obs) if not o]
             unob_levels = [e for e, o in zip(levels, obs) if not o]
@@ -811,7 +805,6 @@ def result_index_and_ids(self) -> tuple[Index, np.ndarray]:
             shape = tuple(len(level) for level in unob_levels)
             unob_ids = get_group_index(unob_codes, shape, sort=True, xnull=True)
             unob_ids = ensure_platform_int(unob_ids)
-
             unob_index = MultiIndex.from_product(unob_levels, names=unob_names)
 
         if all(obs):
@@ -821,32 +814,35 @@ def result_index_and_ids(self) -> tuple[Index, np.ndarray]:
             result_index = unob_index
             ids = unob_ids
         else:
-            ob_indices = [k for k, e in enumerate(obs) if e]
+            # Combine unobserved and observed parts of result_index
             unob_indices = [k for k, e in enumerate(obs) if not e]
-            _, index, inverse = np.unique(
-                unob_indices + ob_indices, return_index=True, return_inverse=True
-            )
+            ob_indices = [k for k, e in enumerate(obs) if e]
             result_index_codes = np.concatenate(
                 [
                     np.tile(unob_index.codes, len(ob_index)),
                     np.repeat(ob_index.codes, len(unob_index), axis=1),
                 ],
                 axis=0,
             )
+            _, index = np.unique(unob_indices + ob_indices, return_index=True)
             result_index = MultiIndex(
-                levels=[levels[k] for k in inverse],
+                levels=list(unob_index.levels) + list(ob_index.levels),
                 codes=result_index_codes,
-                names=[names[k] for k in inverse],
+                names=list(unob_index.names) + list(ob_index.names),
             ).reorder_levels(index)
-
             ids = len(unob_index) * ob_ids + unob_ids
-            sorter = result_index.argsort()
-            result_index = result_index.take(sorter)
-            _, inverse = np.unique(sorter, return_index=True)
-            ids = inverse.take(ids)
 
-        if len(levels) == 1:
-            result_index = result_index.get_level_values(0)
+        if self._sort:
+            sorter = result_index.argsort()
+            result_index = result_index.take(sorter)
+            _, inverse = np.unique(sorter, return_index=True)
+            ids = inverse.take(ids)
+        else:
+            ids, uniques = compress_group_index(ids, sort=False)
+            taker = np.concatenate(
+                [uniques, np.delete(np.arange(len(result_index)), uniques)]
+            )
+            result_index = result_index.take(taker)
 
         return result_index, ids
```
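Both branches of the new `if self._sort:` block are permutation bookkeeping. In the sorted branch, `np.unique(sorter, return_index=True)` yields the inverse permutation of `sorter` (for a permutation array, the first index of value `k` is the position `k` moved to), which is then used to relabel `ids`. A standalone illustration with toy arrays (not pandas internals):

```python
import numpy as np

sorter = np.array([2, 0, 1])  # take(sorter): new position 0 holds old group 2
_, inverse = np.unique(sorter, return_index=True)
print(inverse)  # [1 2 0] -- old group k now sits at position inverse[k]

ids = np.array([0, 0, 1, 2, 2])  # per-row group labels under the old numbering
print(inverse.take(ids))         # [1 1 2 0 0] -- labels under the new numbering
```

The unsorted branch does the complementary fix-up: `compress_group_index(ids, sort=False)` renumbers groups in order of first appearance, and the `taker` concatenation moves the groups that actually occur to the front of `result_index` while keeping never-observed combinations at the end.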

pandas/tests/groupby/methods/test_value_counts.py

Lines changed: 11 additions & 11 deletions
```diff
@@ -385,8 +385,8 @@ def test_against_frame_and_seriesgroupby(
     "sort, ascending, expected_rows, expected_count, expected_group_size",
     [
         (False, None, [0, 1, 2, 3, 4], [1, 1, 1, 2, 1], [1, 3, 1, 3, 1]),
-        (True, False, [4, 3, 1, 2, 0], [1, 2, 1, 1, 1], [1, 3, 3, 1, 1]),
-        (True, True, [4, 1, 3, 2, 0], [1, 1, 2, 1, 1], [1, 3, 3, 1, 1]),
+        (True, False, [3, 0, 1, 2, 4], [2, 1, 1, 1, 1], [3, 1, 3, 1, 1]),
+        (True, True, [0, 1, 2, 4, 3], [1, 1, 1, 1, 2], [1, 3, 1, 1, 3]),
     ],
 )
 def test_compound(
@@ -617,7 +617,7 @@ def test_categorical_single_grouper_with_only_observed_categories(
     )
 
     gp = education_df.astype("category").groupby(
-        "country", as_index=as_index, observed=observed
+        "country", as_index=as_index, observed=observed, sort=True
     )
     result = gp.value_counts(normalize=normalize)
 
@@ -811,19 +811,19 @@ def test_categorical_single_grouper_observed_false(
             ("FR", "female", "high"),
             ("FR", "male", "medium"),
             ("FR", "female", "low"),
-            ("FR", "male", "high"),
             ("FR", "female", "medium"),
+            ("FR", "male", "high"),
             ("US", "female", "high"),
             ("US", "male", "low"),
-            ("US", "male", "medium"),
-            ("US", "male", "high"),
-            ("US", "female", "medium"),
             ("US", "female", "low"),
-            ("ASIA", "male", "low"),
-            ("ASIA", "male", "high"),
-            ("ASIA", "female", "medium"),
-            ("ASIA", "female", "low"),
+            ("US", "female", "medium"),
+            ("US", "male", "high"),
+            ("US", "male", "medium"),
             ("ASIA", "female", "high"),
+            ("ASIA", "female", "low"),
+            ("ASIA", "female", "medium"),
+            ("ASIA", "male", "high"),
+            ("ASIA", "male", "low"),
             ("ASIA", "male", "medium"),
         ]
```
829829

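The reordered expectations encode the revised contract for a categorical grouper with `observed=False`: unobserved combinations still show up with zero counts, and with `sort=True` the result is blocked by the `country` level first instead of globally ordered by count. A rough, hedged reproduction with assumed data (heavily simplified from the `education_df` fixture these tests use):

```python
import pandas as pd

df = pd.DataFrame(
    {
        "country": ["US", "FR", "US"],
        "gender": ["male", "female", "female"],
    }
).astype("category")
# Give "country" an unobserved category, mirroring ASIA in the test fixture.
df["country"] = df["country"].cat.add_categories("ASIA")

gp = df.groupby("country", as_index=True, observed=False, sort=True)
result = gp.value_counts(sort=True)
# Every (country, gender) combination appears, zero-count ones included,
# and each country's rows stay contiguous rather than being interleaved
# by global count order.
print(result)
```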