Commit 0ae70b7

Fix sorting and non-sorting
1 parent dbdec9f

4 files changed (+62, −69 lines)

pandas/core/groupby/groupby.py

Lines changed: 14 additions & 21 deletions
```diff
@@ -101,7 +101,6 @@ class providing the base-class of operations.
 from pandas.core.arrays import (
     ArrowExtensionArray,
     BaseMaskedArray,
-    Categorical,
     ExtensionArray,
     FloatingArray,
     IntegerArray,
@@ -130,7 +129,6 @@ class providing the base-class of operations.
     GroupByNthSelector,
 )
 from pandas.core.indexes.api import (
-    CategoricalIndex,
     Index,
     MultiIndex,
     RangeIndex,
@@ -2806,18 +2804,20 @@ def _value_counts(
         result_series = cast(Series, gb.size())
         result_series.name = name
 
-        # GH-46357 Include non-observed categories
-        # of non-grouping columns regardless of `observed`
-        if any(
-            isinstance(grouping.grouping_vector, (Categorical, CategoricalIndex))
-            and not grouping._observed
-            for grouping in groupings
-        ):
-            levels_list = gb.grouper.levels
-            multi_index, _ = MultiIndex.from_product(
-                levels_list, names=[ping.name for ping in groupings]
-            ).sortlevel()
-            result_series = result_series.reindex(multi_index, fill_value=0)
+        if sort:
+            # Sort the values and then resort by the main grouping
+            # TODO: HACK - sort_index gets confused if index names are integers
+            names = result_series.index.names
+            result_series.index.names = range(len(names))
+            index_level = list(range(len(self.grouper.groupings)))
+            result_series = result_series.sort_values(
+                ascending=ascending, kind="stable"
+            )
+            if self.sort:
+                result_series = result_series.sort_index(
+                    level=index_level, sort_remaining=False
+                )
+            result_series.index.names = names
 
         if normalize:
             # Normalize the results by dividing by the original group sizes.
@@ -2838,13 +2838,6 @@ def _value_counts(
             # Handle groups of non-observed categories
             result_series = result_series.fillna(0.0)
 
-        if sort:
-            # Sort the values and then resort by the main grouping
-            index_level = range(len(self.grouper.groupings))
-            result_series = result_series.sort_values(ascending=ascending).sort_index(
-                level=index_level, sort_remaining=False
-            )
-
         result: Series | DataFrame
         if self.as_index:
             result = result_series
```
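The upshot of these hunks: value sorting and group re-sorting are now decoupled, with `sort` (the `value_counts` argument) controlling the by-count sort and `self.sort` (the `groupby` argument) controlling whether rows are regrouped under the original key order. A minimal sketch of the intended behavior using only the public API (toy data, not the fixtures from this commit):

```python
import pandas as pd

df = pd.DataFrame(
    {
        "key": ["a", "a", "a", "b", "b"],
        "val": ["x", "y", "y", "x", "z"],
    }
)

# value_counts(sort=True) orders rows by count; because the groupby was
# created with sort=True, rows are then regrouped by "key" without
# re-sorting the remaining levels (the sort_remaining=False idea above).
counts = df.groupby("key", sort=True).value_counts(sort=True, ascending=False)
print(counts)
# Within "a", ("a", "y") (count 2) should precede ("a", "x") (count 1),
# and the "b" rows should follow as their own block.
```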

pandas/core/groupby/grouper.py

Lines changed: 4 additions & 0 deletions
```diff
@@ -690,6 +690,10 @@ def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]:
     def codes(self) -> npt.NDArray[np.signedinteger]:
         return self._codes_and_uniques[0]
 
+    @property
+    def uniques(self) -> ArrayLike:
+        return self._codes_and_uniques[1]
+
     @cache_readonly
     def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
         uniques: ArrayLike
```
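The new `uniques` property just exposes the second element of the cached `(codes, uniques)` pair, mirroring the existing `codes` property so callers (like `result_index_and_ids` below) no longer reach into `_codes_and_uniques` directly. The pair is conceptually what `pd.factorize` returns; a small public-API illustration with assumed toy data:

```python
import pandas as pd

# codes: one integer label per row; uniques: the distinct values,
# in order of first appearance when no sort is requested.
codes, uniques = pd.factorize(pd.Series(["b", "a", "b", "c"]))
print(codes)    # [0 1 0 2]
print(uniques)  # Index(['b', 'a', 'c'], dtype='object')
```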

pandas/core/groupby/ops.py

Lines changed: 33 additions & 37 deletions
```diff
@@ -61,6 +61,9 @@
 )
 from pandas.core.series import Series
 from pandas.core.sorting import (
+    compress_group_index,
+    decons_obs_group_ids,
+    get_group_index,
     get_group_index_sorter,
     get_indexer_dict,
 )
@@ -758,51 +761,42 @@ def ids(self) -> np.ndarray:
 
     @cache_readonly
     def result_index_and_ids(self) -> tuple[Index, np.ndarray]:
-        from pandas.core.sorting import (
-            compress_group_index,
-            decons_obs_group_ids,
-            get_group_index,
-        )
-
-        codes_and_uniques = [ping._codes_and_uniques for ping in self.groupings]
-
-        codes = [e[0] for e in codes_and_uniques]
-        levels = [Index._with_infer(e[1]) for e in codes_and_uniques]
-        for k, (ping, level) in enumerate(zip(self.groupings, levels)):
-            if ping._passed_categorical:
-                # TODO: Modify in Grouping.groups instead?
-                levels[k] = level.set_categories(ping._orig_cats)
         names = self.names
-
+        codes = [ping.codes for ping in self.groupings]
+        levels = [Index._with_infer(ping.uniques) for ping in self.groupings]
         obs = [
             ping._observed or not ping._passed_categorical for ping in self.groupings
         ]
+        # When passed a categorical grouping, keep all categories
+        for k, (ping, level) in enumerate(zip(self.groupings, levels)):
+            if ping._passed_categorical:
+                levels[k] = level.set_categories(ping._orig_cats)
 
         if len(self.groupings) == 1:
             result_index = levels[0]
             result_index.name = names[0]
-            ids = codes[0]
+            ids = codes[0].astype("intp", copy=False)
             return result_index, ids
-        elif any(obs):
-            ob_codes = [e for e, o in zip(codes, obs) if o]
-            ob_levels = [e for e, o in zip(levels, obs) if o]
-            ob_names = [e for e, o in zip(names, obs) if o]
+
+        if any(obs):
+            ob_codes = [code for code, ob in zip(codes, obs) if ob]
+            ob_levels = [level for level, ob in zip(levels, obs) if ob]
+            ob_names = [name for name, ob in zip(names, obs) if ob]
 
             shape = tuple(len(level) for level in ob_levels)
             group_index = get_group_index(ob_codes, shape, sort=True, xnull=True)
             ob_ids, obs_group_ids = compress_group_index(group_index, sort=self._sort)
             ob_ids = ensure_platform_int(ob_ids)
-            ids, obs_ids = ob_ids, obs_group_ids
             ob_index_codes = decons_obs_group_ids(
-                ids, obs_ids, shape, ob_codes, xnull=True
+                ob_ids, obs_group_ids, shape, ob_codes, xnull=True
             )
-
             ob_index = MultiIndex(
                 levels=ob_levels,
                 codes=ob_index_codes,
                 names=ob_names,
                 verify_integrity=False,
             )
+
         if not all(obs):
             unob_codes = [e for e, o in zip(codes, obs) if not o]
             unob_levels = [e for e, o in zip(levels, obs) if not o]
@@ -811,7 +805,6 @@ def result_index_and_ids(self) -> tuple[Index, np.ndarray]:
             shape = tuple(len(level) for level in unob_levels)
             unob_ids = get_group_index(unob_codes, shape, sort=True, xnull=True)
             unob_ids = ensure_platform_int(unob_ids)
-
             unob_index = MultiIndex.from_product(unob_levels, names=unob_names)
 
         if all(obs):
@@ -821,32 +814,35 @@ def result_index_and_ids(self) -> tuple[Index, np.ndarray]:
             result_index = unob_index
             ids = unob_ids
         else:
-            ob_indices = [k for k, e in enumerate(obs) if e]
+            # Combine unobserved and observed parts of result_index
             unob_indices = [k for k, e in enumerate(obs) if not e]
-            _, index, inverse = np.unique(
-                unob_indices + ob_indices, return_index=True, return_inverse=True
-            )
+            ob_indices = [k for k, e in enumerate(obs) if e]
             result_index_codes = np.concatenate(
                 [
                     np.tile(unob_index.codes, len(ob_index)),
                     np.repeat(ob_index.codes, len(unob_index), axis=1),
                 ],
                 axis=0,
             )
+            _, index = np.unique(unob_indices + ob_indices, return_index=True)
             result_index = MultiIndex(
-                levels=[levels[k] for k in inverse],
+                levels=list(unob_index.levels) + list(ob_index.levels),
                 codes=result_index_codes,
-                names=[names[k] for k in inverse],
+                names=list(unob_index.names) + list(ob_index.names),
             ).reorder_levels(index)
-
             ids = len(unob_index) * ob_ids + unob_ids
-            sorter = result_index.argsort()
-            result_index = result_index.take(sorter)
-            _, inverse = np.unique(sorter, return_index=True)
-            ids = inverse.take(ids)
 
-        if len(levels) == 1:
-            result_index = result_index.get_level_values(0)
+        if self._sort:
+            sorter = result_index.argsort()
+            result_index = result_index.take(sorter)
+            _, inverse = np.unique(sorter, return_index=True)
+            ids = inverse.take(ids)
+        else:
+            ids, uniques = compress_group_index(ids, sort=False)
+            taker = np.concatenate(
+                [uniques, np.delete(np.arange(len(result_index)), uniques)]
+            )
+            result_index = result_index.take(taker)
 
         return result_index, ids
```
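Both branches of the new `if self._sort:` block are permutation bookkeeping. In the sorted branch, `np.unique(sorter, return_index=True)` yields the inverse permutation of `sorter` (for a permutation array, the first index of value `k` is the position `k` moved to), which is then used to relabel `ids`. A standalone illustration with toy arrays (not pandas internals):

```python
import numpy as np

sorter = np.array([2, 0, 1])  # take(sorter): new position 0 holds old group 2
_, inverse = np.unique(sorter, return_index=True)
print(inverse)  # [1 2 0] -- old group k now sits at position inverse[k]

ids = np.array([0, 0, 1, 2, 2])  # per-row group labels under the old numbering
print(inverse.take(ids))         # [1 1 2 0 0] -- labels under the new numbering
```

The unsorted branch does the complementary fix-up: `compress_group_index(ids, sort=False)` renumbers groups in order of first appearance, and the `taker` concatenation moves the groups that actually occur to the front of `result_index` while keeping never-observed combinations at the end.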

pandas/tests/groupby/methods/test_value_counts.py

Lines changed: 11 additions & 11 deletions
```diff
@@ -385,8 +385,8 @@ def test_against_frame_and_seriesgroupby(
     "sort, ascending, expected_rows, expected_count, expected_group_size",
     [
         (False, None, [0, 1, 2, 3, 4], [1, 1, 1, 2, 1], [1, 3, 1, 3, 1]),
-        (True, False, [4, 3, 1, 2, 0], [1, 2, 1, 1, 1], [1, 3, 3, 1, 1]),
-        (True, True, [4, 1, 3, 2, 0], [1, 1, 2, 1, 1], [1, 3, 3, 1, 1]),
+        (True, False, [3, 0, 1, 2, 4], [2, 1, 1, 1, 1], [3, 1, 3, 1, 1]),
+        (True, True, [0, 1, 2, 4, 3], [1, 1, 1, 1, 2], [1, 3, 1, 1, 3]),
     ],
 )
 def test_compound(
@@ -617,7 +617,7 @@ def test_categorical_single_grouper_with_only_observed_categories(
     )
 
     gp = education_df.astype("category").groupby(
-        "country", as_index=as_index, observed=observed
+        "country", as_index=as_index, observed=observed, sort=True
     )
     result = gp.value_counts(normalize=normalize)
 
@@ -811,19 +811,19 @@ def test_categorical_single_grouper_observed_false(
             ("FR", "female", "high"),
             ("FR", "male", "medium"),
             ("FR", "female", "low"),
-            ("FR", "male", "high"),
             ("FR", "female", "medium"),
+            ("FR", "male", "high"),
             ("US", "female", "high"),
             ("US", "male", "low"),
-            ("US", "male", "medium"),
-            ("US", "male", "high"),
-            ("US", "female", "medium"),
             ("US", "female", "low"),
-            ("ASIA", "male", "low"),
-            ("ASIA", "male", "high"),
-            ("ASIA", "female", "medium"),
-            ("ASIA", "female", "low"),
+            ("US", "female", "medium"),
+            ("US", "male", "high"),
+            ("US", "male", "medium"),
             ("ASIA", "female", "high"),
+            ("ASIA", "female", "low"),
+            ("ASIA", "female", "medium"),
+            ("ASIA", "male", "high"),
+            ("ASIA", "male", "low"),
             ("ASIA", "male", "medium"),
         ]
```
829829

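The reordered expectations encode the revised contract for a categorical grouper with `observed=False`: unobserved combinations still show up with zero counts, and with `sort=True` the result is blocked by the `country` level first instead of globally ordered by count. A rough, hedged reproduction with assumed data (heavily simplified from the `education_df` fixture these tests use):

```python
import pandas as pd

df = pd.DataFrame(
    {
        "country": ["US", "FR", "US"],
        "gender": ["male", "female", "female"],
    }
).astype("category")
# Give "country" an unobserved category, mirroring ASIA in the test fixture.
df["country"] = df["country"].cat.add_categories("ASIA")

gp = df.groupby("country", as_index=True, observed=False, sort=True)
result = gp.value_counts(sort=True)
# Every (country, gender) combination appears, zero-count ones included,
# and each country's rows stay contiguous rather than being interleaved
# by global count order.
print(result)
```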