From 68366d9194a047a5a500398c26cf65e54e0ee820 Mon Sep 17 00:00:00 2001 From: Riley Clement Date: Wed, 30 Dec 2020 13:50:17 +1100 Subject: [PATCH 1/4] BUG: GH38672 SeriesGroupBy.value_counts for categorical Unobserved categories in Series were being dropped in value_counts, which was inconsistent with Series.value_counts --- pandas/core/groupby/generic.py | 14 +++++++++++--- pandas/tests/groupby/test_value_counts.py | 7 ++++++- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 16b00735cf694..c8ab51fd9a309 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -42,6 +42,7 @@ ensure_int64, ensure_platform_int, is_bool, + is_categorical_dtype, is_integer_dtype, is_interval_dtype, is_numeric_dtype, @@ -681,9 +682,7 @@ def value_counts( from pandas.core.reshape.merge import get_join_indexers from pandas.core.reshape.tile import cut - if bins is not None and not np.iterable(bins): - # scalar bins cannot be done at top level - # in a backward compatible way + def apply_series_value_counts(): return self.apply( Series.value_counts, normalize=normalize, @@ -695,6 +694,15 @@ def value_counts( ids, _, _ = self.grouper.group_info val = self.obj._values + if bins is not None: + if not np.iterable(bins): + # scalar bins cannot be done at top level + # in a backward compatible way + return apply_series_value_counts() + elif is_categorical_dtype(val): + # GH38672 + return apply_series_value_counts() + # groupby removes null keys from groupings mask = ids != -1 ids, val = ids[mask], val[mask] diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py index c5d454baa7e7b..73f67678f3b06 100644 --- a/pandas/tests/groupby/test_value_counts.py +++ b/pandas/tests/groupby/test_value_counts.py @@ -85,8 +85,10 @@ def rebuild_index(df): tm.assert_series_equal(left.sort_index(), right.sort_index()) -def test_series_groupby_value_counts_with_grouper(): +@pytest.mark.parametrize("categorical", [True, False]) +def test_series_groupby_value_counts_with_grouper(categorical): # GH28479 + # GH38672 (categorical) df = DataFrame( { "Timestamp": [ @@ -102,6 +104,9 @@ def test_series_groupby_value_counts_with_grouper(): } ).drop([3]) + if categorical: + df["Food"] = df["Food"].astype("category") + df["Datetime"] = to_datetime(df["Timestamp"].apply(lambda t: str(t)), unit="s") dfg = df.groupby(Grouper(freq="1D", key="Datetime")) From 59718b11f221897ef678f961466380791b46dd23 Mon Sep 17 00:00:00 2001 From: Riley Clement Date: Thu, 31 Dec 2020 09:49:03 +1100 Subject: [PATCH 2/4] TST: Added separate test for GH38672 --- pandas/tests/groupby/test_value_counts.py | 45 +++++++++++++++++++---- 1 file changed, 38 insertions(+), 7 deletions(-) diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py index 73f67678f3b06..afb648d8527ca 100644 --- a/pandas/tests/groupby/test_value_counts.py +++ b/pandas/tests/groupby/test_value_counts.py @@ -9,7 +9,16 @@ import numpy as np import pytest -from pandas import DataFrame, Grouper, MultiIndex, Series, date_range, to_datetime +from pandas import ( + Categorical, + CategoricalIndex, + DataFrame, + Grouper, + MultiIndex, + Series, + date_range, + to_datetime, +) import pandas._testing as tm @@ -85,10 +94,8 @@ def rebuild_index(df): tm.assert_series_equal(left.sort_index(), right.sort_index()) -@pytest.mark.parametrize("categorical", [True, False]) -def test_series_groupby_value_counts_with_grouper(categorical): +def test_series_groupby_value_counts_with_grouper(): # GH28479 - # GH38672 (categorical) df = DataFrame( { "Timestamp": [ @@ -104,9 +111,6 @@ def test_series_groupby_value_counts_with_grouper(categorical): } ).drop([3]) - if categorical: - df["Food"] = df["Food"].astype("category") - df["Datetime"] = to_datetime(df["Timestamp"].apply(lambda t: str(t)), unit="s") dfg = df.groupby(Grouper(freq="1D", key="Datetime")) @@ -116,3 +120,30 @@ def test_series_groupby_value_counts_with_grouper(categorical): expected.index.names = result.index.names tm.assert_series_equal(result, expected) + + +def test_series_groupby_value_counts_on_categorical(): + # GH38672 + + s = Series(Categorical(["a"], categories=["a", "b"])) + result = s.groupby([0]).value_counts() + + expected = Series( + data=[1, 0], + index=MultiIndex.from_arrays( + [ + [0, 0], + CategoricalIndex( + ["a", "b"], categories=["a", "b"], ordered=False, dtype="category" + ), + ] + ), + name=0, + ) + + # Expected: + # 0 a 1 + # b 0 + # Name: 0, dtype: int64 + + tm.assert_series_equal(result, expected) From 7bfd67d6e517e371ace106794c413765881becbb Mon Sep 17 00:00:00 2001 From: Riley Clement Date: Thu, 31 Dec 2020 12:59:27 +1100 Subject: [PATCH 3/4] CLN: reordering code for GH38796 --- pandas/core/groupby/generic.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index c8ab51fd9a309..f2899a7ca704b 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -682,6 +682,9 @@ def value_counts( from pandas.core.reshape.merge import get_join_indexers from pandas.core.reshape.tile import cut + ids, _, _ = self.grouper.group_info + val = self.obj._values + def apply_series_value_counts(): return self.apply( Series.value_counts, @@ -691,9 +694,6 @@ def apply_series_value_counts(): bins=bins, ) - ids, _, _ = self.grouper.group_info - val = self.obj._values - if bins is not None: if not np.iterable(bins): # scalar bins cannot be done at top level From 49bdf8454c14adab175d227b64e9723cd1420be9 Mon Sep 17 00:00:00 2001 From: Riley Clement Date: Thu, 31 Dec 2020 13:00:37 +1100 Subject: [PATCH 4/4] DOC: whatsnew 1.3 bugfix for GH38672 --- doc/source/whatsnew/v1.3.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 83bff6d7bfb2d..53311e610d0eb 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -280,7 +280,7 @@ Plotting Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ -- +- Bug in :meth:`SeriesGroupBy.value_counts` where unobserved categories in a grouped categorical series were not tallied (:issue:`38672`) - Reshaping