From 800a2c9a092bd958d34d17d90773cd2c40a29d59 Mon Sep 17 00:00:00 2001 From: Oliver Hofkens Date: Mon, 18 Nov 2019 15:13:18 +0100 Subject: [PATCH 01/17] TST: Added failing test for GH17605 --- pandas/tests/groupby/test_categorical.py | 38 +++++++++++++++++------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 663e03aa1bc81..645bce59af47f 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -4,19 +4,11 @@ import numpy as np import pytest -from pandas.compat import PY37 - import pandas as pd -from pandas import ( - Categorical, - CategoricalIndex, - DataFrame, - Index, - MultiIndex, - Series, - qcut, -) import pandas.util.testing as tm +from pandas import (Categorical, CategoricalIndex, DataFrame, Index, + MultiIndex, Series, qcut) +from pandas.compat import PY37 def cartesian_product_for_groupers(result, args, names): @@ -1252,3 +1244,27 @@ def test_get_nonexistent_category(): {"var": [rows.iloc[-1]["var"]], "val": [rows.iloc[-1]["vau"]]} ) ) + + +@pytest.mark.parametrize("aggregation", [ + "sum", + "mean", + "min", + "count", +]) +def test_series_groupby_on_2_categoricals_unobserved(aggregation): + # GH 17605 + df = pd.DataFrame({ + "cat_1": pd.Categorical(list("AABB"), categories=list("ABCD")), + "cat_2": pd.Categorical(list("AB") * 2, categories=list("ABCD")), + "value": [0.1] * 4 + }) + + # Expect 1 observation for each combination of categories + expected_length = len(df["cat_1"].cat.categories) * len(df["cat_2"].cat.categories) + + series_groupby = df.groupby(["cat_1", "cat_2"], observed=False)["value"] + agg = getattr(series_groupby, aggregation) + result = agg() + + assert len(result) == expected_length From 189b63e6f80475dec379deecebfd1917903fd3b8 Mon Sep 17 00:00:00 2001 From: Oliver Hofkens Date: Mon, 18 Nov 2019 16:26:37 +0100 Subject: [PATCH 02/17] TST; Made failing test more complete (GH17605) --- pandas/tests/groupby/test_categorical.py | 32 +++++++++++++----------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 645bce59af47f..fab3c1cee9401 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1246,25 +1246,27 @@ def test_get_nonexistent_category(): ) -@pytest.mark.parametrize("aggregation", [ - "sum", - "mean", - "min", - "count", -]) -def test_series_groupby_on_2_categoricals_unobserved(aggregation): +@pytest.mark.parametrize("aggregation", ["sum", "mean", "min", "count"]) +@pytest.mark.parametrize("observed", [True, False]) +def test_series_groupby_on_2_categoricals_unobserved(aggregation: str, observed: bool): # GH 17605 - df = pd.DataFrame({ - "cat_1": pd.Categorical(list("AABB"), categories=list("ABCD")), - "cat_2": pd.Categorical(list("AB") * 2, categories=list("ABCD")), - "value": [0.1] * 4 - }) + df = pd.DataFrame( + { + "cat_1": pd.Categorical(list("AABB"), categories=list("ABCD")), + "cat_2": pd.Categorical(list("AB") * 2, categories=list("ABCD")), + "value": [0.1] * 4, + } + ) # Expect 1 observation for each combination of categories - expected_length = len(df["cat_1"].cat.categories) * len(df["cat_2"].cat.categories) + if observed: + expected_length = 4 + else: + expected_length = len(df["cat_1"].cat.categories) * len( + df["cat_2"].cat.categories + ) - series_groupby = df.groupby(["cat_1", "cat_2"], observed=False)["value"] + series_groupby = df.groupby(["cat_1", "cat_2"], observed=observed)["value"] agg = getattr(series_groupby, aggregation) result = agg() - assert len(result) == expected_length From 002150e0d5399a2ee7dc500336226ffef18edba5 Mon Sep 17 00:00:00 2001 From: Oliver Hofkens Date: Mon, 18 Nov 2019 16:27:43 +0100 Subject: [PATCH 03/17] BUG: Fix SeriesGroupBy.count() on categoricals when observed=False (GH17605) --- pandas/core/groupby/generic.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 6376dbefcf435..903ba447c910c 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -721,12 +721,13 @@ def count(self) -> Series: minlength = ngroups or 0 out = np.bincount(ids[mask], minlength=minlength) - return Series( + result = Series( out, index=self.grouper.result_index, name=self._selection_name, dtype="int64", ) + return self._reindex_output(result) def _apply_to_column_groupbys(self, func): """ return a pass thru """ From 51ceacee7c018633c1ef94ced0f9306e9cfe2271 Mon Sep 17 00:00:00 2001 From: Oliver Hofkens Date: Mon, 18 Nov 2019 16:31:56 +0100 Subject: [PATCH 04/17] CLN: Fixed import style --- pandas/tests/groupby/test_categorical.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index fab3c1cee9401..c93c2b87e9ed7 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -6,8 +6,15 @@ import pandas as pd import pandas.util.testing as tm -from pandas import (Categorical, CategoricalIndex, DataFrame, Index, - MultiIndex, Series, qcut) +from pandas import ( + Categorical, + CategoricalIndex, + DataFrame, + Index, + MultiIndex, + Series, + qcut, +) from pandas.compat import PY37 From 9bf02972481f8789e7c7ec65dcf1c368172ff711 Mon Sep 17 00:00:00 2001 From: Oliver Hofkens Date: Mon, 18 Nov 2019 16:37:26 +0100 Subject: [PATCH 05/17] CLN: Corrected isort --- pandas/tests/groupby/test_categorical.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index c93c2b87e9ed7..8931a42bd8fdd 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -4,8 +4,9 @@ import numpy as np import pytest +from pandas.compat import PY37 + import pandas as pd -import pandas.util.testing as tm from pandas import ( Categorical, CategoricalIndex, @@ -15,7 +16,7 @@ Series, qcut, ) -from pandas.compat import PY37 +import pandas.util.testing as tm def cartesian_product_for_groupers(result, args, names): From 9a99af16425317a7d7a74e14a351e281d6a936c3 Mon Sep 17 00:00:00 2001 From: Oliver Hofkens Date: Mon, 18 Nov 2019 16:47:48 +0100 Subject: [PATCH 06/17] DOC: Added whatsnew entry. --- doc/source/whatsnew/v1.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index cb68bd0e762c4..b95562da7b647 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -438,6 +438,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrameGroupby.agg` not able to use lambda function with named aggregation (:issue:`27519`) - Bug in :meth:`DataFrame.groupby` losing column name information when grouping by a categorical column (:issue:`28787`) - Bug in :meth:`DataFrameGroupBy.rolling().quantile()` ignoring ``interpolation`` keyword argument (:issue:`28779`) +- Bug in :meth:`SeriesGroupBy.count` missing unobserved categories when ``observed=False`` (:issue:`17605`) Reshaping ^^^^^^^^^ From ef6a1ffbf797f8e31c2e427c21d666d651f9f190 Mon Sep 17 00:00:00 2001 From: Oliver Hofkens Date: Tue, 19 Nov 2019 10:57:57 +0100 Subject: [PATCH 07/17] TST: Improved and expanded test cases (GH17605) --- pandas/tests/groupby/test_categorical.py | 73 ++++++++++++++++++++---- 1 file changed, 62 insertions(+), 11 deletions(-) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 8931a42bd8fdd..a9c6735eac432 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1254,10 +1254,15 @@ def test_get_nonexistent_category(): ) -@pytest.mark.parametrize("aggregation", ["sum", "mean", "min", "count"]) @pytest.mark.parametrize("observed", [True, False]) -def test_series_groupby_on_2_categoricals_unobserved(aggregation: str, observed: bool): +def test_series_groupby_on_2_categoricals_unobserved( + reduction_func: str, observed: bool +): # GH 17605 + + if reduction_func == "ngroup": + pytest.skip("ngroup is not truly a reduction") + df = pd.DataFrame( { "cat_1": pd.Categorical(list("AABB"), categories=list("ABCD")), @@ -1265,16 +1270,62 @@ def test_series_groupby_on_2_categoricals_unobserved(aggregation: str, observed: "value": [0.1] * 4, } ) + args = {"nth": [0]}.get(reduction_func, []) - # Expect 1 observation for each combination of categories - if observed: - expected_length = 4 - else: - expected_length = len(df["cat_1"].cat.categories) * len( - df["cat_2"].cat.categories - ) + expected_length = 4 if observed else 16 series_groupby = df.groupby(["cat_1", "cat_2"], observed=observed)["value"] - agg = getattr(series_groupby, aggregation) - result = agg() + agg = getattr(series_groupby, reduction_func) + result = agg(*args) + assert len(result) == expected_length + + +@pytest.mark.parametrize("func, zero_or_nan", [ + ("all", np.NaN), + ("any", np.NaN), + ("count", 0), + ("first", np.NaN), + ("idxmax", np.NaN), + ("idxmin", np.NaN), + ("last", np.NaN), + ("mad", np.NaN), + ("max", np.NaN), + ("mean", np.NaN), + ("median", np.NaN), + ("min", np.NaN), + ("nth", np.NaN), + ("nunique", 0), + ("prod", np.NaN), + ("quantile", np.NaN), + ("sem", np.NaN), + ("size", 0), + ("skew", np.NaN), + ("std", np.NaN), + ("sum", np.NaN), + ("var", np.NaN), +]) +def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(func, zero_or_nan): + # GH 17605 + # Tests whether the unobserved categories in the result contain 0 or NaN + df = pd.DataFrame( + { + "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")), + "cat_2": pd.Categorical(list("AB") * 2, categories=list("ABC")), + "value": [0.1] * 4, + } + ) + unobserved = [tuple("AC"), tuple("BC"), tuple("CA"), tuple("CB"), tuple("CC")] + args = {"nth": [0]}.get(func, []) + + series_groupby = df.groupby(["cat_1", "cat_2"], observed=False)["value"] + agg = getattr(series_groupby, func) + result = agg(*args) + + for idx in unobserved: + val = result.loc[idx] + assert (pd.isna(zero_or_nan) and pd.isna(val)) or (val == zero_or_nan) + + # If we expect unobserved values to be zero, we also expect the dtype to be int + if zero_or_nan == 0: + assert np.issubdtype(result.dtype, np.integer) From 82b1a94e6bbb2fbc41f253b4e7d03acb94000009 Mon Sep 17 00:00:00 2001 From: Oliver Hofkens Date: Tue, 19 Nov 2019 11:01:18 +0100 Subject: [PATCH 08/17] BUG: Fixed nunique,count,size,nth not respecting observed=False --- pandas/core/groupby/generic.py | 5 +++-- pandas/core/groupby/groupby.py | 15 +++++++++++---- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 903ba447c910c..f5ec763db390f 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -569,7 +569,8 @@ def nunique(self, dropna: bool = True) -> Series: res, out = np.zeros(len(ri), dtype=out.dtype), res res[ids[idx]] = out - return Series(res, index=ri, name=self._selection_name) + result = Series(res, index=ri, name=self._selection_name) + return self._reindex_output(result, fill_value=0) @Appender(Series.describe.__doc__) def describe(self, **kwargs): @@ -727,7 +728,7 @@ def count(self) -> Series: name=self._selection_name, dtype="int64", ) - return self._reindex_output(result) + return self._reindex_output(result, fill_value=0) def _apply_to_column_groupbys(self, func): """ return a pass thru """ diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 236df4b3854a4..3b7f3a39c3932 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1296,7 +1296,7 @@ def size(self): if isinstance(self.obj, Series): result.name = self.obj.name - return result + return self._reindex_output(result, fill_value=0) @classmethod def _add_numeric_operations(cls): @@ -1743,6 +1743,7 @@ def nth(self, n: Union[int, List[int]], dropna: Optional[str] = None) -> DataFra if not self.observed and isinstance(result_index, CategoricalIndex): out = out.reindex(result_index) + out = self._reindex_output(out) return out.sort_index() if self.sort else out # dropna is truthy @@ -2383,7 +2384,7 @@ def tail(self, n=5): mask = self._cumcount_array(ascending=False) < n return self._selected_obj[mask] - def _reindex_output(self, output): + def _reindex_output(self, output, fill_value=np.NaN): """ If we have categorical groupers, then we might want to make sure that we have a fully re-indexed output to the levels. This means expanding @@ -2429,7 +2430,11 @@ def _reindex_output(self, output): ).sortlevel() if self.as_index: - d = {self.obj._get_axis_name(self.axis): index, "copy": False} + d = { + self.obj._get_axis_name(self.axis): index, + "copy": False, + "fill_value": fill_value, + } return output.reindex(**d) # GH 13204 @@ -2451,7 +2456,9 @@ def _reindex_output(self, output): output = output.drop(labels=list(g_names), axis=1) # Set a temp index and reindex (possibly expanding) - output = output.set_index(self.grouper.result_index).reindex(index, copy=False) + output = output.set_index(self.grouper.result_index).reindex( + index, copy=False, fill_value=fill_value + ) # Reset in-axis grouper columns # (using level numbers `g_nums` because level names may not be unique) From 7a3c2e524f8c59f1fe9bf57f940800e321b79471 Mon Sep 17 00:00:00 2001 From: Oliver Hofkens Date: Tue, 19 Nov 2019 11:04:18 +0100 Subject: [PATCH 09/17] DOC: Updated whatsnew --- doc/source/whatsnew/v1.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index b95562da7b647..42b8c415dfb4c 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -438,7 +438,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrameGroupby.agg` not able to use lambda function with named aggregation (:issue:`27519`) - Bug in :meth:`DataFrame.groupby` losing column name information when grouping by a categorical column (:issue:`28787`) - Bug in :meth:`DataFrameGroupBy.rolling().quantile()` ignoring ``interpolation`` keyword argument (:issue:`28779`) -- Bug in :meth:`SeriesGroupBy.count` missing unobserved categories when ``observed=False`` (:issue:`17605`) +- Bug in :meth:`SeriesGroupBy.count`, :meth:`SeriesGroupBy.size`, :meth:`SeriesGroupBy.nunique` and :meth:`SeriesGroupBy.nth` missing unobserved categories when ``observed=False`` (:issue:`17605`) Reshaping ^^^^^^^^^ From 07d69e6f6b7f69e8da29af1cf4a2ed968dcd1319 Mon Sep 17 00:00:00 2001 From: Oliver Hofkens Date: Tue, 19 Nov 2019 12:42:50 +0100 Subject: [PATCH 10/17] CLN: Style fix with Black --- pandas/tests/groupby/test_categorical.py | 51 +++++++++++++----------- 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index a9c6735eac432..e2993839c3604 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1281,30 +1281,33 @@ def test_series_groupby_on_2_categoricals_unobserved( assert len(result) == expected_length -@pytest.mark.parametrize("func, zero_or_nan", [ - ("all", np.NaN), - ("any", np.NaN), - ("count", 0), - ("first", np.NaN), - ("idxmax", np.NaN), - ("idxmin", np.NaN), - ("last", np.NaN), - ("mad", np.NaN), - ("max", np.NaN), - ("mean", np.NaN), - ("median", np.NaN), - ("min", np.NaN), - ("nth", np.NaN), - ("nunique", 0), - ("prod", np.NaN), - ("quantile", np.NaN), - ("sem", np.NaN), - ("size", 0), - ("skew", np.NaN), - ("std", np.NaN), - ("sum", np.NaN), - ("var", np.NaN), -]) +@pytest.mark.parametrize( + "func, zero_or_nan", + [ + ("all", np.NaN), + ("any", np.NaN), + ("count", 0), + ("first", np.NaN), + ("idxmax", np.NaN), + ("idxmin", np.NaN), + ("last", np.NaN), + ("mad", np.NaN), + ("max", np.NaN), + ("mean", np.NaN), + ("median", np.NaN), + ("min", np.NaN), + ("nth", np.NaN), + ("nunique", 0), + ("prod", np.NaN), + ("quantile", np.NaN), + ("sem", np.NaN), + ("size", 0), + ("skew", np.NaN), + ("std", np.NaN), + ("sum", np.NaN), + ("var", np.NaN), + ], +) def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(func, zero_or_nan): # GH 17605 # Tests whether the unobserved categories in the result contain 0 or NaN From f76859725dd8e73e62320b5110a8ba18870d487c Mon Sep 17 00:00:00 2001 From: Oliver Hofkens Date: Tue, 19 Nov 2019 15:24:22 +0100 Subject: [PATCH 11/17] TST: Removed redundant 'observed' fixture. --- pandas/tests/groupby/test_categorical.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index e2993839c3604..5f78e4860f1e9 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1254,7 +1254,6 @@ def test_get_nonexistent_category(): ) -@pytest.mark.parametrize("observed", [True, False]) def test_series_groupby_on_2_categoricals_unobserved( reduction_func: str, observed: bool ): From 1ebe0d598b4355b618b1e16180f4a9f45f5596af Mon Sep 17 00:00:00 2001 From: Oliver Hofkens Date: Tue, 19 Nov 2019 15:25:01 +0100 Subject: [PATCH 12/17] DOC: Add type annotations and docstring to _reindex_output. --- pandas/core/groupby/groupby.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 3b7f3a39c3932..94b54e27bf923 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -19,6 +19,7 @@ class providing the base-class of operations. import numpy as np from pandas._config.config import option_context +from pandas._typing import Scalar from pandas._libs import Timestamp import pandas._libs.groupby as libgroupby @@ -2384,7 +2385,9 @@ def tail(self, n=5): mask = self._cumcount_array(ascending=False) < n return self._selected_obj[mask] - def _reindex_output(self, output, fill_value=np.NaN): + def _reindex_output( + self, output: Union[Series, DataFrame], fill_value: Scalar = np.NaN + ) -> Union[Series, DataFrame]: """ If we have categorical groupers, then we might want to make sure that we have a fully re-indexed output to the levels. This means expanding @@ -2400,6 +2403,8 @@ def _reindex_output(self, output, fill_value=np.NaN): ---------- output: Series or DataFrame Object resulting from grouping and applying an operation. + fill_value: scalar, default np.NaN + Value to use for unobserved categories if self.observed is False. Returns ------- From 210d2037e5c627a911c59ff73a5fcd0ae4715311 Mon Sep 17 00:00:00 2001 From: Oliver Hofkens Date: Tue, 19 Nov 2019 15:25:53 +0100 Subject: [PATCH 13/17] CLN: Fixed import sorting --- pandas/core/groupby/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 94b54e27bf923..22e61e4433e2b 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -19,7 +19,6 @@ class providing the base-class of operations. import numpy as np from pandas._config.config import option_context -from pandas._typing import Scalar from pandas._libs import Timestamp import pandas._libs.groupby as libgroupby @@ -40,6 +39,7 @@ class providing the base-class of operations. ) from pandas.core.dtypes.missing import isna, notna +from pandas._typing import Scalar from pandas.core import nanops import pandas.core.algorithms as algorithms from pandas.core.arrays import Categorical, try_cast_to_ea From 9ce7fa14eb22db766cd41aea6ade4332a8b5a51b Mon Sep 17 00:00:00 2001 From: Oliver Hofkens Date: Tue, 19 Nov 2019 16:34:00 +0100 Subject: [PATCH 14/17] DOC: Added api change subsection to whatsnew. --- doc/source/whatsnew/v1.0.0.rst | 42 +++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 42b8c415dfb4c..e6c4ca9cd71c9 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -183,6 +183,47 @@ New repr for :class:`pandas.core.arrays.IntervalArray` pd.arrays.IntervalArray.from_tuples([(0, 1), (2, 3)]) + +All :class:`SeriesGroupBy` aggregation methods now respect the ``observed`` keyword +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +The following methods now also correctly output values for unobserved categories when called through ``groupby(..., observed=False)`` (:issue:`17605`) + +- :meth:`SeriesGroupBy.count` +- :meth:`SeriesGroupBy.size` +- :meth:`SeriesGroupBy.nunique` +- :meth:`SeriesGroupBy.nth` + +.. ipython:: python + + df = pd.DataFrame({ + "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")), + "cat_2": pd.Categorical(list("AB") * 2, categories=list("ABC")), + "value": [0.1] * 4, + }) + df + + +*pandas 0.25.x* + +.. code-block:: ipython + + In [2]: df.groupby(["cat_1", "cat_2"], observed=False)["value"].count() + Out[2]: + cat_1 cat_2 + A A 1 + B 1 + B A 1 + B 1 + Name: value, dtype: int64 + + +*pandas 1.0.0* + +.. ipython:: python + + df.groupby(["cat_1", "cat_2"], observed=False)["value"].count() + + .. _whatsnew_1000.api.other: Other API changes @@ -438,7 +479,6 @@ Groupby/resample/rolling - Bug in :meth:`DataFrameGroupby.agg` not able to use lambda function with named aggregation (:issue:`27519`) - Bug in :meth:`DataFrame.groupby` losing column name information when grouping by a categorical column (:issue:`28787`) - Bug in :meth:`DataFrameGroupBy.rolling().quantile()` ignoring ``interpolation`` keyword argument (:issue:`28779`) -- Bug in :meth:`SeriesGroupBy.count`, :meth:`SeriesGroupBy.size`, :meth:`SeriesGroupBy.nunique` and :meth:`SeriesGroupBy.nth` missing unobserved categories when ``observed=False`` (:issue:`17605`) Reshaping ^^^^^^^^^ From 8858be57ce91b54d30fdb627629729d6218bae03 Mon Sep 17 00:00:00 2001 From: Oliver Hofkens Date: Tue, 19 Nov 2019 16:39:37 +0100 Subject: [PATCH 15/17] CLN: Minor docstring fix after validation. --- pandas/core/groupby/groupby.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 22e61e4433e2b..51526df94ffcf 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2401,9 +2401,9 @@ def _reindex_output( Parameters ---------- - output: Series or DataFrame + output : Series or DataFrame Object resulting from grouping and applying an operation. - fill_value: scalar, default np.NaN + fill_value : scalar, default np.NaN Value to use for unobserved categories if self.observed is False. Returns From d8e27750873a4362cfc9b79fb567af568d0b0da9 Mon Sep 17 00:00:00 2001 From: Oliver Hofkens Date: Tue, 19 Nov 2019 17:11:54 +0100 Subject: [PATCH 16/17] DOC: Fixed rst linting error. --- doc/source/whatsnew/v1.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index e6c4ca9cd71c9..75d47938f983a 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -195,7 +195,7 @@ The following methods now also correctly output values for unobserved categories .. ipython:: python - df = pd.DataFrame({ + df = pd.DataFrame({ "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")), "cat_2": pd.Categorical(list("AB") * 2, categories=list("ABC")), "value": [0.1] * 4, From 995d6588fcae7686e2f4ae2b8d1f0206d23e6488 Mon Sep 17 00:00:00 2001 From: Oliver Hofkens Date: Wed, 20 Nov 2019 08:46:14 +0100 Subject: [PATCH 17/17] DOC: Fixed typing for _reindex_output. --- pandas/core/groupby/groupby.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 51526df94ffcf..62bbb151f793e 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -39,7 +39,7 @@ class providing the base-class of operations. ) from pandas.core.dtypes.missing import isna, notna -from pandas._typing import Scalar +from pandas._typing import FrameOrSeries, Scalar from pandas.core import nanops import pandas.core.algorithms as algorithms from pandas.core.arrays import Categorical, try_cast_to_ea @@ -2386,8 +2386,8 @@ def tail(self, n=5): return self._selected_obj[mask] def _reindex_output( - self, output: Union[Series, DataFrame], fill_value: Scalar = np.NaN - ) -> Union[Series, DataFrame]: + self, output: FrameOrSeries, fill_value: Scalar = np.NaN + ) -> FrameOrSeries: """ If we have categorical groupers, then we might want to make sure that we have a fully re-indexed output to the levels. This means expanding