pandas-dev · jreback · Jul 8, 2020 · Jun 26, 2020 · Jun 26, 2020 · Jun 27, 2020
diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
@@ -1263,7 +1263,6 @@ def test_series_groupby_on_2_categoricals_unobserved(
     reduction_func: str, observed: bool, request
 ):
     # GH 17605
-
     if reduction_func == "ngroup":
         pytest.skip("ngroup is not truly a reduction")
 
@@ -1289,12 +1288,12 @@ def test_series_groupby_on_2_categoricals_unobserved(
     assert len(result) == expected_length
 
 
-@pytest.mark.parametrize(
-    "func, zero_or_nan",
+_results_for_groupbys_with_missing_categories = dict(
     [
         ("all", np.NaN),
         ("any", np.NaN),
         ("count", 0),
+        ("corrwith", np.NaN),
         ("first", np.NaN),
         ("idxmax", np.NaN),
         ("idxmin", np.NaN),
@@ -1312,13 +1311,34 @@ def test_series_groupby_on_2_categoricals_unobserved(
         ("size", 0),
         ("skew", np.NaN),
         ("std", np.NaN),
-        ("sum", np.NaN),
+        ("sum", 0),
         ("var", np.NaN),
-    ],
+    ]
 )
-def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(func, zero_or_nan):
+
+
+def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(
+    reduction_func: str, request
+):
     # GH 17605
     # Tests whether the unobserved categories in the result contain 0 or NaN
+
+    if reduction_func == "ngroup":
+        pytest.skip("ngroup is not truly a reduction")
+
+    if reduction_func == "corrwith":  # GH 32293
+        mark = pytest.mark.xfail(reason="TODO: implemented SeriesGroupBy.corrwith")
+        request.node.add_marker(mark)
+
+    if reduction_func == "sum":  # GH 31422
+        mark = pytest.mark.xfail(
+            reason=(
+                "sum should return 0 but currently returns NaN. "
+                "This is a known bug. See GH 31422."
+            )
+        )
+        request.node.add_marker(mark)
+
     df = pd.DataFrame(
         {
             "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")),
@@ -1327,12 +1347,14 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(func, zero_o
         }
     )
     unobserved = [tuple("AC"), tuple("BC"), tuple("CA"), tuple("CB"), tuple("CC")]
-    args = {"nth": [0]}.get(func, [])
+    args = {"nth": [0]}.get(reduction_func, [])
 
     series_groupby = df.groupby(["cat_1", "cat_2"], observed=False)["value"]
-    agg = getattr(series_groupby, func)
+    agg = getattr(series_groupby, reduction_func)
     result = agg(*args)
 
+    zero_or_nan = _results_for_groupbys_with_missing_categories[reduction_func]
+
     for idx in unobserved:
         val = result.loc[idx]
         assert (pd.isna(zero_or_nan) and pd.isna(val)) or (val == zero_or_nan)
@@ -1342,6 +1364,83 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(func, zero_o
         assert np.issubdtype(result.dtype, np.integer)
 
 
+def test_dataframe_groupby_on_2_categoricals_when_observed_is_true(reduction_func: str):
+    # GH 23865, GH 27075
+    # Grouping a DataFrame by two pd.Categorical columns with observed=True
+    # must keep category combinations that never occur in the data out of the
+    # index of the result.
+    if reduction_func == "ngroup":
+        pytest.skip("ngroup does not return the Categories on the index")
+
+    result, absent_pairs = _dataframe_groupby_on_2_categoricals(
+        reduction_func, observed=True
+    )
+    group_index = result.index
+
+    assert all(pair not in group_index for pair in absent_pairs)
+
+
+def _dataframe_groupby_on_2_categoricals(reduction_func: str, observed: bool):
+    """Reduce a two-categorical groupby; return (result, never-observed pairs)."""
+    frame = pd.DataFrame(
+        {
+            "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")),
+            "cat_2": pd.Categorical(list("1111"), categories=list("12")),
+            "value": [0.1, 0.1, 0.1, 0.1],
+        }
+    )
+    grouped = frame.groupby(["cat_1", "cat_2"], observed=observed)
+
+    # Only "nth" and "corrwith" are given a positional argument by this helper.
+    extra_args_by_func = {"nth": [0], "corrwith": [frame]}
+    reducer = getattr(grouped, reduction_func)
+    result = reducer(*extra_args_by_func.get(reduction_func, []))
+
+    return result, [("A", "2"), ("B", "2"), ("C", "1"), ("C", "2")]
+
+
+@pytest.mark.parametrize("observed", [False, None])
+def test_dataframe_groupby_on_2_categoricals_when_observed_is_false(
+    reduction_func: str, observed: bool, request
+):
+    # GH 23865
+    # GH 27075
+    # Ensure that df.groupby, when 'by' is two pd.Categorical variables,
+    # returns the categories that are not in df when observed=False/None,
+    # holding the value expected for the reduction (0 or NaN).
+    if reduction_func == "ngroup":
+        pytest.skip("ngroup does not return the Categories on the index")
+
+    if reduction_func == "count":  # GH 35028
+        mark = pytest.mark.xfail(
+            reason=(
+                "DataFrameGroupBy.count returns np.NaN for missing "
+                "categories, when it should return 0. See GH 35028"
+            )
+        )
+        request.node.add_marker(mark)
+
+    if reduction_func == "sum":  # GH 31422
+        mark = pytest.mark.xfail(
+            reason=(
+                "sum should return 0 but currently returns NaN. "
+                "This is a known bug. See GH 31422."
+            )
+        )
+        request.node.add_marker(mark)
+
+    res, unobserved_cats = _dataframe_groupby_on_2_categoricals(
+        reduction_func, observed
+    )
+
+    expected = _results_for_groupbys_with_missing_categories[reduction_func]
+    # NaN != NaN, so detect a NaN expectation with isna() instead of identity/==.
+    if pd.isna(expected):
+        assert res.loc[unobserved_cats].isnull().all().all()
+    else:
+        assert (res.loc[unobserved_cats] == expected).all().all()
+
+
 def test_series_groupby_categorical_aggregation_getitem():
     # GH 8870
     d = {"foo": [10, 8, 4, 1], "bar": [10, 20, 30, 40], "baz": ["d", "c", "d", "c"]}