From f717a7ea152ba4455ac7b1be41e2a7a6449b5488 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Fri, 26 Jun 2020 21:37:50 +0100 Subject: [PATCH 1/5] tests for dataframe.groupby with 2 Categoricals --- pandas/tests/groupby/test_categorical.py | 135 +++++++++++++++++------ 1 file changed, 104 insertions(+), 31 deletions(-) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index ff35ec04952b1..165b35f43352a 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1259,11 +1259,100 @@ def test_get_nonexistent_category(): ) +def test_dataframe_groupby_on_2_categoricals_when_observed_is_true( + reduction_func:str): + + if reduction_func == 'ngroup': + pytest.skip("ngroup does not return the Categories on the index") + + res, unobserved_cats = _dataframe_groupby_on_2_categoricals( + reduction_func, observed=True) + + for cat in unobserved_cats: + assert cat not in res.index + + +def _dataframe_groupby_on_2_categoricals(reduction_func:str, observed:bool): + + df = pd.DataFrame({ + "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")), + "cat_2": pd.Categorical(list("1111"), categories=list("12")), + "value": [.1, .1, .1, .1] + }) + unobserved_cats = [ + ('A', '2'), + ('B', '2'), + ('C', '1'), + ('C', '2') + ] + + df_grp = df.groupby(['cat_1', 'cat_2'], observed=observed) + + args = { + 'nth' : [0], + 'corrwith' : [df] + }.get(reduction_func, []) + res = getattr(df_grp, reduction_func)(*args) + + return res, unobserved_cats + + +_results_for_groupbys_with_missing_categories = dict([ + ("all", np.NaN), + ("any", np.NaN), + ("count", 0), + ("corrwith", np.NaN), + ("first", np.NaN), + ("idxmax", np.NaN), + ("idxmin", np.NaN), + ("last", np.NaN), + ("mad", np.NaN), + ("max", np.NaN), + ("mean", np.NaN), + ("median", np.NaN), + ("min", np.NaN), + ("nth", np.NaN), + ("nunique", 0), + ("prod", np.NaN), + ("quantile", np.NaN), + ("sem", np.NaN), + ("size", 0), + ("skew", np.NaN), + ("std", np.NaN), + ("sum", np.NaN), + ("var", np.NaN), +]) + + +@pytest.mark.parametrize('observed', [False, None]) +def test_dataframe_groupby_on_2_categoricals_when_observed_is_false( + reduction_func:str, observed:bool, request): + + if reduction_func == 'ngroup': + pytest.skip("ngroup does not return the Categories on the index") + + if reduction_func == 'count': + mark = pytest.mark.xfail( + reason=("DataFrameGroupBy.count returns np.NaN for missing " + "categories, when it should return 0")) + request.node.add_marker(mark) + + res, unobserved_cats = _dataframe_groupby_on_2_categoricals( + reduction_func, observed) + + expected = _results_for_groupbys_with_missing_categories[reduction_func] + + if expected is np.nan: + assert res.loc[unobserved_cats].isnull().all().all() + else: + assert (res.loc[unobserved_cats] == expected).all().all() + + + def test_series_groupby_on_2_categoricals_unobserved( reduction_func: str, observed: bool, request ): # GH 17605 - if reduction_func == "ngroup": pytest.skip("ngroup is not truly a reduction") @@ -1289,36 +1378,18 @@ def test_series_groupby_on_2_categoricals_unobserved( assert len(result) == expected_length -@pytest.mark.parametrize( - "func, zero_or_nan", - [ - ("all", np.NaN), - ("any", np.NaN), - ("count", 0), - ("first", np.NaN), - ("idxmax", np.NaN), - ("idxmin", np.NaN), - ("last", np.NaN), - ("mad", np.NaN), - ("max", np.NaN), - ("mean", np.NaN), - ("median", np.NaN), - ("min", np.NaN), - ("nth", np.NaN), - ("nunique", 0), - ("prod", np.NaN), - ("quantile", np.NaN), - ("sem", np.NaN), - ("size", 0), - ("skew", np.NaN), - ("std", np.NaN), - ("sum", np.NaN), - ("var", np.NaN), - ], -) -def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(func, zero_or_nan): +def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans( + reduction_func:str, request): # GH 17605 # Tests whether the unobserved categories in the result contain 0 or NaN + + if reduction_func == "ngroup": + pytest.skip("ngroup is not truly a reduction") + + if reduction_func == "corrwith": # GH 32293 + mark = pytest.mark.xfail(reason="TODO: implemented SeriesGroupBy.corrwith") + request.node.add_marker(mark) + df = pd.DataFrame( { "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")), @@ -1327,11 +1398,13 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(func, zero_o } ) unobserved = [tuple("AC"), tuple("BC"), tuple("CA"), tuple("CB"), tuple("CC")] - args = {"nth": [0]}.get(func, []) + args = {"nth": [0]}.get(reduction_func, []) series_groupby = df.groupby(["cat_1", "cat_2"], observed=False)["value"] - agg = getattr(series_groupby, func) + agg = getattr(series_groupby, reduction_func) result = agg(*args) + + zero_or_nan = _results_for_groupbys_with_missing_categories[reduction_func] for idx in unobserved: val = result.loc[idx] From 248c191478aabacffec47105c3fe6caf22b0dcc1 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Fri, 26 Jun 2020 21:44:51 +0100 Subject: [PATCH 2/5] black --- pandas/tests/groupby/test_categorical.py | 99 ++++++++++++------------ 1 file changed, 50 insertions(+), 49 deletions(-) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 165b35f43352a..7493f3fad34f1 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1259,45 +1259,40 @@ def test_get_nonexistent_category(): ) -def test_dataframe_groupby_on_2_categoricals_when_observed_is_true( - reduction_func:str): +def test_dataframe_groupby_on_2_categoricals_when_observed_is_true(reduction_func: str): - if reduction_func == 'ngroup': + if reduction_func == "ngroup": pytest.skip("ngroup does not return the Categories on the index") res, unobserved_cats = _dataframe_groupby_on_2_categoricals( - reduction_func, observed=True) - + reduction_func, observed=True + ) + for cat in unobserved_cats: - assert cat not in res.index - - -def _dataframe_groupby_on_2_categoricals(reduction_func:str, observed:bool): - - df = pd.DataFrame({ - "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")), - "cat_2": pd.Categorical(list("1111"), categories=list("12")), - "value": [.1, .1, .1, .1] - }) - unobserved_cats = [ - ('A', '2'), - ('B', '2'), - ('C', '1'), - ('C', '2') - ] - - df_grp = df.groupby(['cat_1', 'cat_2'], observed=observed) - - args = { - 'nth' : [0], - 'corrwith' : [df] - }.get(reduction_func, []) + assert cat not in res.index + + +def _dataframe_groupby_on_2_categoricals(reduction_func: str, observed: bool): + + df = pd.DataFrame( + { + "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")), + "cat_2": pd.Categorical(list("1111"), categories=list("12")), + "value": [0.1, 0.1, 0.1, 0.1], + } + ) + unobserved_cats = [("A", "2"), ("B", "2"), ("C", "1"), ("C", "2")] + + df_grp = df.groupby(["cat_1", "cat_2"], observed=observed) + + args = {"nth": [0], "corrwith": [df]}.get(reduction_func, []) res = getattr(df_grp, reduction_func)(*args) - + return res, unobserved_cats -_results_for_groupbys_with_missing_categories = dict([ +_results_for_groupbys_with_missing_categories = dict( + [ ("all", np.NaN), ("any", np.NaN), ("count", 0), @@ -1321,33 +1316,38 @@ def _dataframe_groupby_on_2_categoricals(reduction_func:str, observed:bool): ("std", np.NaN), ("sum", np.NaN), ("var", np.NaN), -]) + ] +) -@pytest.mark.parametrize('observed', [False, None]) +@pytest.mark.parametrize("observed", [False, None]) def test_dataframe_groupby_on_2_categoricals_when_observed_is_false( - reduction_func:str, observed:bool, request): - - if reduction_func == 'ngroup': + reduction_func: str, observed: bool, request +): + + if reduction_func == "ngroup": pytest.skip("ngroup does not return the Categories on the index") - - if reduction_func == 'count': + + if reduction_func == "count": mark = pytest.mark.xfail( - reason=("DataFrameGroupBy.count returns np.NaN for missing " - "categories, when it should return 0")) + reason=( + "DataFrameGroupBy.count returns np.NaN for missing " + "categories, when it should return 0" + ) + ) request.node.add_marker(mark) res, unobserved_cats = _dataframe_groupby_on_2_categoricals( - reduction_func, observed) - + reduction_func, observed + ) + expected = _results_for_groupbys_with_missing_categories[reduction_func] - + if expected is np.nan: assert res.loc[unobserved_cats].isnull().all().all() else: assert (res.loc[unobserved_cats] == expected).all().all() - - + def test_series_groupby_on_2_categoricals_unobserved( reduction_func: str, observed: bool, request @@ -1379,17 +1379,18 @@ def test_series_groupby_on_2_categoricals_unobserved( def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans( - reduction_func:str, request): + reduction_func: str, request +): # GH 17605 # Tests whether the unobserved categories in the result contain 0 or NaN - + if reduction_func == "ngroup": pytest.skip("ngroup is not truly a reduction") - + if reduction_func == "corrwith": # GH 32293 mark = pytest.mark.xfail(reason="TODO: implemented SeriesGroupBy.corrwith") request.node.add_marker(mark) - + df = pd.DataFrame( { "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")), @@ -1403,7 +1404,7 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans( series_groupby = df.groupby(["cat_1", "cat_2"], observed=False)["value"] agg = getattr(series_groupby, reduction_func) result = agg(*args) - + zero_or_nan = _results_for_groupbys_with_missing_categories[reduction_func] for idx in unobserved: From 8621e759804ccb202b48594a219bc45cfa081c85 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Sat, 27 Jun 2020 13:07:24 +0100 Subject: [PATCH 3/5] add issue number to test comments --- pandas/tests/groupby/test_categorical.py | 156 +++++++++++------------ 1 file changed, 78 insertions(+), 78 deletions(-) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 7493f3fad34f1..b0d28d4b3cfbe 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1259,36 +1259,33 @@ def test_get_nonexistent_category(): ) -def test_dataframe_groupby_on_2_categoricals_when_observed_is_true(reduction_func: str): - +def test_series_groupby_on_2_categoricals_unobserved( + reduction_func: str, observed: bool, request +): + # GH 17605 if reduction_func == "ngroup": - pytest.skip("ngroup does not return the Categories on the index") - - res, unobserved_cats = _dataframe_groupby_on_2_categoricals( - reduction_func, observed=True - ) - - for cat in unobserved_cats: - assert cat not in res.index - + pytest.skip("ngroup is not truly a reduction") -def _dataframe_groupby_on_2_categoricals(reduction_func: str, observed: bool): + if reduction_func == "corrwith": # GH 32293 + mark = pytest.mark.xfail(reason="TODO: implemented SeriesGroupBy.corrwith") + request.node.add_marker(mark) df = pd.DataFrame( { - "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")), - "cat_2": pd.Categorical(list("1111"), categories=list("12")), - "value": [0.1, 0.1, 0.1, 0.1], + "cat_1": pd.Categorical(list("AABB"), categories=list("ABCD")), + "cat_2": pd.Categorical(list("AB") * 2, categories=list("ABCD")), + "value": [0.1] * 4, } ) - unobserved_cats = [("A", "2"), ("B", "2"), ("C", "1"), ("C", "2")] + args = {"nth": [0]}.get(reduction_func, []) - df_grp = df.groupby(["cat_1", "cat_2"], observed=observed) + expected_length = 4 if observed else 16 - args = {"nth": [0], "corrwith": [df]}.get(reduction_func, []) - res = getattr(df_grp, reduction_func)(*args) + series_groupby = df.groupby(["cat_1", "cat_2"], observed=observed)["value"] + agg = getattr(series_groupby, reduction_func) + result = agg(*args) - return res, unobserved_cats + assert len(result) == expected_length _results_for_groupbys_with_missing_categories = dict( @@ -1320,64 +1317,6 @@ def _dataframe_groupby_on_2_categoricals(reduction_func: str, observed: bool): ) -@pytest.mark.parametrize("observed", [False, None]) -def test_dataframe_groupby_on_2_categoricals_when_observed_is_false( - reduction_func: str, observed: bool, request -): - - if reduction_func == "ngroup": - pytest.skip("ngroup does not return the Categories on the index") - - if reduction_func == "count": - mark = pytest.mark.xfail( - reason=( - "DataFrameGroupBy.count returns np.NaN for missing " - "categories, when it should return 0" - ) - ) - request.node.add_marker(mark) - - res, unobserved_cats = _dataframe_groupby_on_2_categoricals( - reduction_func, observed - ) - - expected = _results_for_groupbys_with_missing_categories[reduction_func] - - if expected is np.nan: - assert res.loc[unobserved_cats].isnull().all().all() - else: - assert (res.loc[unobserved_cats] == expected).all().all() - - -def test_series_groupby_on_2_categoricals_unobserved( - reduction_func: str, observed: bool, request -): - # GH 17605 - if reduction_func == "ngroup": - pytest.skip("ngroup is not truly a reduction") - - if reduction_func == "corrwith": # GH 32293 - mark = pytest.mark.xfail(reason="TODO: implemented SeriesGroupBy.corrwith") - request.node.add_marker(mark) - - df = pd.DataFrame( - { - "cat_1": pd.Categorical(list("AABB"), categories=list("ABCD")), - "cat_2": pd.Categorical(list("AB") * 2, categories=list("ABCD")), - "value": [0.1] * 4, - } - ) - args = {"nth": [0]}.get(reduction_func, []) - - expected_length = 4 if observed else 16 - - series_groupby = df.groupby(["cat_1", "cat_2"], observed=observed)["value"] - agg = getattr(series_groupby, reduction_func) - result = agg(*args) - - assert len(result) == expected_length - - def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans( reduction_func: str, request ): @@ -1416,6 +1355,67 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans( assert np.issubdtype(result.dtype, np.integer) +def test_dataframe_groupby_on_2_categoricals_when_observed_is_true(reduction_func: str): + + if reduction_func == "ngroup": + pytest.skip("ngroup does not return the Categories on the index") + + res, unobserved_cats = _dataframe_groupby_on_2_categoricals( + reduction_func, observed=True + ) + + for cat in unobserved_cats: + assert cat not in res.index + + +def _dataframe_groupby_on_2_categoricals(reduction_func: str, observed: bool): + + df = pd.DataFrame( + { + "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")), + "cat_2": pd.Categorical(list("1111"), categories=list("12")), + "value": [0.1, 0.1, 0.1, 0.1], + } + ) + unobserved_cats = [("A", "2"), ("B", "2"), ("C", "1"), ("C", "2")] + + df_grp = df.groupby(["cat_1", "cat_2"], observed=observed) + + args = {"nth": [0], "corrwith": [df]}.get(reduction_func, []) + res = getattr(df_grp, reduction_func)(*args) + + return res, unobserved_cats + + +@pytest.mark.parametrize("observed", [False, None]) +def test_dataframe_groupby_on_2_categoricals_when_observed_is_false( + reduction_func: str, observed: bool, request +): + + if reduction_func == "ngroup": + pytest.skip("ngroup does not return the Categories on the index") + + if reduction_func == "count": + mark = pytest.mark.xfail( + reason=( + "DataFrameGroupBy.count returns np.NaN for missing " + "categories, when it should return 0" + ) + ) + request.node.add_marker(mark) + + res, unobserved_cats = _dataframe_groupby_on_2_categoricals( + reduction_func, observed + ) + + expected = _results_for_groupbys_with_missing_categories[reduction_func] + + if expected is np.nan: + assert res.loc[unobserved_cats].isnull().all().all() + else: + assert (res.loc[unobserved_cats] == expected).all().all() + + def test_series_groupby_categorical_aggregation_getitem(): # GH 8870 d = {"foo": [10, 8, 4, 1], "bar": [10, 20, 30, 40], "baz": ["d", "c", "d", "c"]} From d1dcc61aafca10d82a2d2b65ec398b6ef54c2315 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Sat, 27 Jun 2020 13:39:20 +0100 Subject: [PATCH 4/5] expected output for .sum() changed from NaN to 0. tests marked with xfail and reference to GH issues. --- pandas/tests/groupby/test_categorical.py | 33 +++++++++++++++++++++--- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index b0d28d4b3cfbe..2e0e51ee1506b 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1311,7 +1311,7 @@ def test_series_groupby_on_2_categoricals_unobserved( ("size", 0), ("skew", np.NaN), ("std", np.NaN), - ("sum", np.NaN), + ("sum", 0), ("var", np.NaN), ] ) @@ -1330,6 +1330,15 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans( mark = pytest.mark.xfail(reason="TODO: implemented SeriesGroupBy.corrwith") request.node.add_marker(mark) + if reduction_func == "sum": # GH 31422 + mark = pytest.mark.xfail( + reason=( + "sum should return 0 but currently returns NaN. " + "This is a known bug. See GH 31422." + ) + ) + request.node.add_marker(mark) + df = pd.DataFrame( { "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")), @@ -1356,7 +1365,10 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans( def test_dataframe_groupby_on_2_categoricals_when_observed_is_true(reduction_func: str): - + # GH 23865 + # GH 27075 + # Ensure that df.groupby, when 'by' is two pd.Categorical variables, + # does not return the categories that are not in df when observed=True if reduction_func == "ngroup": pytest.skip("ngroup does not return the Categories on the index") @@ -1391,15 +1403,28 @@ def _dataframe_groupby_on_2_categoricals(reduction_func: str, observed: bool): def test_dataframe_groupby_on_2_categoricals_when_observed_is_false( reduction_func: str, observed: bool, request ): + # GH 23865 + # GH 27075 + # Ensure that df.groupby, when 'by' is two pd.Categorical variables, + # returns the categories that are not in df when observed=False/None if reduction_func == "ngroup": pytest.skip("ngroup does not return the Categories on the index") - if reduction_func == "count": + if reduction_func == "count": # GH 35028 mark = pytest.mark.xfail( reason=( "DataFrameGroupBy.count returns np.NaN for missing " - "categories, when it should return 0" + "categories, when it should return 0. See GH 35028" + ) + ) + request.node.add_marker(mark) + + if reduction_func == "sum": # GH 31422 + mark = pytest.mark.xfail( + reason=( + "sum should return 0 but currently returns NaN. " + "This is a known bug. See GH 31422." ) ) request.node.add_marker(mark) From bc4a3b82b50cd6225b1188401eb79abdc3e8ac57 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Wed, 1 Jul 2020 01:11:00 +0100 Subject: [PATCH 5/5] responding to PR comments --- pandas/tests/groupby/test_categorical.py | 101 +++++++++++++---------- 1 file changed, 56 insertions(+), 45 deletions(-) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 2e0e51ee1506b..0c0646e73d6db 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -36,6 +36,41 @@ def f(a): return result.reindex(index).sort_index() +_results_for_groupbys_with_missing_categories = dict( + # This maps the builtin groupby functions to their expected outputs for + # missing categories when they are called on a categorical grouper with + # observed=False. Some functions are expected to return NaN, some zero. + # These expected values can be used across several tests (i.e. they are + # the same for SeriesGroupBy and DataFrameGroupBy) but they should only be + # hardcoded in one place. + [ + ("all", np.NaN), + ("any", np.NaN), + ("count", 0), + ("corrwith", np.NaN), + ("first", np.NaN), + ("idxmax", np.NaN), + ("idxmin", np.NaN), + ("last", np.NaN), + ("mad", np.NaN), + ("max", np.NaN), + ("mean", np.NaN), + ("median", np.NaN), + ("min", np.NaN), + ("nth", np.NaN), + ("nunique", 0), + ("prod", np.NaN), + ("quantile", np.NaN), + ("sem", np.NaN), + ("size", 0), + ("skew", np.NaN), + ("std", np.NaN), + ("sum", 0), + ("var", np.NaN), + ] +) + + def test_apply_use_categorical_name(df): cats = qcut(df.C, 4) @@ -1267,7 +1302,9 @@ def test_series_groupby_on_2_categoricals_unobserved( pytest.skip("ngroup is not truly a reduction") if reduction_func == "corrwith": # GH 32293 - mark = pytest.mark.xfail(reason="TODO: implemented SeriesGroupBy.corrwith") + mark = pytest.mark.xfail( + reason="TODO: implemented SeriesGroupBy.corrwith. See GH 32293" + ) request.node.add_marker(mark) df = pd.DataFrame( @@ -1288,35 +1325,6 @@ def test_series_groupby_on_2_categoricals_unobserved( assert len(result) == expected_length -_results_for_groupbys_with_missing_categories = dict( - [ - ("all", np.NaN), - ("any", np.NaN), - ("count", 0), - ("corrwith", np.NaN), - ("first", np.NaN), - ("idxmax", np.NaN), - ("idxmin", np.NaN), - ("last", np.NaN), - ("mad", np.NaN), - ("max", np.NaN), - ("mean", np.NaN), - ("median", np.NaN), - ("min", np.NaN), - ("nth", np.NaN), - ("nunique", 0), - ("prod", np.NaN), - ("quantile", np.NaN), - ("sem", np.NaN), - ("size", 0), - ("skew", np.NaN), - ("std", np.NaN), - ("sum", 0), - ("var", np.NaN), - ] -) - - def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans( reduction_func: str, request ): @@ -1327,7 +1335,9 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans( pytest.skip("ngroup is not truly a reduction") if reduction_func == "corrwith": # GH 32293 - mark = pytest.mark.xfail(reason="TODO: implemented SeriesGroupBy.corrwith") + mark = pytest.mark.xfail( + reason="TODO: implemented SeriesGroupBy.corrwith. See GH 32293" + ) request.node.add_marker(mark) if reduction_func == "sum": # GH 31422 @@ -1372,16 +1382,6 @@ def test_dataframe_groupby_on_2_categoricals_when_observed_is_true(reduction_fun if reduction_func == "ngroup": pytest.skip("ngroup does not return the Categories on the index") - res, unobserved_cats = _dataframe_groupby_on_2_categoricals( - reduction_func, observed=True - ) - - for cat in unobserved_cats: - assert cat not in res.index - - -def _dataframe_groupby_on_2_categoricals(reduction_func: str, observed: bool): - df = pd.DataFrame( { "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")), @@ -1391,12 +1391,13 @@ def _dataframe_groupby_on_2_categoricals(reduction_func: str, observed: bool): ) unobserved_cats = [("A", "2"), ("B", "2"), ("C", "1"), ("C", "2")] - df_grp = df.groupby(["cat_1", "cat_2"], observed=observed) + df_grp = df.groupby(["cat_1", "cat_2"], observed=True) args = {"nth": [0], "corrwith": [df]}.get(reduction_func, []) res = getattr(df_grp, reduction_func)(*args) - return res, unobserved_cats + for cat in unobserved_cats: + assert cat not in res.index @pytest.mark.parametrize("observed", [False, None]) @@ -1429,9 +1430,19 @@ def test_dataframe_groupby_on_2_categoricals_when_observed_is_false( ) request.node.add_marker(mark) - res, unobserved_cats = _dataframe_groupby_on_2_categoricals( - reduction_func, observed + df = pd.DataFrame( + { + "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")), + "cat_2": pd.Categorical(list("1111"), categories=list("12")), + "value": [0.1, 0.1, 0.1, 0.1], + } ) + unobserved_cats = [("A", "2"), ("B", "2"), ("C", "1"), ("C", "2")] + + df_grp = df.groupby(["cat_1", "cat_2"], observed=observed) + + args = {"nth": [0], "corrwith": [df]}.get(reduction_func, []) + res = getattr(df_grp, reduction_func)(*args) expected = _results_for_groupbys_with_missing_categories[reduction_func]