Skip to content

TST: fix test_empty_groupby xfails #51075

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Feb 1, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
151 changes: 49 additions & 102 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1858,6 +1858,7 @@ def test_pivot_table_values_key_error():
Categorical([0]),
[to_datetime(0)],
date_range(0, 1, 1, tz="US/Eastern"),
pd.period_range("2016-01-01", periods=3, freq="D"),
pd.array([0], dtype="Int64"),
pd.array([0], dtype="Float64"),
pd.array([False], dtype="boolean"),
Expand All @@ -1870,6 +1871,7 @@ def test_pivot_table_values_key_error():
"cat",
"dt64",
"dt64tz",
"period",
"Int64",
"Float64",
"boolean",
Expand All @@ -1886,13 +1888,6 @@ def test_empty_groupby(
override_dtype = None

if (
isinstance(values, Categorical)
and not isinstance(columns, list)
and op in ["sum", "prod", "skew"]
):
# handled below GH#41291
pass
elif (
isinstance(values, Categorical)
and len(keys) == 1
and op in ["idxmax", "idxmin"]
Expand All @@ -1901,18 +1896,8 @@ def test_empty_groupby(
raises=ValueError, match="attempt to get arg(min|max) of an empty sequence"
)
request.node.add_marker(mark)
elif isinstance(values, Categorical) and len(keys) == 1 and op in ["sum", "prod"]:
mark = pytest.mark.xfail(
raises=AssertionError, match="(DataFrame|Series) are different"
)
request.node.add_marker(mark)
elif isinstance(values, Categorical) and len(keys) == 2 and op in ["sum"]:
mark = pytest.mark.xfail(
raises=AssertionError, match="(DataFrame|Series) are different"
)
request.node.add_marker(mark)

elif isinstance(values, BooleanArray) and op in ["sum", "prod"]:
if isinstance(values, BooleanArray) and op in ["sum", "prod"]:
# We expect to get Int64 back for these
override_dtype = "Int64"

Expand All @@ -1936,6 +1921,26 @@ def get_result(**kwargs):
else:
return getattr(gb, method)(op, **kwargs)

def get_categorical_invalid_expected():
    """
    Build the expected frame for Categorical values when every column is dropped.

    Reads ``values`` and ``keys`` from the enclosing test scope.

    Returns
    -------
    DataFrame
        A frame with no columns whose index is built from a one-element
        Categorical sharing ``values.dtype``: a plain Index named after the
        single key, or a two-level MultiIndex named after ``keys`` otherwise.
    """
    # Without 'observed=True' a Categorical grouper is special: the result
    # carries an NaN entry for the unobserved group.  Had observed=True been
    # passed to groupby, the expectation would simply be
    # 'df.set_index(keys)[columns]'.
    level = Categorical([0], dtype=values.dtype)

    if len(keys) == 1:
        # every column is dropped, yet a single row remains for the
        # unobserved category
        group_index = Index(level, name=keys[0])
    else:
        group_index = MultiIndex.from_product([level, level], names=keys)

    return DataFrame([], columns=[], index=group_index)

is_per = isinstance(df.dtypes[0], pd.PeriodDtype)
is_dt64 = df.dtypes[0].kind == "M"
is_cat = isinstance(values, Categorical)

if isinstance(values, Categorical) and not values.ordered and op in ["min", "max"]:
msg = f"Cannot perform {op} with non-ordered Categorical"
with pytest.raises(TypeError, match=msg):
Expand All @@ -1944,105 +1949,47 @@ def get_result(**kwargs):
if isinstance(columns, list):
# i.e. DataFrameGroupBy, not SeriesGroupBy
result = get_result(numeric_only=True)

# Categorical is special without 'observed=True', we get an NaN entry
# corresponding to the unobserved group. If we passed observed=True
# to groupby, expected would just be 'df.set_index(keys)[columns]'
# as below
lev = Categorical([0], dtype=values.dtype)
if len(keys) != 1:
idx = MultiIndex.from_product([lev, lev], names=keys)
else:
# all columns are dropped, but we end up with one row
# Categorical is special without 'observed=True'
idx = Index(lev, name=keys[0])

expected = DataFrame([], columns=[], index=idx)
expected = get_categorical_invalid_expected()
tm.assert_equal(result, expected)
return

if columns == "C":
# i.e. SeriesGroupBy
if op in ["prod", "sum", "skew"]:
# ops that require more than just ordered-ness
if df.dtypes[0].kind == "M":
# GH#41291
# datetime64 -> prod and sum are invalid
if op == "skew":
msg = "does not support reduction 'skew'"
else:
msg = "datetime64 type does not support"
with pytest.raises(TypeError, match=msg):
get_result()

return
if op in ["prod", "sum", "skew"]:
if isinstance(values, Categorical):
# GH#41291
if op == "skew":
msg = f"does not support reduction '{op}'"
else:
msg = "category type does not support"
with pytest.raises(TypeError, match=msg):
get_result()
if op in ["prod", "sum", "skew"]:
# ops that require more than just ordered-ness
if is_dt64 or is_cat or is_per:
# GH#41291
# datetime64, Categorical and Period -> prod, sum and skew are invalid
if op == "skew":
msg = "does not support reduction 'skew'"
elif is_dt64:
msg = "datetime64 type does not support"
elif is_per:
msg = "Period type does not support"
else:
msg = "category type does not support"
with pytest.raises(TypeError, match=msg):
get_result()

if not isinstance(columns, list):
# i.e. SeriesGroupBy
return
else:
# i.e. DataFrameGroupBy
if op in ["prod", "sum"]:
# ops that require more than just ordered-ness
if df.dtypes[0].kind == "M":
# GH#41291
# datetime64 -> prod and sum are invalid
with pytest.raises(TypeError, match="datetime64 type does not support"):
get_result()
result = get_result(numeric_only=True)

# with numeric_only=True, these are dropped, and we get
# an empty DataFrame back
expected = df.set_index(keys)[[]]
tm.assert_equal(result, expected)
elif op == "skew":
# TODO: test the numeric_only=True case
return

elif isinstance(values, Categorical):
else:
# i.e. op in ["prod", "sum"]:
# i.e. DataFrameGroupBy
# ops that require more than just ordered-ness
# GH#41291
# Categorical doesn't implement sum or prod
with pytest.raises(TypeError, match="category type does not support"):
get_result()
result = get_result(numeric_only=True)

# with numeric_only=True, these are dropped, and we get
# an empty DataFrame back
expected = df.set_index(keys)[[]]
if len(keys) != 1 and op == "prod":
# TODO: why just prod and not sum?
# Categorical is special without 'observed=True'
lev = Categorical([0], dtype=values.dtype)
mi = MultiIndex.from_product([lev, lev], names=["A", "B"])
expected = DataFrame([], columns=[], index=mi)

tm.assert_equal(result, expected)
return

elif df.dtypes[0] == object:
result = get_result()
expected = df.set_index(keys)[["C"]]
if is_cat:
expected = get_categorical_invalid_expected()
tm.assert_equal(result, expected)
return

if op == "skew" and (
isinstance(values, Categorical) or df.dtypes[0].kind == "M"
):
msg = "|".join(
[
"Categorical is not ordered",
"does not support reduction",
]
)
with pytest.raises(TypeError, match=msg):
get_result()
return

result = get_result()
expected = df.set_index(keys)[columns]
if override_dtype is not None:
Expand Down