Skip to content

CLN: Clean groupby/test_function.py #32027

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
Feb 20, 2020
223 changes: 134 additions & 89 deletions pandas/tests/groupby/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,26 @@
from pandas.util import _test_decorators as td


@pytest.fixture(
params=[np.int32, np.int64, np.float32, np.float64],
ids=["np.int32", "np.int64", "np.float32", "np.float64"],
)
def numpy_dtypes_for_minmax(request):
"""
Fixture of numpy dtypes with min and max values used for testing
cummin and cummax
"""
dtype = request.param
min_val = (
np.iinfo(dtype).min if np.dtype(dtype).kind == "i" else np.finfo(dtype).min
)
max_val = (
np.iinfo(dtype).max if np.dtype(dtype).kind == "i" else np.finfo(dtype).max
)

return (dtype, min_val, max_val)


@pytest.mark.parametrize("agg_func", ["any", "all"])
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.parametrize(
Expand Down Expand Up @@ -174,11 +194,10 @@ def test_arg_passthru():
)

for attr in ["mean", "median"]:
f = getattr(df.groupby("group"), attr)
result = f()
result = getattr(df.groupby("group"), attr)()
tm.assert_index_equal(result.columns, expected_columns_numeric)

result = f(numeric_only=False)
result = getattr(df.groupby("group"), attr)(numeric_only=False)
tm.assert_frame_equal(result.reindex_like(expected), expected)

# TODO: min, max *should* handle
Expand All @@ -195,11 +214,10 @@ def test_arg_passthru():
]
)
for attr in ["min", "max"]:
f = getattr(df.groupby("group"), attr)
result = f()
result = getattr(df.groupby("group"), attr)()
tm.assert_index_equal(result.columns, expected_columns)

result = f(numeric_only=False)
result = getattr(df.groupby("group"), attr)(numeric_only=False)
tm.assert_index_equal(result.columns, expected_columns)

expected_columns = Index(
Expand All @@ -215,52 +233,47 @@ def test_arg_passthru():
]
)
for attr in ["first", "last"]:
f = getattr(df.groupby("group"), attr)
result = f()
result = getattr(df.groupby("group"), attr)()
tm.assert_index_equal(result.columns, expected_columns)

result = f(numeric_only=False)
result = getattr(df.groupby("group"), attr)(numeric_only=False)
tm.assert_index_equal(result.columns, expected_columns)

expected_columns = Index(["int", "float", "string", "category_int", "timedelta"])
for attr in ["sum"]:
f = getattr(df.groupby("group"), attr)
result = f()
tm.assert_index_equal(result.columns, expected_columns_numeric)

result = f(numeric_only=False)
tm.assert_index_equal(result.columns, expected_columns)
result = df.groupby("group").sum()
tm.assert_index_equal(result.columns, expected_columns_numeric)

result = df.groupby("group").sum(numeric_only=False)
tm.assert_index_equal(result.columns, expected_columns)

expected_columns = Index(["int", "float", "category_int"])
for attr in ["prod", "cumprod"]:
f = getattr(df.groupby("group"), attr)
result = f()
result = getattr(df.groupby("group"), attr)()
tm.assert_index_equal(result.columns, expected_columns_numeric)

result = f(numeric_only=False)
result = getattr(df.groupby("group"), attr)(numeric_only=False)
tm.assert_index_equal(result.columns, expected_columns)

# like min, max, but don't include strings
expected_columns = Index(
["int", "float", "category_int", "datetime", "datetimetz", "timedelta"]
)
for attr in ["cummin", "cummax"]:
f = getattr(df.groupby("group"), attr)
result = f()
result = getattr(df.groupby("group"), attr)()
# GH 15561: numeric_only=False set by default like min/max
tm.assert_index_equal(result.columns, expected_columns)

result = f(numeric_only=False)
result = getattr(df.groupby("group"), attr)(numeric_only=False)
tm.assert_index_equal(result.columns, expected_columns)

expected_columns = Index(["int", "float", "category_int", "timedelta"])
for attr in ["cumsum"]:
f = getattr(df.groupby("group"), attr)
result = f()
tm.assert_index_equal(result.columns, expected_columns_numeric)

result = f(numeric_only=False)
tm.assert_index_equal(result.columns, expected_columns)
result = getattr(df.groupby("group"), "cumsum")()
tm.assert_index_equal(result.columns, expected_columns_numeric)

result = getattr(df.groupby("group"), "cumsum")(numeric_only=False)
tm.assert_index_equal(result.columns, expected_columns)


def test_non_cython_api():
Expand Down Expand Up @@ -691,59 +704,31 @@ def test_numpy_compat(func):
reason="https://github.com/pandas-dev/pandas/issues/31992",
strict=False,
)
def test_cummin_cummax():
def test_cummin(numpy_dtypes_for_minmax):
dtype = numpy_dtypes_for_minmax[0]
min_val = numpy_dtypes_for_minmax[1]

# GH 15048
num_types = [np.int32, np.int64, np.float32, np.float64]
num_mins = [
np.iinfo(np.int32).min,
np.iinfo(np.int64).min,
np.finfo(np.float32).min,
np.finfo(np.float64).min,
]
num_max = [
np.iinfo(np.int32).max,
np.iinfo(np.int64).max,
np.finfo(np.float32).max,
np.finfo(np.float64).max,
]
base_df = pd.DataFrame(
{"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]}
)
expected_mins = [3, 3, 3, 2, 2, 2, 2, 1]
expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3]

for dtype, min_val, max_val in zip(num_types, num_mins, num_max):
df = base_df.astype(dtype)
df = base_df.astype(dtype)

# cummin
expected = pd.DataFrame({"B": expected_mins}).astype(dtype)
result = df.groupby("A").cummin()
tm.assert_frame_equal(result, expected)
result = df.groupby("A").B.apply(lambda x: x.cummin()).to_frame()
tm.assert_frame_equal(result, expected)

# Test cummin w/ min value for dtype
df.loc[[2, 6], "B"] = min_val
expected.loc[[2, 3, 6, 7], "B"] = min_val
result = df.groupby("A").cummin()
tm.assert_frame_equal(result, expected)
expected = df.groupby("A").B.apply(lambda x: x.cummin()).to_frame()
tm.assert_frame_equal(result, expected)

# cummax
expected = pd.DataFrame({"B": expected_maxs}).astype(dtype)
result = df.groupby("A").cummax()
tm.assert_frame_equal(result, expected)
result = df.groupby("A").B.apply(lambda x: x.cummax()).to_frame()
tm.assert_frame_equal(result, expected)
expected = pd.DataFrame({"B": expected_mins}).astype(dtype)
result = df.groupby("A").cummin()
tm.assert_frame_equal(result, expected)
result = df.groupby("A").B.apply(lambda x: x.cummin()).to_frame()
tm.assert_frame_equal(result, expected)

# Test cummax w/ max value for dtype
df.loc[[2, 6], "B"] = max_val
expected.loc[[2, 3, 6, 7], "B"] = max_val
result = df.groupby("A").cummax()
tm.assert_frame_equal(result, expected)
expected = df.groupby("A").B.apply(lambda x: x.cummax()).to_frame()
tm.assert_frame_equal(result, expected)
# Test w/ min value for dtype
df.loc[[2, 6], "B"] = min_val
expected.loc[[2, 3, 6, 7], "B"] = min_val
result = df.groupby("A").cummin()
tm.assert_frame_equal(result, expected)
expected = df.groupby("A").B.apply(lambda x: x.cummin()).to_frame()
tm.assert_frame_equal(result, expected)

# Test nan in some values
base_df.loc[[0, 2, 4, 6], "B"] = np.nan
Expand All @@ -753,41 +738,101 @@ def test_cummin_cummax():
expected = base_df.groupby("A").B.apply(lambda x: x.cummin()).to_frame()
tm.assert_frame_equal(result, expected)

expected = pd.DataFrame({"B": [np.nan, 4, np.nan, 4, np.nan, 3, np.nan, 3]})
result = base_df.groupby("A").cummax()
tm.assert_frame_equal(result, expected)
expected = base_df.groupby("A").B.apply(lambda x: x.cummax()).to_frame()
tm.assert_frame_equal(result, expected)
# GH 15561
df = pd.DataFrame(dict(a=[1], b=pd.to_datetime(["2001"])))
expected = pd.Series(pd.to_datetime("2001"), index=[0], name="b")

result = df.groupby("a")["b"].cummin()
tm.assert_series_equal(expected, result)

# GH 15635
df = pd.DataFrame(dict(a=[1, 2, 1], b=[1, 2, 2]))
result = df.groupby("a").b.cummin()
expected = pd.Series([1, 2, 1], name="b")
tm.assert_series_equal(result, expected)


@pytest.mark.xfail(
_is_numpy_dev,
reason="https://github.com/pandas-dev/pandas/issues/31992",
strict=False,
)
def test_cummin_all_nan_column():
base_df = pd.DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [np.nan] * 8})

# Test nan in entire column
base_df["B"] = np.nan
expected = pd.DataFrame({"B": [np.nan] * 8})
result = base_df.groupby("A").cummin()
tm.assert_frame_equal(expected, result)
result = base_df.groupby("A").B.apply(lambda x: x.cummin()).to_frame()
tm.assert_frame_equal(expected, result)


@pytest.mark.xfail(
_is_numpy_dev,
reason="https://github.com/pandas-dev/pandas/issues/31992",
strict=False,
)
def test_cummax(numpy_dtypes_for_minmax):
dtype = numpy_dtypes_for_minmax[0]
max_val = numpy_dtypes_for_minmax[2]

# GH 15048
base_df = pd.DataFrame(
{"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]}
)
expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3]

df = base_df.astype(dtype)

expected = pd.DataFrame({"B": expected_maxs}).astype(dtype)
result = df.groupby("A").cummax()
tm.assert_frame_equal(result, expected)
result = df.groupby("A").B.apply(lambda x: x.cummax()).to_frame()
tm.assert_frame_equal(result, expected)

# Test w/ max value for dtype
df.loc[[2, 6], "B"] = max_val
expected.loc[[2, 3, 6, 7], "B"] = max_val
result = df.groupby("A").cummax()
tm.assert_frame_equal(result, expected)
expected = df.groupby("A").B.apply(lambda x: x.cummax()).to_frame()
tm.assert_frame_equal(result, expected)

# Test nan in some values
base_df.loc[[0, 2, 4, 6], "B"] = np.nan
expected = pd.DataFrame({"B": [np.nan, 4, np.nan, 4, np.nan, 3, np.nan, 3]})
result = base_df.groupby("A").cummax()
tm.assert_frame_equal(expected, result)
result = base_df.groupby("A").B.apply(lambda x: x.cummax()).to_frame()
tm.assert_frame_equal(expected, result)
tm.assert_frame_equal(result, expected)
expected = base_df.groupby("A").B.apply(lambda x: x.cummax()).to_frame()
tm.assert_frame_equal(result, expected)

# GH 15561
df = pd.DataFrame(dict(a=[1], b=pd.to_datetime(["2001"])))
expected = pd.Series(pd.to_datetime("2001"), index=[0], name="b")
for method in ["cummax", "cummin"]:
result = getattr(df.groupby("a")["b"], method)()
tm.assert_series_equal(expected, result)

result = df.groupby("a")["b"].cummax()
tm.assert_series_equal(expected, result)

# GH 15635
df = pd.DataFrame(dict(a=[1, 2, 1], b=[2, 1, 1]))
result = df.groupby("a").b.cummax()
expected = pd.Series([2, 1, 2], name="b")
tm.assert_series_equal(result, expected)

df = pd.DataFrame(dict(a=[1, 2, 1], b=[1, 2, 2]))
result = df.groupby("a").b.cummin()
expected = pd.Series([1, 2, 1], name="b")
tm.assert_series_equal(result, expected)

@pytest.mark.xfail(
_is_numpy_dev,
reason="https://github.com/pandas-dev/pandas/issues/31992",
strict=False,
)
def test_cummax_all_nan_column():
base_df = pd.DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [np.nan] * 8})

expected = pd.DataFrame({"B": [np.nan] * 8})
result = base_df.groupby("A").cummax()
tm.assert_frame_equal(expected, result)
result = base_df.groupby("A").B.apply(lambda x: x.cummax()).to_frame()
tm.assert_frame_equal(expected, result)


@pytest.mark.parametrize(
Expand Down