Skip to content

ENH: support mask in libalgos.rank #46932

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
May 4, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pandas/_libs/algos.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ def rank_1d(
ascending: bool = ...,
pct: bool = ...,
na_option=...,
mask: npt.NDArray[np.bool_] | None = ...,
) -> np.ndarray: ... # np.ndarray[float64_t, ndim=1]
def rank_2d(
in_arr: np.ndarray, # ndarray[numeric_object_t, ndim=2]
Expand Down
9 changes: 7 additions & 2 deletions pandas/_libs/algos.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -889,6 +889,7 @@ def rank_1d(
bint ascending=True,
bint pct=False,
na_option="keep",
const uint8_t[:] mask=None,
):
"""
Fast NaN-friendly version of ``scipy.stats.rankdata``.
Expand Down Expand Up @@ -918,6 +919,8 @@ def rank_1d(
* keep: leave NA values where they are
* top: smallest rank if ascending
* bottom: smallest rank if descending
mask : np.ndarray[bool], optional, default None
Specify locations to be treated as NA, for e.g. Categorical.
"""
cdef:
TiebreakEnumType tiebreak
Expand All @@ -927,7 +930,6 @@ def rank_1d(
float64_t[::1] out
ndarray[numeric_object_t, ndim=1] masked_vals
numeric_object_t[:] masked_vals_memview
uint8_t[:] mask
bint keep_na, nans_rank_highest, check_labels, check_mask
numeric_object_t nan_fill_val

Expand Down Expand Up @@ -956,6 +958,7 @@ def rank_1d(
or numeric_object_t is object
or (numeric_object_t is int64_t and is_datetimelike)
)
check_mask = check_mask or mask is not None

# Copy values into new array in order to fill missing data
# with mask, without obfuscating location of missing data
Expand All @@ -965,7 +968,9 @@ def rank_1d(
else:
masked_vals = values.copy()

if numeric_object_t is object:
if mask is not None:
pass
elif numeric_object_t is object:
mask = missing.isnaobj(masked_vals)
elif numeric_object_t is int64_t and is_datetimelike:
mask = (masked_vals == NPY_NAT).astype(np.uint8)
Expand Down
1 change: 1 addition & 0 deletions pandas/_libs/groupby.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ def group_rank(
ascending: bool = ...,
pct: bool = ...,
na_option: Literal["keep", "top", "bottom"] = ...,
mask: npt.NDArray[np.bool_] | None = ...,
) -> None: ...
def group_max(
out: np.ndarray, # groupby_t[:, ::1]
Expand Down
11 changes: 10 additions & 1 deletion pandas/_libs/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1262,6 +1262,7 @@ def group_rank(
bint ascending=True,
bint pct=False,
str na_option="keep",
const uint8_t[:, :] mask=None,
) -> None:
"""
Provides the rank of values within each group.
Expand Down Expand Up @@ -1294,6 +1295,7 @@ def group_rank(
* keep: leave NA values where they are
* top: smallest rank if ascending
* bottom: smallest rank if descending
mask : np.ndarray[bool] or None, default None

Notes
-----
Expand All @@ -1302,18 +1304,25 @@ def group_rank(
cdef:
Py_ssize_t i, k, N
ndarray[float64_t, ndim=1] result
const uint8_t[:] sub_mask

N = values.shape[1]

for k in range(N):
if mask is None:
sub_mask = None
else:
sub_mask = mask[:, k]

result = rank_1d(
values=values[:, k],
labels=labels,
is_datetimelike=is_datetimelike,
ties_method=ties_method,
ascending=ascending,
pct=pct,
na_option=na_option
na_option=na_option,
mask=sub_mask,
)
for i in range(len(result)):
# TODO: why can't we do out[:, k] = result?
Expand Down
45 changes: 40 additions & 5 deletions pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@
ensure_platform_int,
is_1d_only_ea_dtype,
is_bool_dtype,
is_categorical_dtype,
is_complex_dtype,
is_datetime64_any_dtype,
is_float_dtype,
Expand All @@ -56,12 +55,14 @@
is_timedelta64_dtype,
needs_i8_conversion,
)
from pandas.core.dtypes.dtypes import CategoricalDtype
from pandas.core.dtypes.missing import (
isna,
maybe_fill,
)

from pandas.core.arrays import (
Categorical,
DatetimeArray,
ExtensionArray,
PeriodArray,
Expand Down Expand Up @@ -142,7 +143,15 @@ def __init__(self, kind: str, how: str, has_dropped_na: bool) -> None:

# "group_any" and "group_all" are also support masks, but don't go
# through WrappedCythonOp
_MASKED_CYTHON_FUNCTIONS = {"cummin", "cummax", "min", "max", "last", "first"}
_MASKED_CYTHON_FUNCTIONS = {
"cummin",
"cummax",
"min",
"max",
"last",
"first",
"rank",
}

_cython_arity = {"ohlc": 4} # OHLC

Expand Down Expand Up @@ -229,12 +238,17 @@ def _disallow_invalid_ops(self, dtype: DtypeObj, is_numeric: bool = False):
# never an invalid op for those dtypes, so return early as fastpath
return

if is_categorical_dtype(dtype):
if isinstance(dtype, CategoricalDtype):
# NotImplementedError for methods that can fall back to a
# non-cython implementation.
if how in ["add", "prod", "cumsum", "cumprod"]:
raise TypeError(f"{dtype} type does not support {how} operations")
raise NotImplementedError(f"{dtype} dtype not supported")
elif how not in ["rank"]:
# only "rank" is implemented in cython
raise NotImplementedError(f"{dtype} dtype not supported")
elif not dtype.ordered:
# TODO: TypeError?
raise NotImplementedError(f"{dtype} dtype not supported")

elif is_sparse(dtype):
# categoricals are only 1d, so we
Expand Down Expand Up @@ -332,6 +346,25 @@ def _ea_wrap_cython_operation(
**kwargs,
)

elif isinstance(values, Categorical) and self.uses_mask():
assert self.how == "rank" # the only one implemented ATM
assert values.ordered # checked earlier
mask = values.isna()
npvalues = values._ndarray

res_values = self._cython_op_ndim_compat(
npvalues,
min_count=min_count,
ngroups=ngroups,
comp_ids=comp_ids,
mask=mask,
**kwargs,
)

# If we ever have more than just "rank" here, we'll need to do
# `if self.how in self.cast_blocklist` like we do for other dtypes.
return res_values

npvalues = self._ea_to_cython_values(values)

res_values = self._cython_op_ndim_compat(
Expand Down Expand Up @@ -551,14 +584,16 @@ def _call_cython_op(
else:
# TODO: min_count
if self.uses_mask():
if self.how != "rank":
# TODO: should rank take result_mask?
kwargs["result_mask"] = result_mask
func(
out=result,
values=values,
labels=comp_ids,
ngroups=ngroups,
is_datetimelike=is_datetimelike,
mask=mask,
result_mask=result_mask,
**kwargs,
)
else:
Expand Down
16 changes: 16 additions & 0 deletions pandas/tests/groupby/test_rank.py
Original file line number Diff line number Diff line change
Expand Up @@ -458,6 +458,8 @@ def test_rank_avg_even_vals(dtype, upper):

result = df.groupby("key").rank()
exp_df = DataFrame([2.5, 2.5, 2.5, 2.5], columns=["val"])
if upper:
exp_df = exp_df.astype("Float64")
tm.assert_frame_equal(result, exp_df)


Expand Down Expand Up @@ -663,3 +665,17 @@ def test_non_unique_index():
name="value",
)
tm.assert_series_equal(result, expected)


def test_rank_categorical():
cat = pd.Categorical(["a", "a", "b", np.nan, "c", "b"], ordered=True)
cat2 = pd.Categorical([1, 2, 3, np.nan, 4, 5], ordered=True)

df = DataFrame({"col1": [0, 1, 0, 1, 0, 1], "col2": cat, "col3": cat2})

gb = df.groupby("col1")

res = gb.rank()

expected = df.astype(object).groupby("col1").rank()
tm.assert_frame_equal(res, expected)