-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
API: replace dropna=False option with na_sentinel=None in factorize #35852
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
7e461a1
1314059
8bcb313
24c3ede
dea38f2
cd9e7ac
e5e912b
045a76f
a61367b
2c451a9
31ffca7
32d029d
ba93eb6
2330908
b9850a2
3a18c65
9d7f1e6
97fd2e6
68527ef
817905c
364aeae
7cd0cce
2368223
8ca0652
b452513
344c072
1a5c358
81a0a7e
4607953
a7d4abd
3ef1459
fca7300
f0a6556
c81e79e
37ca034
4f0f226
5fcabe7
b7cd915
8a2a1f7
e0c7342
0480d9f
5c87cd1
b70e595
076fc10
c945457
e6c7434
7533ab0
bf8641a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -526,9 +526,8 @@ def _factorize_array( | |
def factorize( | ||
values, | ||
sort: bool = False, | ||
na_sentinel: int = -1, | ||
na_sentinel: Optional[int] = -1, | ||
size_hint: Optional[int] = None, | ||
dropna: bool = True, | ||
) -> Tuple[np.ndarray, Union[np.ndarray, ABCIndex]]: | ||
""" | ||
Encode the object as an enumerated type or categorical variable. | ||
|
@@ -541,8 +540,11 @@ def factorize( | |
Parameters | ||
---------- | ||
{values}{sort} | ||
na_sentinel : int, default -1 | ||
Value to mark "not found". | ||
na_sentinel : int or None, default -1 | ||
Value to mark "not found". If None, will not drop the NaN | ||
from the uniques of the values. | ||
|
||
.. versionchanged:: 1.1.2 | ||
{size_hint}\ | ||
|
||
Returns | ||
|
@@ -620,6 +622,22 @@ def factorize( | |
array([0, 0, 1]...) | ||
>>> uniques | ||
Index(['a', 'c'], dtype='object') | ||
|
||
If NaN is in the values, and we want to include NaN in the uniques of the | ||
values, it can be achieved by setting ``na_sentinel=None``. | ||
|
||
>>> values = np.array([1, 2, 1, np.nan]) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you add the same example above this as well, but without setting `na_sentinel`? |
||
>>> codes, uniques = pd.factorize(values) # default: na_sentinel=-1 | ||
>>> codes | ||
array([ 0, 1, 0, -1]) | ||
>>> uniques | ||
array([1., 2.]) | ||
|
||
>>> codes, uniques = pd.factorize(values, na_sentinel=None) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ideally put a blank line, as otherwise this is very hard to read. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Done! |
||
>>> codes | ||
array([0, 1, 0, 2]) | ||
>>> uniques | ||
array([ 1., 2., nan]) | ||
""" | ||
# Implementation notes: This method is responsible for 3 things | ||
# 1.) coercing data to array-like (ndarray, Index, extension array) | ||
|
@@ -633,6 +651,13 @@ def factorize( | |
values = _ensure_arraylike(values) | ||
original = values | ||
|
||
# GH35667, if na_sentinel=None, we will not dropna NaNs from the uniques | ||
# of values, assign na_sentinel=-1 to replace code value for NaN. | ||
dropna = True | ||
if na_sentinel is None: | ||
na_sentinel = -1 | ||
dropna = False | ||
|
||
if is_extension_array_dtype(values.dtype): | ||
values = extract_array(values) | ||
codes, uniques = values.factorize(na_sentinel=na_sentinel) | ||
|
Uh oh!
There was an error while loading. Please reload this page.