Skip to content

Commit 557dfd5

Browse files
committed
WIP
1 parent e2dfa71 commit 557dfd5

File tree

5 files changed

+56
-13
lines changed

5 files changed

+56
-13
lines changed

pandas/core/algorithms.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -733,12 +733,16 @@ def factorize(
733733

734734
if not isinstance(values.dtype, np.dtype):
735735
# i.e. ExtensionDtype
736-
# assert dropna or sort
736+
# TODO: pass ignore_na=dropna. When sort is True we'll call safe_sort below
737+
# but it doesn't handle nulls; as such we want to use ignore_na here and
738+
# append nulls on the end if necessary. Better would be having safe_sort
739+
# handle nulls.
737740
codes, uniques = values.factorize(
738741
na_sentinel=na_sentinel, ignore_na=dropna or sort
739742
)
740743
else:
741744
values = np.asarray(values) # convert DTA/TDA/MultiIndex
745+
# TODO: pass ignore_na=dropna; see above
742746
codes, uniques = factorize_array(
743747
values,
744748
na_sentinel=na_sentinel,
@@ -752,6 +756,7 @@ def factorize(
752756
)
753757

754758
if not dropna and sort:
759+
# TODO: Can remove if we pass ignore_na=dropna; see above
755760
code_is_na = codes == na_sentinel
756761
if code_is_na.any():
757762
# na_value is set based on the dtype of uniques, and compat set to False is

pandas/core/arrays/datetimelike.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1878,7 +1878,6 @@ def _with_freq(self, freq):
18781878

18791879
# --------------------------------------------------------------
18801880

1881-
# TODO: Fix?
18821881
def factorize(self, na_sentinel=-1, sort: bool = False, ignore_na: bool = True):
18831882
if self.freq is not None:
18841883
# We must be unique, so can short-circuit (and retain freq)

pandas/core/arrays/masked.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -881,17 +881,21 @@ def factorize(
881881
# check that factorize_array correctly preserves dtype.
882882
assert uniques.dtype == self.dtype.numpy_dtype, (uniques.dtype, self.dtype)
883883

884-
size = len(uniques) if ignore_na else len(uniques) + 1
884+
# Make room for a null value if we're not ignoring it and it exists
885+
size = len(uniques) if ignore_na or not mask.any() else len(uniques) + 1
885886
uniques_mask = np.zeros(size, dtype=bool)
886887
if not ignore_na:
887888
na_index = mask.argmax()
888889
if mask[na_index]:
889890
# Insert na with the proper code
890-
# TODO: This only works with na_sentinel being -1
891-
na_code = codes[:na_index].argmax() + 1
892-
codes[codes >= na_code] += 1
891+
na_code = 0 if na_index == 0 else codes[:na_index].argmax() + 1
892+
if na_sentinel < 0:
893+
# codes can never equal na_sentinel and be >= na_code
894+
codes[codes >= na_code] += 1
895+
else:
896+
codes[(codes >= na_code) & (codes != na_sentinel)] += 1
893897
codes[codes == na_sentinel] = na_code
894-
# dummy value for uniques
898+
# dummy value for uniques; not used since uniques_mask will be True
895899
uniques = np.insert(uniques, na_code, 0)
896900
uniques_mask[na_code] = True
897901
uniques_ea = type(self)(uniques, uniques_mask)

pandas/core/arrays/sparse/array.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -855,7 +855,9 @@ def factorize(
855855
# ExtensionArray.factorize -> Tuple[EA, EA]
856856
# Given that we have to return a dense array of codes, why bother
857857
# implementing an efficient factorize?
858-
codes, uniques = algos.factorize(np.asarray(self), na_sentinel=na_sentinel)
858+
codes, uniques = algos.factorize(
859+
np.asarray(self), na_sentinel=na_sentinel, ignore_na=ignore_na
860+
)
859861
uniques_sp = SparseArray(uniques, dtype=self.dtype)
860862
return codes, uniques_sp
861863

pandas/tests/groupby/test_groupby_dropna.py

Lines changed: 38 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -350,10 +350,43 @@ def test_groupby_nan_included():
350350
assert list(result.keys())[0:2] == ["g1", "g2"]
351351

352352

353-
def test_no_sort_keep_na():
354-
df = pd.DataFrame({"a": ["x", "y", None, "z"], "b": [1, 2, 3, 4]})
355-
gb = df.groupby("a", dropna=False, sort=False)
353+
@pytest.mark.parametrize(
354+
"index",
355+
[
356+
pd.Index([2, np.nan, 1, 2]),
357+
pd.Index([2, np.nan, 1, 2], dtype="UInt8"),
358+
pd.Index([2, np.nan, 1, 2], dtype="Int8"),
359+
pd.Index([2, np.nan, 1, 2], dtype="UInt16"),
360+
pd.Index([2, np.nan, 1, 2], dtype="Int16"),
361+
pd.Index([2, np.nan, 1, 2], dtype="UInt32"),
362+
pd.Index([2, np.nan, 1, 2], dtype="Int32"),
363+
pd.Index([2, np.nan, 1, 2], dtype="UInt64"),
364+
pd.Index([2, np.nan, 1, 2], dtype="Int64"),
365+
pd.Index([2, np.nan, 1, 2], dtype="Float32"),
366+
pd.Index([2, np.nan, 1, 2], dtype="Int64"),
367+
pd.Index([2, np.nan, 1, 2], dtype="Float64"),
368+
pd.Index(["y", np.nan, "x", "y"], dtype="category"),
369+
pd.Index(["y", pd.NA, "x", "y"], dtype="string"),
370+
pd.Index(["y", pd.NA, "x", "y"], dtype="string[pyarrow]"),
371+
pd.Index(
372+
["2016-01-01", np.datetime64("NaT"), "2017-01-01", "2016-01-01"],
373+
dtype="datetime64[ns]",
374+
),
375+
pd.Index(
376+
[
377+
pd.Period("2012-02-01", freq="D"),
378+
pd.NA,
379+
pd.Period("2012-01-01", freq="D"),
380+
pd.Period("2012-02-01", freq="D"),
381+
]
382+
),
383+
pd.Index(pd.arrays.SparseArray([2, np.nan, 1, 2])),
384+
],
385+
)
386+
def test_no_sort_keep_na(index):
387+
index.name = "key"
388+
df = pd.DataFrame({"a": [1, 2, 3, 4]}, index=index)
389+
gb = df.groupby("key", dropna=False, sort=False)
356390
result = gb.sum()
357-
expected = pd.DataFrame({"b": [1, 2, 3, 4]}, index=["x", "y", None, "z"])
358-
expected.index.names = ["a"]
391+
expected = pd.DataFrame({"a": [5, 2, 3]}, index=index[:-1])
359392
tm.assert_frame_equal(result, expected)

0 commit comments

Comments (0)