Skip to content

Commit 557dfd5

Browse files
committed
WIP
1 parent e2dfa71 commit 557dfd5

File tree

5 files changed

+56
-13
lines changed

5 files changed

+56
-13
lines changed

pandas/core/algorithms.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -733,12 +733,16 @@ def factorize(
733733

734734
if not isinstance(values.dtype, np.dtype):
735735
# i.e. ExtensionDtype
736-
# assert dropna or sort
736+
# TODO: pass ignore_na=dropna. When sort is True we'll call safe_sort below
737+
# but it doesn't handle nulls; as such we want to use ignore_na here and
738+
# append nulls on the end if necessary. Better would be having safe_sort
739+
# handle nulls.
737740
codes, uniques = values.factorize(
738741
na_sentinel=na_sentinel, ignore_na=dropna or sort
739742
)
740743
else:
741744
values = np.asarray(values) # convert DTA/TDA/MultiIndex
745+
# TODO: pass ignore_na=dropna; see above
742746
codes, uniques = factorize_array(
743747
values,
744748
na_sentinel=na_sentinel,
@@ -752,6 +756,7 @@ def factorize(
752756
)
753757

754758
if not dropna and sort:
759+
# TODO: Can remove if we pass ignore_na=dropna; see above
755760
code_is_na = codes == na_sentinel
756761
if code_is_na.any():
757762
# na_value is set based on the dtype of uniques, and compat set to False is

pandas/core/arrays/datetimelike.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1878,7 +1878,6 @@ def _with_freq(self, freq):
18781878

18791879
# --------------------------------------------------------------
18801880

1881-
# TODO: Fix?
18821881
def factorize(self, na_sentinel=-1, sort: bool = False, ignore_na: bool = True):
18831882
if self.freq is not None:
18841883
# We must be unique, so can short-circuit (and retain freq)

pandas/core/arrays/masked.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -881,17 +881,21 @@ def factorize(
881881
# check that factorize_array correctly preserves dtype.
882882
assert uniques.dtype == self.dtype.numpy_dtype, (uniques.dtype, self.dtype)
883883

884-
size = len(uniques) if ignore_na else len(uniques) + 1
884+
# Make room for a null value if we're not ignoring it and it exists
885+
size = len(uniques) if ignore_na or not mask.any() else len(uniques) + 1
885886
uniques_mask = np.zeros(size, dtype=bool)
886887
if not ignore_na:
887888
na_index = mask.argmax()
888889
if mask[na_index]:
889890
# Insert na with the proper code
890-
# TODO: This only works with na_sentinel being -1
891-
na_code = codes[:na_index].argmax() + 1
892-
codes[codes >= na_code] += 1
891+
na_code = 0 if na_index == 0 else codes[:na_index].argmax() + 1
892+
if na_sentinel < 0:
893+
# codes can never equal na_sentinel and be >= na_code
894+
codes[codes >= na_code] += 1
895+
else:
896+
codes[(codes >= na_code) & (codes != na_sentinel)] += 1
893897
codes[codes == na_sentinel] = na_code
894-
# dummy value for uniques
898+
# dummy value for uniques; not used since uniques_mask will be True
895899
uniques = np.insert(uniques, na_code, 0)
896900
uniques_mask[na_code] = True
897901
uniques_ea = type(self)(uniques, uniques_mask)

pandas/core/arrays/sparse/array.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -855,7 +855,9 @@ def factorize(
855855
# ExtensionArray.factorize -> Tuple[EA, EA]
856856
# Given that we have to return a dense array of codes, why bother
857857
# implementing an efficient factorize?
858-
codes, uniques = algos.factorize(np.asarray(self), na_sentinel=na_sentinel)
858+
codes, uniques = algos.factorize(
859+
np.asarray(self), na_sentinel=na_sentinel, ignore_na=ignore_na
860+
)
859861
uniques_sp = SparseArray(uniques, dtype=self.dtype)
860862
return codes, uniques_sp
861863

pandas/tests/groupby/test_groupby_dropna.py

Lines changed: 38 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -350,10 +350,43 @@ def test_groupby_nan_included():
350350
assert list(result.keys())[0:2] == ["g1", "g2"]
351351

352352

353-
def test_no_sort_keep_na():
354-
df = pd.DataFrame({"a": ["x", "y", None, "z"], "b": [1, 2, 3, 4]})
355-
gb = df.groupby("a", dropna=False, sort=False)
353+
@pytest.mark.parametrize(
354+
"index",
355+
[
356+
pd.Index([2, np.nan, 1, 2]),
357+
pd.Index([2, np.nan, 1, 2], dtype="UInt8"),
358+
pd.Index([2, np.nan, 1, 2], dtype="Int8"),
359+
pd.Index([2, np.nan, 1, 2], dtype="UInt16"),
360+
pd.Index([2, np.nan, 1, 2], dtype="Int16"),
361+
pd.Index([2, np.nan, 1, 2], dtype="UInt32"),
362+
pd.Index([2, np.nan, 1, 2], dtype="Int32"),
363+
pd.Index([2, np.nan, 1, 2], dtype="UInt64"),
364+
pd.Index([2, np.nan, 1, 2], dtype="Int64"),
365+
pd.Index([2, np.nan, 1, 2], dtype="Float32"),
366+
pd.Index([2, np.nan, 1, 2], dtype="Int64"),
367+
pd.Index([2, np.nan, 1, 2], dtype="Float64"),
368+
pd.Index(["y", np.nan, "x", "y"], dtype="category"),
369+
pd.Index(["y", pd.NA, "x", "y"], dtype="string"),
370+
pd.Index(["y", pd.NA, "x", "y"], dtype="string[pyarrow]"),
371+
pd.Index(
372+
["2016-01-01", np.datetime64("NaT"), "2017-01-01", "2016-01-01"],
373+
dtype="datetime64[ns]",
374+
),
375+
pd.Index(
376+
[
377+
pd.Period("2012-02-01", freq="D"),
378+
pd.NA,
379+
pd.Period("2012-01-01", freq="D"),
380+
pd.Period("2012-02-01", freq="D"),
381+
]
382+
),
383+
pd.Index(pd.arrays.SparseArray([2, np.nan, 1, 2])),
384+
],
385+
)
386+
def test_no_sort_keep_na(index):
387+
index.name = "key"
388+
df = pd.DataFrame({"a": [1, 2, 3, 4]}, index=index)
389+
gb = df.groupby("key", dropna=False, sort=False)
356390
result = gb.sum()
357-
expected = pd.DataFrame({"b": [1, 2, 3, 4]}, index=["x", "y", None, "z"])
358-
expected.index.names = ["a"]
391+
expected = pd.DataFrame({"a": [5, 2, 3]}, index=index[:-1])
359392
tm.assert_frame_equal(result, expected)

0 commit comments

Comments (0)