diff --git a/asv_bench/benchmarks/replace.py b/asv_bench/benchmarks/replace.py
index 2a115fb0b4fe3..c09e70915ef81 100644
--- a/asv_bench/benchmarks/replace.py
+++ b/asv_bench/benchmarks/replace.py
@@ -13,10 +13,25 @@ def setup(self, inplace):
         rng = pd.date_range("1/1/2000", periods=N, freq="min")
         data = np.random.randn(N)
         data[::2] = np.nan
-        self.ts = pd.Series(data, index=rng)
+        self.series = pd.Series(data, index=rng)
+        self.ts = pd.Series(rng.to_series())
+        self.ts[::2] = np.datetime64("nat")
+        self.df = pd.DataFrame(np.random.randn(10 ** 3, 10 ** 3))
 
     def time_fillna(self, inplace):
-        self.ts.fillna(0.0, inplace=inplace)
+        self.series.fillna(0.0, inplace=inplace)  # float data, every other value NaN
+
+    def time_fillna_ts(self, inplace):
+        self.ts.fillna(np.datetime64("2021"), inplace=inplace)  # ts: half NaT
+
+    def peakmem_fillna(self, inplace):
+        self.series.fillna(0.0, inplace=inplace)  # same call as time_fillna
+
+    def time_fillna_limit(self, inplace):
+        self.series.fillna(0.0, inplace=inplace, limit=10 ** 5)  # cap fills at 1e5
+
+    def time_fillna_df(self, inplace):
+        self.df.fillna(0.0, inplace=inplace)  # NOTE(review): self.df has no NaNs
 
     def time_replace(self, inplace):
-        self.ts.replace(np.nan, 0.0, inplace=inplace)
+        self.series.replace(np.nan, 0.0, inplace=inplace)  # float series with NaNs
diff --git a/pandas/_libs/algos.pyi b/pandas/_libs/algos.pyi
index d0f664c323a89..6e52872fbf3e8 100644
--- a/pandas/_libs/algos.pyi
+++ b/pandas/_libs/algos.pyi
@@ -106,6 +106,41 @@ def backfill_2d_inplace(
     mask: np.ndarray,  # const uint8_t[:, :]
     limit=None,
 ) -> None: ...
+
+# ----------------------------------------------------------------------
+# Fillna
+# ----------------------------------------------------------------------
+# ctypedef fused fillna_t:
+#     float64_t
+#     float32_t
+#     object
+#     int8_t  # Categorical
+#     int16_t  # Categorical
+#     int32_t  # Categorical
+#     int64_t  # Categorical/Datetime64
+#     uint16_t  # Float 16
+#     complex64_t
+#     complex128_t
+# The per-position fill values of fillna1d_multi_values are typed as
+# algos_t[:] (see algos.pyx).
+def fillna1d(
+    arr: np.ndarray,  # fillna_t[:], modified in place
+    value: object,  # fillna_t
+    limit: int,  # maximum number of entries to fill
+    inf_as_na: bool = False,
+) -> None: ...
+def fillna1d_multi_values(
+    arr: np.ndarray,  # fillna_t[:], modified in place
+    value: np.ndarray,  # algos_t[:]
+    limit: int,  # maximum number of entries to fill
+    inf_as_na: bool = False,
+) -> None: ...
+def fillna2d(
+    arr: np.ndarray,  # fillna_t[:, :], modified in place
+    value: object,  # fillna_t
+    limit: int,  # maximum number of entries to fill per row of arr
+    inf_as_na: bool = False,
+) -> None: ...
 def is_monotonic(
     arr: np.ndarray,  # ndarray[algos_t, ndim=1]
     timelike: bool,
diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
index 4efc30e40654c..c4c29d1130331 100644
--- a/pandas/_libs/algos.pyx
+++ b/pandas/_libs/algos.pyx
@@ -26,6 +26,8 @@ from numpy cimport (
     NPY_UINT16,
     NPY_UINT32,
     NPY_UINT64,
+    complex64_t,
+    complex128_t,
     float32_t,
     float64_t,
     int8_t,
@@ -52,6 +54,10 @@ from pandas._libs.khash cimport (
     kh_resize_int64,
     khiter_t,
 )
+from pandas._libs.missing cimport (
+    checknull,
+    checknull_old,
+)
 from pandas._libs.util cimport (
     get_nat,
     numeric,
@@ -62,7 +68,13 @@ import pandas._libs.missing as missing
 cdef:
     float64_t FP_ERR = 1e-13
     float64_t NaN = <float64_t>np.NaN
+    # NumPy stores float16 as a uint16 bit pattern since most C compilers lack a
+    # half type; fillna compares against this bit pattern to detect float16 NaNs
+    # https://docs.scipy.org/doc/numpy-1.13.0/reference/c-api.coremath.html#half-precision-functions
+    uint16_t uNaN = np.float16(np.nan).view(np.uint16)
     int64_t NPY_NAT = get_nat()
+    float64_t INF = <float64_t>np.inf
+    float64_t NEGINF = -INF
 
 cdef enum TiebreakEnumType:
     TIEBREAK_AVERAGE
@@ -832,6 +844,193 @@ def backfill_2d_inplace(algos_t[:, :] values,
     pad_2d_inplace(values[:, ::-1], mask[:, ::-1], limit)
 
 
+# Fillna logic: dedicated fused type instead of algos_t, since only dtypes
+# that can hold NAs need a specialization. NOTE(review): int8/int16/int32 fall
+# through to the `val != val` check in the kernels, which is never true for ints.
+ctypedef fused fillna_t:
+    float64_t
+    float32_t
+    object
+    int8_t  # Categorical
+    int16_t  # Categorical
+    int32_t  # Categorical
+    int64_t   # Categorical/Datetime64
+    uint16_t  # Float 16
+    complex64_t
+    complex128_t
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def fillna1d(fillna_t[:] arr,
+             fillna_t value,
+             Py_ssize_t limit,
+             bint inf_as_na=False
+             ) -> None:
+    """
+    Fill NA-like elements of a 1D array inplace.
+
+    Parameters
+    ----------
+    arr : ndarray
+    value : object
+        The value written over each NA-like element.
+    limit : int
+        Maximum number of elements to fill, counted from the start of arr.
+    inf_as_na : bool, default False
+        Whether to also treat INF and NEGINF as NA.
+    """
+    cdef:
+        Py_ssize_t i, N
+        Py_ssize_t count = 0
+        fillna_t val
+        bint result
+
+    assert arr.ndim == 1, "'arr' must be 1-D."
+
+    N = len(arr)
+    for i in range(N):
+        val = arr[i]
+        if fillna_t is object:
+            if inf_as_na:
+                result = checknull_old(val)
+            else:
+                result = checknull(val)
+        elif fillna_t is int64_t:
+            # Datetime64/Timedelta64
+            result = val == NPY_NAT
+        elif fillna_t is uint16_t:
+            # Float 16
+            result = val == uNaN
+        else:
+            result = val != val
+            if inf_as_na:
+                result = result or val == INF or val == NEGINF  # infs are NA too
+        if result and count < limit:
+            arr[i] = value
+            count += 1
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def fillna1d_multi_values(fillna_t[:] arr,
+                          algos_t[:] value,
+                          Py_ssize_t limit,
+                          bint inf_as_na=False
+                          ) -> None:
+    """
+    Fill NA-like elements of a 1D array inplace, one fill value per position.
+
+    Parameters
+    ----------
+    arr : ndarray
+    value : ndarray
+        Fill value to use at each position of arr; must not be
+        shorter than arr (a ValueError is raised otherwise).
+        For float and object values, a NaN entry means that
+        position is left unfilled.
+    limit : int
+        Maximum number of elements to fill, counted from the start of arr.
+    inf_as_na : bool, default False
+        Whether to also treat INF and NEGINF as NA.
+    """
+    cdef:
+        Py_ssize_t i, N
+        Py_ssize_t count = 0
+        fillna_t val
+        algos_t fill_value
+        bint result
+
+    assert arr.ndim == 1, "'arr' must be 1-D."
+
+    N = len(arr)
+    if len(value) < N:
+        raise ValueError("'value' must not be shorter than 'arr'.")
+    for i in range(N):
+        fill_value = value[i]
+        if algos_t is object or algos_t is float64_t or algos_t is float32_t:
+            if fill_value != fill_value:
+                # np.nan don't fill
+                continue
+        val = arr[i]
+        if fillna_t is object:
+            if inf_as_na:
+                result = checknull_old(val)
+            else:
+                result = checknull(val)
+        elif fillna_t is int64_t:
+            # Datetime64/Timedelta64
+            result = val == NPY_NAT
+        elif fillna_t is uint16_t:
+            # Float 16
+            result = val == uNaN
+        else:
+            result = val != val
+            if inf_as_na:
+                result = result or val == INF or val == NEGINF  # infs are NA too
+        if result and count < limit:
+            # Cast since dtypes may differ; callers must check the value fits
+            arr[i] = <fillna_t>fill_value
+            count += 1
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def fillna2d(fillna_t[:, :] arr,
+             fillna_t value,
+             Py_ssize_t limit,
+             bint inf_as_na=False
+             ) -> None:
+    """
+    Fill NA-like elements of a 2D array inplace.
+
+    Parameters
+    ----------
+    arr : ndarray
+    value : object
+        The value written over each NA-like element.
+    limit : int
+        Maximum number of elements to fill in each row (i.e. for each
+        index along the first axis), counted from the start of the row.
+    inf_as_na : bool, default False
+        Whether to also treat INF and NEGINF as NA.
+    """
+    cdef:
+        Py_ssize_t i, j, n, m
+        Py_ssize_t count = 0
+        fillna_t val
+        bint result
+
+    assert arr.ndim == 2, "'arr' must be 2-D."
+
+    n, m = (<object>arr).shape
+
+    for i in range(n):
+        # The limit applies to each row separately, so restart the count
+        count = 0
+        for j in range(m):
+            val = arr[i, j]
+            if fillna_t is object:
+                if inf_as_na:
+                    result = checknull_old(val)
+                else:
+                    result = checknull(val)
+            elif fillna_t is int64_t:
+                # Datetime64/Timedelta64
+                result = val == NPY_NAT
+            elif fillna_t is uint16_t:
+                # Float 16
+                result = val == uNaN
+            else:
+                result = val != val
+                if inf_as_na:
+                    # Infinities count as NA in addition to NaN
+                    result = result or val == INF or val == NEGINF
+            if result and count < limit:
+                arr[i, j] = value
+                count += 1
+
+
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def is_monotonic(ndarray[algos_t, ndim=1] arr, bint timelike):
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
index 237d06402a0ee..12faec2bdc9a4 100644
--- a/pandas/core/internals/blocks.py
+++ b/pandas/core/internals/blocks.py
@@ -50,6 +50,7 @@
     is_dtype_equal,
     is_extension_array_dtype,
     is_list_like,
+    is_scalar,
     is_sparse,
     is_string_dtype,
     pandas_dtype,
@@ -434,40 +435,68 @@ def fillna(
         fillna on the block with the value. If we fail, then convert to
         ObjectBlock and try again
         """
+        # TODO: Handle inf_as_na, we need to get option and pass to cython funcs
         inplace = validate_bool_kwarg(inplace, "inplace")
+        blk = self if inplace else self.copy()
+        nobs = len(self) if self.ndim == 1 else self.shape[-1]
+        limit = libalgos.validate_limit(nobs if nobs > 0 else None, limit=limit)
 
-        mask = isna(self.values)
-        mask, noop = validate_putmask(self.values, mask)
-
-        if limit is not None:
-            limit = libalgos.validate_limit(None, limit=limit)
-            mask[mask.cumsum(self.ndim - 1) > limit] = False
+        if not self._can_hold_na or nobs == 0:
+            return [blk]
 
-        if not self._can_hold_na:
-            if inplace:
-                return [self]
+        if not self.is_extension:
+            if self._can_hold_element(value):
+                if blk.dtype == np.float16:
+                    # Float16 not supported by compiler, use view as uint16 hack
+                    arr = blk.values.view(np.uint16)
+                    if is_scalar(value):
+                        value = np.float16(value).view(np.uint16)
+                    else:
+                        value = value.astype(np.float16).view(np.uint16)
+                else:
+                    arr = blk.values
+                if self.ndim == 1:
+                    if is_list_like(value):
+                        # FIXME: arr[mask] is a copy and mask keeps only NA fill values
+                        if is_extension_array_dtype(value):
+                            mask = value.isna()
+                            value = np.asarray(value[mask], dtype=object)
+                            libalgos.fillna1d_multi_values(
+                                arr[mask], value=value, limit=limit
+                            )
+                        else:
+                            libalgos.fillna1d_multi_values(
+                                arr, value=value, limit=limit
+                            )
+                    else:
+                        libalgos.fillna1d(arr, value=value, limit=limit)
+                else:
+                    libalgos.fillna2d(arr, value=value, limit=limit)
+                return blk._maybe_downcast([blk], downcast)
+            elif self.ndim == 1 or self.shape[0] == 1:
+                coerced_blk = self.coerce_to_target_dtype(value)
+                # bc we have already cast, inplace=True may avoid an extra copy
+                return coerced_blk.fillna(
+                    value, limit=limit, inplace=True, downcast=None
+                )
             else:
-                return [self.copy()]
-
-        if self._can_hold_element(value):
-            nb = self if inplace else self.copy()
-            putmask_inplace(nb.values, mask, value)
-            return nb._maybe_downcast([nb], downcast)
-
-        if noop:
-            # we can't process the value, but nothing to do
-            return [self] if inplace else [self.copy()]
-
-        elif self.ndim == 1 or self.shape[0] == 1:
-            blk = self.coerce_to_target_dtype(value)
-            # bc we have already cast, inplace=True may avoid an extra copy
-            return blk.fillna(value, limit=limit, inplace=True, downcast=None)
-
+                # operate column-by-column
+                return self.split_and_operate(
+                    type(self).fillna,
+                    value,
+                    limit=limit,
+                    inplace=inplace,
+                    downcast=None,
+                )
         else:
-            # operate column-by-column
-            return self.split_and_operate(
-                type(self).fillna, value, limit=limit, inplace=inplace, downcast=None
-            )
+            # TODO: This seems to work for EAS, verify it does
+            return [
+                self.make_block_same_class(
+                    values=cast(ExtensionArray, self.values).fillna(
+                        value=value, limit=limit
+                    )
+                )
+            ]
 
     @final
     def _split(self) -> list[Block]: