diff --git a/ci/deps/actions-37-db-min.yaml b/ci/deps/actions-37-db-min.yaml
index 1d3794576220a..0dfc806f4b631 100644
--- a/ci/deps/actions-37-db-min.yaml
+++ b/ci/deps/actions-37-db-min.yaml
@@ -27,6 +27,7 @@ dependencies:
   - lxml=4.3.0
   - matplotlib
   - nomkl
+  - numba
   - numexpr
   - openpyxl
   - pandas-gbq
diff --git a/ci/deps/actions-37-db.yaml b/ci/deps/actions-37-db.yaml
index 5381caaa242cf..75f934ec4690e 100644
--- a/ci/deps/actions-37-db.yaml
+++ b/ci/deps/actions-37-db.yaml
@@ -24,6 +24,7 @@ dependencies:
   - moto>=1.3.14
   - flask
   - nomkl
+  - numba
   - numexpr
   - numpy=1.16.*
   - odfpy
diff --git a/ci/deps/actions-37-locale_slow.yaml b/ci/deps/actions-37-locale_slow.yaml
index d9ad1f538908e..5433f01b9bc35 100644
--- a/ci/deps/actions-37-locale_slow.yaml
+++ b/ci/deps/actions-37-locale_slow.yaml
@@ -17,6 +17,7 @@ dependencies:
   - bottleneck=1.2.*
   - lxml
   - matplotlib=3.0.0
+  - numba
   - numpy=1.16.*
   - openpyxl=3.0.0
   - python-dateutil
diff --git a/ci/deps/actions-37.yaml b/ci/deps/actions-37.yaml
index 61f431256dd4a..430e6fabbe237 100644
--- a/ci/deps/actions-37.yaml
+++ b/ci/deps/actions-37.yaml
@@ -15,6 +15,7 @@ dependencies:
   # pandas dependencies
   - botocore>=1.11
   - fsspec>=0.7.4
+  - numba
   - numpy
   - python-dateutil
   - nomkl
diff --git a/ci/deps/actions-38-locale.yaml b/ci/deps/actions-38-locale.yaml
index 629804c71e726..1d22ad1c1b18c 100644
--- a/ci/deps/actions-38-locale.yaml
+++ b/ci/deps/actions-38-locale.yaml
@@ -23,6 +23,7 @@ dependencies:
   - matplotlib <3.3.0
   - moto
   - nomkl
+  - numba
   - numexpr
   - numpy<1.20  # GH#39541 compat with pyarrow<3
   - openpyxl
diff --git a/ci/deps/actions-38.yaml b/ci/deps/actions-38.yaml
index e2660d07c3558..2607a943c434b 100644
--- a/ci/deps/actions-38.yaml
+++ b/ci/deps/actions-38.yaml
@@ -13,6 +13,7 @@ dependencies:
   - hypothesis>=3.58.0
 
   # pandas dependencies
+  - numba
   - numpy
   - python-dateutil
   - nomkl
diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml
index 36e8bf528fc3e..d6fda472c5a32 100644
--- a/ci/deps/actions-39.yaml
+++ b/ci/deps/actions-39.yaml
@@ -12,6 +12,7 @@ dependencies:
   - hypothesis>=3.58.0
 
   # pandas dependencies
+  - numba
   - numpy
   - python-dateutil
   - pytz
diff --git a/ci/deps/azure-macos-37.yaml b/ci/deps/azure-macos-37.yaml
index d667adddda859..e0f6088e9f093 100644
--- a/ci/deps/azure-macos-37.yaml
+++ b/ci/deps/azure-macos-37.yaml
@@ -18,6 +18,7 @@ dependencies:
   - lxml
   - matplotlib=2.2.3
   - nomkl
+  - numba
   - numexpr
   - numpy=1.16.5
   - openpyxl
diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml
index e7ac4c783b855..a257fa6d27ae4 100644
--- a/ci/deps/azure-windows-37.yaml
+++ b/ci/deps/azure-windows-37.yaml
@@ -23,6 +23,7 @@ dependencies:
   - matplotlib=2.2.*
   - moto>=1.3.14
   - flask
+  - numba
   - numexpr
   - numpy=1.16.*
   - openpyxl
diff --git a/ci/deps/travis-37-arm64.yaml b/ci/deps/travis-37-arm64.yaml
index 8df6104f43a50..3ea1bc89af523 100644
--- a/ci/deps/travis-37-arm64.yaml
+++ b/ci/deps/travis-37-arm64.yaml
@@ -12,6 +12,7 @@ dependencies:
 
   # pandas dependencies
   - botocore>=1.11
+  - numba
   - numpy
   - python-dateutil
   - pytz
diff --git a/pandas/_libs_numba/__init__.py b/pandas/_libs_numba/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/pandas/_libs_numba/algos.py b/pandas/_libs_numba/algos.py
new file mode 100644
index 0000000000000..b191131c2e914
--- /dev/null
+++ b/pandas/_libs_numba/algos.py
@@ -0,0 +1,1593 @@
+from __future__ import annotations
+
+import numba
+
+# from numba import (
+#     float32,
+#     float64,
+#     int8,
+#     int16,
+#     int32,
+#     int64,
+#     intp,
+#     types,
+#     uint8,
+#     void,
+# )
+import numpy as np
+
+import pandas._libs_numba.util as util
+
+# import cython
+# from cython import Py_ssize_t
+
+# from libc.math cimport fabs, sqrt
+# from libc.stdlib cimport free, malloc
+# from libc.string cimport memmove
+
+# cimport numpy as cnp
+# from numpy cimport (
+#     NPY_FLOAT32,
+#     NPY_FLOAT64,
+#     NPY_INT8,
+#     NPY_INT16,
+#     NPY_INT32,
+#     NPY_INT64,
+#     NPY_OBJECT,
+#     NPY_UINT8,
+#     NPY_UINT16,
+#     NPY_UINT32,
+#     NPY_UINT64,
+#     float32_t,
+#     float64_t,
+#     int8_t,
+#     int16_t,
+#     int32_t,
+#     int64_t,
+#     intp_t,
+#     ndarray,
+#     uint8_t,
+#     uint16_t,
+#     uint32_t,
+#     uint64_t,
+# )
+
+# cnp.import_array()
+
+
+# from pandas._libs.khash cimport (
+#     kh_destroy_int64,
+#     kh_get_int64,
+#     kh_init_int64,
+#     kh_int64_t,
+#     kh_put_int64,
+#     kh_resize_int64,
+#     khiter_t,
+# )
+# from pandas._libs.util cimport get_nat, numeric
+
+# import pandas._libs.missing as missing
+
+# cdef:
+#     float64_t FP_ERR = 1e-13
+#     float64_t NaN = np.NaN
+#     int64_t NPY_NAT = get_nat()
+
+# tiebreakers = {
+#     "average": TIEBREAK_AVERAGE,
+#     "min": TIEBREAK_MIN,
+#     "max": TIEBREAK_MAX,
+#     "first": TIEBREAK_FIRST,
+#     "dense": TIEBREAK_DENSE,
+# }
+
+
+# cdef inline bint are_diff(object left, object right):
+#     try:
+#         return fabs(left - right) > FP_ERR
+#     except TypeError:
+#         return left != right
+
+
+# class Infinity:
+#     """
+#     Provide a positive Infinity comparison method for ranking.
+#     """
+#     __lt__ = lambda self, other: False
+#     __le__ = lambda self, other: isinstance(other, Infinity)
+#     __eq__ = lambda self, other: isinstance(other, Infinity)
+#     __ne__ = lambda self, other: not isinstance(other, Infinity)
+#     __gt__ = lambda self, other: (not isinstance(other, Infinity) and
+#                                   not missing.checknull(other))
+#     __ge__ = lambda self, other: not missing.checknull(other)
+
+
+# class NegInfinity:
+#     """
+#     Provide a negative Infinity comparison method for ranking.
+#     """
+#     __lt__ = lambda self, other: (not isinstance(other, NegInfinity) and
+#                                   not missing.checknull(other))
+#     __le__ = lambda self, other: not missing.checknull(other)
+#     __eq__ = lambda self, other: isinstance(other, NegInfinity)
+#     __ne__ = lambda self, other: not isinstance(other, NegInfinity)
+#     __gt__ = lambda self, other: False
+#     __ge__ = lambda self, other: isinstance(other, NegInfinity)
+
+
+@numba.njit
+def unique_deltas(arr: np.ndarray) -> np.ndarray:
+    """
+    Efficiently find the unique first-differences of the given array.
+
+    Parameters
+    ----------
+    arr : ndarray[int64_t]
+
+    Returns
+    -------
+    ndarray[int64_t, ndim=1]
+        An ordered ndarray[int64_t]
+    """
+    n = len(arr)
+    uniques = []
+    seen = set()
+
+    for i in range(n - 1):
+        val = arr[i + 1] - arr[i]
+        if val not in seen:
+            seen.add(val)
+            uniques.append(val)
+
+    result = np.array(uniques, dtype=np.int64)
+    result.sort()
+    return result
+
+
+def is_lexsorted(list_of_arrays: list[np.ndarray]) -> bool:
+    nlevels = len(list_of_arrays)
+    n = len(list_of_arrays[0])
+    arr = np.concatenate(list_of_arrays)
+    arr = arr.reshape(nlevels, n)
+    return _is_lexsorted(arr)
+
+
+@numba.njit
+def _is_lexsorted(vecs: np.ndarray) -> bool:
+    result = True
+    nlevels, n = vecs.shape
+
+    for i in range(1, n):
+        for k in range(nlevels):
+            cur = vecs[k, i]
+            pre = vecs[k, i - 1]
+            if cur == pre:
+                continue
+            elif cur > pre:
+                break
+            else:
+                result = False
+                break
+
+    return result
+
+
+@numba.njit
+def groupsort_indexer(index: np.ndarray, ngroups: int) -> tuple[np.ndarray, np.ndarray]:
+    """
+    Compute a 1-d indexer.
+
+    The indexer is an ordering of the passed index,
+    ordered by the groups.
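+
+    For example (illustrative), ``groupsort_indexer(np.array([1, 0, 1, 1]), 2)``
+    returns ``result = [1, 0, 2, 3]`` and ``counts = [0, 1, 3]``; slot 0 of
+    ``counts`` is reserved for NA labels (encoded as -1).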
+ + Parameters + ---------- + index: ndarray + Mappings from group -> position. + ngroups: int + Number of groups. + + Returns + ------- + tuple + 1-d indexer ordered by groups, group counts. + + Notes + ----- + This is a reverse of the label factorization process. + """ + counts = np.zeros(ngroups + 1, dtype=np.int64) + n = len(index) + result = np.zeros(n, dtype=np.int64) + where = np.zeros(ngroups + 1, dtype=np.int64) + + # count group sizes, location 0 for NA + for i in range(n): + counts[index[i] + 1] += 1 + + # mark the start of each contiguous group of like-indexed data + for i in range(1, ngroups + 1): + where[i] = where[i - 1] + counts[i - 1] + + # this is our indexer + for i in range(n): + label = index[i] + 1 + result[where[label]] = i + where[label] += 1 + + return result, counts + + +@numba.njit +def kth_smallest(a: np.ndarray, k): + n = a.shape[0] + + l = 0 + m = n - 1 + + while l < m: + x = a[k] + i = l + j = m + + while 1: + while a[i] < x: + i += 1 + while x < a[j]: + j -= 1 + if i <= j: + a[i], a[j] = a[j], a[i] + i += 1 + j -= 1 + + if i > j: + break + + if j < k: + l = i + if k < i: + m = j + return a[k] + + +# # ---------------------------------------------------------------------- +# # Pairwise correlation/covariance + + +# @cython.boundscheck(False) +# @cython.wraparound(False) +# def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None): +# cdef: +# Py_ssize_t i, j, xi, yi, N, K +# bint minpv +# ndarray[float64_t, ndim=2] result +# ndarray[uint8_t, ndim=2] mask +# int64_t nobs = 0 +# float64_t vx, vy, meanx, meany, divisor, prev_meany, prev_meanx, ssqdmx +# float64_t ssqdmy, covxy + +# N, K = (mat).shape + +# if minp is None: +# minpv = 1 +# else: +# minpv = minp + +# result = np.empty((K, K), dtype=np.float64) +# mask = np.isfinite(mat).view(np.uint8) + +# with nogil: +# for xi in range(K): +# for yi in range(xi + 1): +# # Welford's method for the variance-calculation +# # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance +# nobs = ssqdmx = ssqdmy = covxy = meanx = meany = 0 +# for i in range(N): +# if mask[i, xi] and mask[i, yi]: +# vx = mat[i, xi] +# vy = mat[i, yi] +# nobs += 1 +# prev_meanx = meanx +# prev_meany = meany +# meanx = meanx + 1 / nobs * (vx - meanx) +# meany = meany + 1 / nobs * (vy - meany) +# ssqdmx = ssqdmx + (vx - meanx) * (vx - prev_meanx) +# ssqdmy = ssqdmy + (vy - meany) * (vy - prev_meany) +# covxy = covxy + (vx - meanx) * (vy - prev_meany) + +# if nobs < minpv: +# result[xi, yi] = result[yi, xi] = NaN +# else: +# divisor = (nobs - 1.0) if cov else sqrt(ssqdmx * ssqdmy) + +# if divisor != 0: +# result[xi, yi] = result[yi, xi] = covxy / divisor +# else: +# result[xi, yi] = result[yi, xi] = NaN + +# return result + +# # ---------------------------------------------------------------------- +# # Pairwise Spearman correlation + + +# @cython.boundscheck(False) +# @cython.wraparound(False) +# def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarray: +# cdef: +# Py_ssize_t i, j, xi, yi, N, K +# ndarray[float64_t, ndim=2] result +# ndarray[float64_t, ndim=2] ranked_mat +# ndarray[float64_t, ndim=1] maskedx +# ndarray[float64_t, ndim=1] maskedy +# ndarray[uint8_t, ndim=2] mask +# int64_t nobs = 0 +# float64_t vx, vy, sumx, sumxx, sumyy, mean, divisor +# const int64_t[:] labels_n, labels_nobs + +# N, K = (mat).shape +# # For compatibility when calling rank_1d +# labels_n = np.zeros(N, dtype=np.int64) + +# result = np.empty((K, K), dtype=np.float64) +# mask = np.isfinite(mat).view(np.uint8) + +# 
ranked_mat = np.empty((N, K), dtype=np.float64) + +# for i in range(K): +# ranked_mat[:, i] = rank_1d(mat[:, i], labels=labels_n) + +# for xi in range(K): +# for yi in range(xi + 1): +# nobs = 0 +# # Keep track of whether we need to recompute ranks +# all_ranks = True +# for i in range(N): +# all_ranks &= not (mask[i, xi] ^ mask[i, yi]) +# if mask[i, xi] and mask[i, yi]: +# nobs += 1 + +# if nobs < minp: +# result[xi, yi] = result[yi, xi] = NaN +# else: +# maskedx = np.empty(nobs, dtype=np.float64) +# maskedy = np.empty(nobs, dtype=np.float64) +# j = 0 + +# for i in range(N): +# if mask[i, xi] and mask[i, yi]: +# maskedx[j] = ranked_mat[i, xi] +# maskedy[j] = ranked_mat[i, yi] +# j += 1 + +# if not all_ranks: +# labels_nobs = np.zeros(nobs, dtype=np.int64) +# maskedx = rank_1d(maskedx, labels=labels_nobs) +# maskedy = rank_1d(maskedy, labels=labels_nobs) + +# mean = (nobs + 1) / 2. + +# # now the cov numerator +# sumx = sumxx = sumyy = 0 + +# for i in range(nobs): +# vx = maskedx[i] - mean +# vy = maskedy[i] - mean + +# sumx += vx * vy +# sumxx += vx * vx +# sumyy += vy * vy + +# divisor = sqrt(sumxx * sumyy) + +# if divisor != 0: +# result[xi, yi] = result[yi, xi] = sumx / divisor +# else: +# result[xi, yi] = result[yi, xi] = NaN + +# return result + + +# # ---------------------------------------------------------------------- +# # Kendall correlation +# # Wikipedia article: https://en.wikipedia.org/wiki/Kendall_rank_correlation_coefficient # noqa + +# @cython.boundscheck(False) +# @cython.wraparound(False) +# def nancorr_kendall(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarray: +# """ +# Perform kendall correlation on a 2d array + +# Parameters +# ---------- +# mat : np.ndarray[float64_t, ndim=2] +# Array to compute kendall correlation on +# minp : int, default 1 +# Minimum number of observations required per pair of columns +# to have a valid result. 
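+#         Column pairs with fewer than `minp` shared observations yield
+#         NaN in the result.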
+ +# Returns +# ------- +# numpy.ndarray[float64_t, ndim=2] +# Correlation matrix +# """ +# cdef: +# Py_ssize_t i, j, k, xi, yi, N, K +# ndarray[float64_t, ndim=2] result +# ndarray[float64_t, ndim=2] ranked_mat +# ndarray[uint8_t, ndim=2] mask +# float64_t currj +# ndarray[uint8_t, ndim=1] valid +# ndarray[int64_t] sorted_idxs +# ndarray[float64_t, ndim=1] col +# int64_t n_concordant +# int64_t total_concordant = 0 +# int64_t total_discordant = 0 +# float64_t kendall_tau +# int64_t n_obs +# const int64_t[:] labels_n + +# N, K = (mat).shape + +# result = np.empty((K, K), dtype=np.float64) +# mask = np.isfinite(mat) + +# ranked_mat = np.empty((N, K), dtype=np.float64) +# # For compatibility when calling rank_1d +# labels_n = np.zeros(N, dtype=np.int64) + +# for i in range(K): +# ranked_mat[:, i] = rank_1d(mat[:, i], labels_n) + +# for xi in range(K): +# sorted_idxs = ranked_mat[:, xi].argsort() +# ranked_mat = ranked_mat[sorted_idxs] +# mask = mask[sorted_idxs] +# for yi in range(xi + 1, K): +# valid = mask[:, xi] & mask[:, yi] +# if valid.sum() < minp: +# result[xi, yi] = NaN +# result[yi, xi] = NaN +# else: +# # Get columns and order second column using 1st column ranks +# if not valid.all(): +# col = ranked_mat[valid.nonzero()][:, yi] +# else: +# col = ranked_mat[:, yi] +# n_obs = col.shape[0] +# total_concordant = 0 +# total_discordant = 0 +# for j in range(n_obs - 1): +# currj = col[j] +# # Count num concordant and discordant pairs +# n_concordant = 0 +# for k in range(j, n_obs): +# if col[k] > currj: +# n_concordant += 1 +# total_concordant += n_concordant +# total_discordant += (n_obs - 1 - j - n_concordant) +# # Note: we do total_concordant+total_discordant here which is +# # equivalent to the C(n, 2), the total # of pairs, +# # listed on wikipedia +# kendall_tau = (total_concordant - total_discordant) / \ +# (total_concordant + total_discordant) +# result[xi, yi] = kendall_tau +# result[yi, xi] = kendall_tau + +# if mask[:, xi].sum() > minp: +# result[xi, xi] = 1 +# else: +# result[xi, xi] = NaN + +# return result + + +# ---------------------------------------------------------------------- + + +def validate_limit(limit: int | None = None) -> None: + """ + Check that the `limit` argument is a positive integer or None. 
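+
+    For example (illustrative): ``validate_limit(None)`` and
+    ``validate_limit(3)`` return silently, while ``validate_limit(0)`` or
+    ``validate_limit(2.5)`` raise ``ValueError``.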
+ """ + if limit is None: + return + elif not util.is_integer_object(limit): + raise ValueError("Limit must be an integer") + elif limit < 1: + raise ValueError("Limit must be greater than 0") + + +# @cython.boundscheck(False) +# @cython.wraparound(False) +# def pad(ndarray[algos_t] old, ndarray[algos_t] new, limit=None): +# cdef: +# Py_ssize_t i, j, nleft, nright +# ndarray[int64_t, ndim=1] indexer +# algos_t cur, next_val +# int lim, fill_count = 0 + +# nleft = len(old) +# nright = len(new) +# indexer = np.empty(nright, dtype=np.int64) +# indexer[:] = -1 + +# lim = validate_limit(nright, limit) + +# if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: +# return indexer + +# i = j = 0 + +# cur = old[0] + +# while j <= nright - 1 and new[j] < cur: +# j += 1 + +# while True: +# if j == nright: +# break + +# if i == nleft - 1: +# while j < nright: +# if new[j] == cur: +# indexer[j] = i +# elif new[j] > cur and fill_count < lim: +# indexer[j] = i +# fill_count += 1 +# j += 1 +# break + +# next_val = old[i + 1] + +# while j < nright and cur <= new[j] < next_val: +# if new[j] == cur: +# indexer[j] = i +# elif fill_count < lim: +# indexer[j] = i +# fill_count += 1 +# j += 1 + +# fill_count = 0 +# i += 1 +# cur = next_val + +# return indexer + + +def pad_inplace(values: np.ndarray, mask: np.ndarray, limit: int | None = None) -> None: + validate_limit(limit) + _pad_inplace(values, mask, limit) + + +@numba.jit +def _pad_inplace( + values: np.ndarray, mask: np.ndarray, limit: int | None = None +) -> None: + if values.shape[0]: + if limit is None: + _pad_inplace_no_limit(values, mask) + else: + _pad_inplace_with_limit(values, mask, limit) + + +@numba.jit +def _pad_inplace_no_limit(values: np.ndarray, mask: np.ndarray) -> None: + N = len(values) + val, prev_mask = values[0], mask[0] + for i in range(N): + if mask[i]: + values[i], mask[i] = val, prev_mask + else: + val, prev_mask = values[i], mask[i] + + +@numba.jit +def _pad_inplace_with_limit(values: np.ndarray, mask: np.ndarray, limit: int) -> None: + N = len(values) + fill_count = 0 + val, prev_mask = values[0], mask[0] + for i in range(N): + if mask[i]: + if fill_count >= limit: + continue + fill_count += 1 + values[i], mask[i] = val, prev_mask + + else: + fill_count = 0 + val, prev_mask = values[i], mask[i] + + +def pad_2d_inplace( + values: np.ndarray, mask: np.ndarray, limit: int | None = None +) -> None: + validate_limit(limit) + _pad_2d_inplace(values, mask, limit) + + +@numba.jit +def _pad_2d_inplace(values, mask, limit=None): + if values.shape[1]: + if limit is None: + _pad_2d_inplace_no_limit(values, mask) + else: + _pad_2d_inplace_with_limit(values, mask, limit) + + +@numba.jit +def _pad_2d_inplace_no_limit(values, mask): + K, N = values.shape + for j in range(K): + val = values[j, 0] + for i in range(N): + if mask[j, i]: + values[j, i] = val + else: + val = values[j, i] + + +@numba.jit +def _pad_2d_inplace_with_limit(values, mask, limit): + K, N = values.shape + for j in range(K): + fill_count = 0 + val = values[j, 0] + for i in range(N): + if mask[j, i]: + if fill_count >= limit: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] + + +# """ +# Backfilling logic for generating fill vector + +# Diagram of what's going on + +# Old New Fill vector Mask +# . 0 1 +# . 0 1 +# . 0 1 +# A A 0 1 +# . 1 1 +# . 1 1 +# . 1 1 +# . 1 1 +# . 1 1 +# B B 1 1 +# . 2 1 +# . 2 1 +# . 2 1 +# C C 2 1 +# . 0 +# . 
0 +# D +# """ + + +# @cython.boundscheck(False) +# @cython.wraparound(False) +# def backfill(ndarray[algos_t] old, ndarray[algos_t] new, limit=None) -> ndarray: +# cdef: +# Py_ssize_t i, j, nleft, nright +# ndarray[int64_t, ndim=1] indexer +# algos_t cur, prev +# int lim, fill_count = 0 + +# nleft = len(old) +# nright = len(new) +# indexer = np.empty(nright, dtype=np.int64) +# indexer[:] = -1 + +# lim = validate_limit(nright, limit) + +# if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: +# return indexer + +# i = nleft - 1 +# j = nright - 1 + +# cur = old[nleft - 1] + +# while j >= 0 and new[j] > cur: +# j -= 1 + +# while True: +# if j < 0: +# break + +# if i == 0: +# while j >= 0: +# if new[j] == cur: +# indexer[j] = i +# elif new[j] < cur and fill_count < lim: +# indexer[j] = i +# fill_count += 1 +# j -= 1 +# break + +# prev = old[i - 1] + +# while j >= 0 and prev < new[j] <= cur: +# if new[j] == cur: +# indexer[j] = i +# elif new[j] < cur and fill_count < lim: +# indexer[j] = i +# fill_count += 1 +# j -= 1 + +# fill_count = 0 +# i -= 1 +# cur = prev + +# return indexer + + +@numba.njit +def is_monotonic(arr: np.ndarray) -> tuple[bool, bool, bool]: + """ + Returns + ------- + tuple + is_monotonic_inc : bool + is_monotonic_dec : bool + is_unique : bool + """ + is_monotonic_inc = True + is_monotonic_dec = True + is_unique = True + is_strict_monotonic = True + + n = len(arr) + + if n == 1: + if arr[0] != arr[0]: + # single value is NaN/NaT + return False, False, True + else: + return True, True, True + elif n < 2: + return True, True, True + + prev = arr[0] + for i in range(1, n): + cur = arr[i] + if cur < prev: + is_monotonic_inc = False + elif cur > prev: + is_monotonic_dec = False + elif cur == prev: + is_unique = False + else: + # cur or prev is NaN/NaT + is_monotonic_inc = False + is_monotonic_dec = False + break + if not is_monotonic_inc and not is_monotonic_dec: + break + prev = cur + + is_strict_monotonic = is_unique and (is_monotonic_inc or is_monotonic_dec) + return is_monotonic_inc, is_monotonic_dec, is_strict_monotonic + + +# # ---------------------------------------------------------------------- +# # rank_1d, rank_2d +# # ---------------------------------------------------------------------- + +# ctypedef fused rank_t: +# object +# float64_t +# uint64_t +# int64_t + + +# @cython.wraparound(False) +# @cython.boundscheck(False) +# def rank_1d( +# ndarray[rank_t, ndim=1] values, +# const int64_t[:] labels, +# ties_method="average", +# bint ascending=True, +# bint pct=False, +# na_option="keep", +# ): +# """ +# Fast NaN-friendly version of ``scipy.stats.rankdata``. + +# Parameters +# ---------- +# values : array of rank_t values to be ranked +# labels : array containing unique label for each group, with its ordering +# matching up to the corresponding record in `values`. 
If not called +# from a groupby operation, will be an array of 0's +# ties_method : {'average', 'min', 'max', 'first', 'dense'}, default +# 'average' +# * average: average rank of group +# * min: lowest rank in group +# * max: highest rank in group +# * first: ranks assigned in order they appear in the array +# * dense: like 'min', but rank always increases by 1 between groups +# ascending : boolean, default True +# False for ranks by high (1) to low (N) +# na_option : {'keep', 'top', 'bottom'}, default 'keep' +# pct : boolean, default False +# Compute percentage rank of data within each group +# na_option : {'keep', 'top', 'bottom'}, default 'keep' +# * keep: leave NA values where they are +# * top: smallest rank if ascending +# * bottom: smallest rank if descending +# """ +# cdef: +# TiebreakEnumType tiebreak +# Py_ssize_t i, j, N, grp_start=0, dups=0, sum_ranks=0 +# Py_ssize_t grp_vals_seen=1, grp_na_count=0, grp_tie_count=0 +# ndarray[int64_t, ndim=1] lexsort_indexer +# ndarray[float64_t, ndim=1] grp_sizes, out +# ndarray[rank_t, ndim=1] masked_vals +# ndarray[uint8_t, ndim=1] mask +# bint keep_na, at_end, next_val_diff, check_labels +# rank_t nan_fill_val + +# tiebreak = tiebreakers[ties_method] +# keep_na = na_option == 'keep' + +# N = len(values) +# # TODO Cython 3.0: cast won't be necessary (#2992) +# assert len(labels) == N +# out = np.empty(N) +# grp_sizes = np.ones(N) +# # If all 0 labels, can short-circuit later label +# # comparisons +# check_labels = np.any(labels) + +# # Copy values into new array in order to fill missing data +# # with mask, without obfuscating location of missing data +# # in values array +# if rank_t is object and values.dtype != np.object_: +# masked_vals = values.astype('O') +# else: +# masked_vals = values.copy() + +# if rank_t is object: +# mask = missing.isnaobj(masked_vals) +# elif rank_t is int64_t: +# mask = (masked_vals == NPY_NAT).astype(np.uint8) +# elif rank_t is float64_t: +# mask = np.isnan(masked_vals).astype(np.uint8) +# else: +# mask = np.zeros(shape=len(masked_vals), dtype=np.uint8) + +# if ascending ^ (na_option == 'top'): +# if rank_t is object: +# nan_fill_val = Infinity() +# elif rank_t is int64_t: +# nan_fill_val = np.iinfo(np.int64).max +# elif rank_t is uint64_t: +# nan_fill_val = np.iinfo(np.uint64).max +# else: +# nan_fill_val = np.inf +# order = (masked_vals, mask, labels) +# else: +# if rank_t is object: +# nan_fill_val = NegInfinity() +# elif rank_t is int64_t: +# nan_fill_val = np.iinfo(np.int64).min +# elif rank_t is uint64_t: +# nan_fill_val = 0 +# else: +# nan_fill_val = -np.inf + +# order = (masked_vals, ~mask, labels) + +# np.putmask(masked_vals, mask, nan_fill_val) + +# # lexsort using labels, then mask, then actual values +# # each label corresponds to a different group value, +# # the mask helps you differentiate missing values before +# # performing sort on the actual values +# lexsort_indexer = np.lexsort(order).astype(np.int64, copy=False) + +# if not ascending: +# lexsort_indexer = lexsort_indexer[::-1] + +# # Loop over the length of the value array +# # each incremental i value can be looked up in the lexsort_indexer +# # array that we sorted previously, which gives us the location of +# # that sorted value for retrieval back from the original +# # values / masked_vals arrays +# # TODO: de-duplicate once cython supports conditional nogil +# if rank_t is object: +# for i in range(N): +# at_end = i == N - 1 +# # dups and sum_ranks will be incremented each loop where +# # the value / group remains the same, and 
should be reset +# # when either of those change +# # Used to calculate tiebreakers +# dups += 1 +# sum_ranks += i - grp_start + 1 + +# # Update out only when there is a transition of values or labels. +# # When a new value or group is encountered, go back #dups steps( +# # the number of occurrence of current value) and assign the ranks +# # based on the starting index of the current group (grp_start) +# # and the current index +# if not at_end: +# next_val_diff = are_diff(masked_vals[lexsort_indexer[i]], +# masked_vals[lexsort_indexer[i+1]]) +# else: +# next_val_diff = True + +# if (next_val_diff +# or (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]]) +# or (check_labels +# and (labels[lexsort_indexer[i]] +# != labels[lexsort_indexer[i+1]])) +# ): +# # if keep_na, check for missing values and assign back +# # to the result where appropriate +# if keep_na and mask[lexsort_indexer[i]]: +# for j in range(i - dups + 1, i + 1): +# out[lexsort_indexer[j]] = NaN +# grp_na_count = dups +# elif tiebreak == TIEBREAK_AVERAGE: +# for j in range(i - dups + 1, i + 1): +# out[lexsort_indexer[j]] = sum_ranks / dups +# elif tiebreak == TIEBREAK_MIN: +# for j in range(i - dups + 1, i + 1): +# out[lexsort_indexer[j]] = i - grp_start - dups + 2 +# elif tiebreak == TIEBREAK_MAX: +# for j in range(i - dups + 1, i + 1): +# out[lexsort_indexer[j]] = i - grp_start + 1 +# elif tiebreak == TIEBREAK_FIRST: +# for j in range(i - dups + 1, i + 1): +# if ascending: +# out[lexsort_indexer[j]] = j + 1 - grp_start +# else: +# out[lexsort_indexer[j]] = 2 * i - j - dups + 2 - grp_start +# elif tiebreak == TIEBREAK_DENSE: +# for j in range(i - dups + 1, i + 1): +# out[lexsort_indexer[j]] = grp_vals_seen + +# # look forward to the next value (using the sorting in _as) +# # if the value does not equal the current value then we need to +# # reset the dups and sum_ranks, knowing that a new value is +# # coming up. the conditional also needs to handle nan equality +# # and the end of iteration +# if next_val_diff or (mask[lexsort_indexer[i]] +# ^ mask[lexsort_indexer[i+1]]): +# dups = sum_ranks = 0 +# grp_vals_seen += 1 +# grp_tie_count += 1 + +# # Similar to the previous conditional, check now if we are +# # moving to a new group. If so, keep track of the index where +# # the new group occurs, so the tiebreaker calculations can +# # decrement that from their position. fill in the size of each +# # group encountered (used by pct calculations later). also be +# # sure to reset any of the items helping to calculate dups +# if (at_end or +# (check_labels +# and (labels[lexsort_indexer[i]] +# != labels[lexsort_indexer[i+1]]))): +# if tiebreak != TIEBREAK_DENSE: +# for j in range(grp_start, i + 1): +# grp_sizes[lexsort_indexer[j]] = \ +# (i - grp_start + 1 - grp_na_count) +# else: +# for j in range(grp_start, i + 1): +# grp_sizes[lexsort_indexer[j]] = \ +# (grp_tie_count - (grp_na_count > 0)) +# dups = sum_ranks = 0 +# grp_na_count = 0 +# grp_tie_count = 0 +# grp_start = i + 1 +# grp_vals_seen = 1 +# else: +# with nogil: +# for i in range(N): +# at_end = i == N - 1 +# # dups and sum_ranks will be incremented each loop where +# # the value / group remains the same, and should be reset +# # when either of those change +# # Used to calculate tiebreakers +# dups += 1 +# sum_ranks += i - grp_start + 1 + +# # Update out only when there is a transition of values or labels. 
+# # When a new value or group is encountered, go back #dups steps( +# # the number of occurrence of current value) and assign the ranks +# # based on the starting index of the current group (grp_start) +# # and the current index +# if not at_end: +# next_val_diff = (masked_vals[lexsort_indexer[i]] +# != masked_vals[lexsort_indexer[i+1]]) +# else: +# next_val_diff = True + +# if (next_val_diff +# or (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]]) +# or (check_labels +# and (labels[lexsort_indexer[i]] +# != labels[lexsort_indexer[i+1]])) +# ): +# # if keep_na, check for missing values and assign back +# # to the result where appropriate +# if keep_na and mask[lexsort_indexer[i]]: +# for j in range(i - dups + 1, i + 1): +# out[lexsort_indexer[j]] = NaN +# grp_na_count = dups +# elif tiebreak == TIEBREAK_AVERAGE: +# for j in range(i - dups + 1, i + 1): +# out[lexsort_indexer[j]] = sum_ranks / dups +# elif tiebreak == TIEBREAK_MIN: +# for j in range(i - dups + 1, i + 1): +# out[lexsort_indexer[j]] = i - grp_start - dups + 2 +# elif tiebreak == TIEBREAK_MAX: +# for j in range(i - dups + 1, i + 1): +# out[lexsort_indexer[j]] = i - grp_start + 1 +# elif tiebreak == TIEBREAK_FIRST: +# for j in range(i - dups + 1, i + 1): +# if ascending: +# out[lexsort_indexer[j]] = j + 1 - grp_start +# else: +# out[lexsort_indexer[j]] = \ +# (2 * i - j - dups + 2 - grp_start) +# elif tiebreak == TIEBREAK_DENSE: +# for j in range(i - dups + 1, i + 1): +# out[lexsort_indexer[j]] = grp_vals_seen + +# # look forward to the next value (using the sorting in +# # lexsort_indexer) if the value does not equal the current +# # value then we need to reset the dups and sum_ranks, +# # knowing that a new value is coming up. the conditional +# # also needs to handle nan equality and the end of iteration +# if next_val_diff or (mask[lexsort_indexer[i]] +# ^ mask[lexsort_indexer[i+1]]): +# dups = sum_ranks = 0 +# grp_vals_seen += 1 +# grp_tie_count += 1 + +# # Similar to the previous conditional, check now if we are +# # moving to a new group. If so, keep track of the index where +# # the new group occurs, so the tiebreaker calculations can +# # decrement that from their position. fill in the size of each +# # group encountered (used by pct calculations later). also be +# # sure to reset any of the items helping to calculate dups +# if at_end or (check_labels and +# (labels[lexsort_indexer[i]] +# != labels[lexsort_indexer[i+1]])): +# if tiebreak != TIEBREAK_DENSE: +# for j in range(grp_start, i + 1): +# grp_sizes[lexsort_indexer[j]] = \ +# (i - grp_start + 1 - grp_na_count) +# else: +# for j in range(grp_start, i + 1): +# grp_sizes[lexsort_indexer[j]] = \ +# (grp_tie_count - (grp_na_count > 0)) +# dups = sum_ranks = 0 +# grp_na_count = 0 +# grp_tie_count = 0 +# grp_start = i + 1 +# grp_vals_seen = 1 + +# if pct: +# for i in range(N): +# if grp_sizes[i] != 0: +# out[i] = out[i] / grp_sizes[i] + +# return out + + +# def rank_2d( +# ndarray[rank_t, ndim=2] in_arr, +# int axis=0, +# ties_method="average", +# bint ascending=True, +# na_option="keep", +# bint pct=False, +# ): +# """ +# Fast NaN-friendly version of ``scipy.stats.rankdata``. 
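+
+#     Ranks are computed along `axis` (with ``axis=0`` the input is
+#     transposed first, so the ranking runs within each column of the
+#     original array), using the same tie-breaking options as `rank_1d`.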
+# """ +# cdef: +# Py_ssize_t i, j, z, k, n, dups = 0, total_tie_count = 0 +# Py_ssize_t infs +# ndarray[float64_t, ndim=2] ranks +# ndarray[rank_t, ndim=2] values +# ndarray[intp_t, ndim=2] argsort_indexer +# ndarray[uint8_t, ndim=2] mask +# rank_t val, nan_value +# float64_t count, sum_ranks = 0.0 +# int tiebreak = 0 +# int64_t idx +# bint check_mask, condition, keep_na + +# tiebreak = tiebreakers[ties_method] + +# keep_na = na_option == 'keep' +# check_mask = rank_t is not uint64_t + +# if axis == 0: +# values = np.asarray(in_arr).T.copy() +# else: +# values = np.asarray(in_arr).copy() + +# if rank_t is object: +# if values.dtype != np.object_: +# values = values.astype('O') + +# if rank_t is not uint64_t: +# if ascending ^ (na_option == 'top'): +# if rank_t is object: +# nan_value = Infinity() +# elif rank_t is float64_t: +# nan_value = np.inf +# elif rank_t is int64_t: +# nan_value = np.iinfo(np.int64).max + +# else: +# if rank_t is object: +# nan_value = NegInfinity() +# elif rank_t is float64_t: +# nan_value = -np.inf +# elif rank_t is int64_t: +# nan_value = NPY_NAT + +# if rank_t is object: +# mask = missing.isnaobj2d(values) +# elif rank_t is float64_t: +# mask = np.isnan(values) +# elif rank_t is int64_t: +# mask = values == NPY_NAT + +# np.putmask(values, mask, nan_value) +# else: +# mask = np.zeros_like(values, dtype=bool) + +# n, k = (values).shape +# ranks = np.empty((n, k), dtype='f8') + +# if tiebreak == TIEBREAK_FIRST: +# # need to use a stable sort here +# argsort_indexer = values.argsort(axis=1, kind='mergesort') +# if not ascending: +# tiebreak = TIEBREAK_FIRST_DESCENDING +# else: +# argsort_indexer = values.argsort(1) + +# if not ascending: +# argsort_indexer = argsort_indexer[:, ::-1] + +# values = _take_2d(values, argsort_indexer) + +# for i in range(n): +# dups = sum_ranks = infs = 0 + +# total_tie_count = 0 +# count = 0.0 +# for j in range(k): +# val = values[i, j] +# idx = argsort_indexer[i, j] +# if keep_na and check_mask and mask[i, idx]: +# ranks[i, idx] = NaN +# infs += 1 +# continue + +# count += 1.0 + +# sum_ranks += (j - infs) + 1 +# dups += 1 + +# if rank_t is object: +# condition = ( +# j == k - 1 or +# are_diff(values[i, j + 1], val) or +# (keep_na and check_mask and mask[i, argsort_indexer[i, j + 1]]) +# ) +# else: +# condition = ( +# j == k - 1 or +# values[i, j + 1] != val or +# (keep_na and check_mask and mask[i, argsort_indexer[i, j + 1]]) +# ) + +# if condition: +# if tiebreak == TIEBREAK_AVERAGE: +# for z in range(j - dups + 1, j + 1): +# ranks[i, argsort_indexer[i, z]] = sum_ranks / dups +# elif tiebreak == TIEBREAK_MIN: +# for z in range(j - dups + 1, j + 1): +# ranks[i, argsort_indexer[i, z]] = j - dups + 2 +# elif tiebreak == TIEBREAK_MAX: +# for z in range(j - dups + 1, j + 1): +# ranks[i, argsort_indexer[i, z]] = j + 1 +# elif tiebreak == TIEBREAK_FIRST: +# if rank_t is object: +# raise ValueError('first not supported for non-numeric data') +# else: +# for z in range(j - dups + 1, j + 1): +# ranks[i, argsort_indexer[i, z]] = z + 1 +# elif tiebreak == TIEBREAK_FIRST_DESCENDING: +# for z in range(j - dups + 1, j + 1): +# ranks[i, argsort_indexer[i, z]] = 2 * j - z - dups + 2 +# elif tiebreak == TIEBREAK_DENSE: +# total_tie_count += 1 +# for z in range(j - dups + 1, j + 1): +# ranks[i, argsort_indexer[i, z]] = total_tie_count +# sum_ranks = dups = 0 +# if pct: +# if tiebreak == TIEBREAK_DENSE: +# ranks[i, :] /= total_tie_count +# else: +# ranks[i, :] /= count +# if axis == 0: +# return ranks.T +# else: +# return ranks + + +@numba.njit +def 
diff_2d( + arr: np.ndarray, + out: np.ndarray, + periods: int, + axis: int, +): + f_contig = arr.flags.f_contiguous + + sx, sy = arr.shape + if f_contig: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for j in range(sy): + for i in range(start, stop): + left = arr[i, j] + right = arr[i - periods, j] + out[i, j] = left - right + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for j in range(start, stop): + for i in range(sx): + left = arr[i, j] + right = arr[i, j - periods] + out[i, j] = left - right + else: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for i in range(start, stop): + for j in range(sy): + left = arr[i, j] + right = arr[i - periods, j] + out[i, j] = left - right + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for i in range(sx): + for j in range(start, stop): + left = arr[i, j] + right = arr[i, j - periods] + out[i, j] = left - right + + +# ---------------------------------------------------------------------- +# ensure_dtype +# ---------------------------------------------------------------------- + + +def ensure_platform_int(arr): + # GH3033, GH1392 + # platform int is the size of the int pointer, e.g. np.intp + if isinstance(arr, np.ndarray): + return arr.astype(np.intp, copy=False) + else: + return np.array(arr, dtype=np.intp) + + +def ensure_object(arr): + if isinstance(arr, np.ndarray): + return arr.astype(np.object_, copy=False) + else: + return np.array(arr, dtype=np.object_) + + +def ensure_float64(arr): + if isinstance(arr, np.ndarray): + return arr.astype(np.float64, copy=False) + else: + return np.array(arr, dtype=np.float64) + + +def ensure_float32(arr): + if isinstance(arr, np.ndarray): + return arr.astype(np.float32, copy=False) + else: + return np.array(arr, dtype=np.float32) + + +def ensure_int8(arr): + if isinstance(arr, np.ndarray): + return arr.astype(np.int8, copy=False) + else: + return np.array(arr, dtype=np.int8) + + +def ensure_int16(arr): + if isinstance(arr, np.ndarray): + return arr.astype(np.int16, copy=False) + else: + return np.array(arr, dtype=np.int16) + + +def ensure_int32(arr): + if isinstance(arr, np.ndarray): + return arr.astype(np.int32, copy=False) + else: + return np.array(arr, dtype=np.int32) + + +def ensure_int64(arr): + if isinstance(arr, np.ndarray): + return arr.astype(np.int64, copy=False) + else: + return np.array(arr, dtype=np.int64) + + +def ensure_uint8(arr): + if isinstance(arr, np.ndarray): + return arr.astype(np.uint8, copy=False) + else: + return np.array(arr, dtype=np.uint8) + + +def ensure_uint16(arr): + if isinstance(arr, np.ndarray): + return arr.astype(np.uint16, copy=False) + else: + return np.array(arr, dtype=np.uint16) + + +def ensure_uint32(arr): + if isinstance(arr, np.ndarray): + return arr.astype(np.uint32, copy=False) + else: + return np.array(arr, dtype=np.uint32) + + +def ensure_uint64(arr): + if isinstance(arr, np.ndarray): + return arr.astype(np.uint64, copy=False) + else: + return np.array(arr, dtype=np.uint64) + + +# ---------------------------------------------------------------------- +# take_1d, take_2d +# ---------------------------------------------------------------------- + + +def _take_1d_no_python( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + n = indexer.shape[0] + + func = _take_1d_parallel if n > 10_000 else _take_1d_serial + + 
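+    # Dispatch note: the 10_000-element cutoff between the serial and the
+    # parallel kernel is a heuristic threshold; both callables are compiled
+    # from the same _take_1d body defined below.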
func(values, indexer, out, fill_value, n) + + +def _take_1d_object( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + n = indexer.shape[0] + + _take_1d_serial_object(values, indexer, out, fill_value, n) + + +def _take_1d( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value, n: int +) -> None: + for i in numba.prange(n): + idx = indexer[i] + if idx == -1: + out[i] = fill_value + else: + out[i] = values[idx] + + +_take_1d_parallel = numba.njit(parallel=True)(_take_1d) +_take_1d_serial = numba.njit(_take_1d) +_take_1d_serial_object = numba.jit(forceobj=True)(_take_1d) + + +# @numba.njit(void(int8[:], intp[:], int8[:], int8)) +# @numba.njit +def take_1d_int8_int8( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + _take_1d_no_python(values, indexer, out, fill_value) + + +# @numba.njit(void(int8[:], intp[:], int32[:], int32)) +# @numba.njit +def take_1d_int8_int32( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + _take_1d_no_python(values, indexer, out, fill_value) + + +# @numba.njit(void(int8[:], intp[:], int64[:], int64)) +# @numba.njit +def take_1d_int8_int64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + _take_1d_no_python(values, indexer, out, fill_value) + + +# @numba.njit(void(int8[:], intp[:], float64[:], float64)) +# @numba.njit +def take_1d_int8_float64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + _take_1d_no_python(values, indexer, out, fill_value) + + +# @numba.njit( +# void(types.Array(types.int16, 1, "C", readonly=True), intp[:], int16[:], int16) +# ) +# @numba.njit +def take_1d_int16_int16( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + _take_1d_no_python(values, indexer, out, fill_value) + + +# @numba.njit(void(int16[:], intp[:], int32[:], int32)) +# @numba.njit +def take_1d_int16_int32( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + _take_1d_no_python(values, indexer, out, fill_value) + + +# @numba.njit(void(int16[:], intp[:], int64[:], int64)) +# @numba.njit +def take_1d_int16_int64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + _take_1d_no_python(values, indexer, out, fill_value) + + +# @numba.njit(void(int16[:], intp[:], float64[:], float64)) +# @numba.njit +def take_1d_int16_float64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + _take_1d_no_python(values, indexer, out, fill_value) + + +# @numba.njit(void(int32[:], intp[:], int32[:], int32)) +# @numba.njit +def take_1d_int32_int32( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + _take_1d_no_python(values, indexer, out, fill_value) + + +# @numba.njit(void(int32[:], intp[:], int64[:], int64)) +# @numba.njit +def take_1d_int32_int64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + _take_1d_no_python(values, indexer, out, fill_value) + + +# @numba.njit(void(int32[:], intp[:], float64[:], float64)) +# @numba.njit +def take_1d_int32_float64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + _take_1d_no_python(values, indexer, out, fill_value) + + +# @numba.njit(void(int64[:], intp[:], int64[:], int64)) +# @numba.njit +def take_1d_int64_int64( + values: np.ndarray, indexer: np.ndarray, out: 
np.ndarray, fill_value=np.nan +) -> None: + _take_1d_no_python(values, indexer, out, fill_value) + + +# @numba.njit(void(int64[:], intp[:], float64[:], float64)) +# @numba.njit +def take_1d_int64_float64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + _take_1d_no_python(values, indexer, out, fill_value) + + +# @numba.njit(void(float32[:], intp[:], float32[:], float32)) +# @numba.njit +def take_1d_float32_float32( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + _take_1d_no_python(values, indexer, out, fill_value) + + +# @numba.njit(void(float32[:], intp[:], float64[:], float64)) +# @numba.njit +def take_1d_float32_float64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + _take_1d_no_python(values, indexer, out, fill_value) + + +# @numba.njit( +# [ +# void( +# types.Array(types.int64, 1, "C", readonly=True), +# intp[:], +# float64[:], +# float64, +# ), +# void( +# float64[:], +# intp[:], +# float64[:], +# float64, +# ), +# ] +# ) +# @numba.njit +def take_1d_float64_float64( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + _take_1d_no_python(values, indexer, out, fill_value) + + +# @numba.jit(forceobj=True) +def take_1d_object_object( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + _take_1d_object(values, indexer, out, fill_value) + + +# @numba.njit(void(uint8[:], intp[:], uint8[:], uint8)) +# @numba.njit +def take_1d_bool_bool( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + _take_1d_no_python(values, indexer, out, fill_value) + + +@numba.jit(forceobj=True) +def take_1d_bool_object( + values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan +) -> None: + n = indexer.shape[0] + + for i in range(n): + idx = indexer[i] + if idx == -1: + out[i] = fill_value + else: + out[i] = True if values[idx] > 0 else False + + +# # generated from template +# include "algos_take_helper.pxi" diff --git a/pandas/_libs_numba/missing.py b/pandas/_libs_numba/missing.py new file mode 100644 index 0000000000000..7cc3273222a3e --- /dev/null +++ b/pandas/_libs_numba/missing.py @@ -0,0 +1,553 @@ +# import numbers + +# import cython +# from cython import Py_ssize_t +# import numpy as np + +# cimport numpy as cnp +# from numpy cimport float64_t, int64_t, ndarray, uint8_t + +# cnp.import_array() + +# from pandas._libs cimport util +# from pandas._libs.tslibs.nattype cimport ( +# c_NaT as NaT, +# checknull_with_nat, +# is_null_datetimelike, +# ) +# from pandas._libs.tslibs.np_datetime cimport get_datetime64_value, get_timedelta64_value # noqa + +from decimal import Decimal + +# from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op +# from pandas.compat import IS64 +import numba + +from pandas._libs.missing import NA +from pandas._libs.tslibs import is_null_datetimelike + +# cdef: +# float64_t INF = np.inf +# float64_t NEGINF = -INF + +# int64_t NPY_NAT = util.get_nat() + +# bint is_32bit = not IS64 + + +# cpdef bint is_matching_na(object left, object right, bint nan_matches_none=False): +# """ +# Check if two scalars are both NA of matching types. + +# Parameters +# ---------- +# left : Any +# right : Any +# nan_matches_none : bool, default False +# For backwards compatibility, consider NaN as matching None. 
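+
+#     For example (illustrative): ``is_matching_na(np.nan, np.nan)`` is True,
+#     while ``is_matching_na(np.nan, None)`` is False unless
+#     ``nan_matches_none=True``.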
+
+#     Returns
+#     -------
+#     bool
+#     """
+#     if left is None:
+#         if nan_matches_none and util.is_nan(right):
+#             return True
+#         return right is None
+#     elif left is C_NA:
+#         return right is C_NA
+#     elif left is NaT:
+#         return right is NaT
+#     elif util.is_float_object(left):
+#         if nan_matches_none and right is None:
+#             return True
+#         return (
+#             util.is_nan(left)
+#             and util.is_float_object(right)
+#             and util.is_nan(right)
+#         )
+#     elif util.is_complex_object(left):
+#         return (
+#             util.is_nan(left)
+#             and util.is_complex_object(right)
+#             and util.is_nan(right)
+#         )
+#     elif util.is_datetime64_object(left):
+#         return (
+#             get_datetime64_value(left) == NPY_NAT
+#             and util.is_datetime64_object(right)
+#             and get_datetime64_value(right) == NPY_NAT
+#         )
+#     elif util.is_timedelta64_object(left):
+#         return (
+#             get_timedelta64_value(left) == NPY_NAT
+#             and util.is_timedelta64_object(right)
+#             and get_timedelta64_value(right) == NPY_NAT
+#         )
+#     return False
+
+
+@numba.jit(forceobj=True)
+def checknull(val: object) -> bool:
+    """
+    Return a boolean describing whether the input is NA-like, defined here
+    as any of:
+     - None
+     - nan
+     - NaT
+     - np.datetime64 representation of NaT
+     - np.timedelta64 representation of NaT
+     - NA
+
+    Parameters
+    ----------
+    val : object
+
+    Returns
+    -------
+    bool
+
+    Notes
+    -----
+    The difference between `checknull` and `checknull_old` is that `checknull`
+    does *not* consider INF or NEGINF to be NA.
+    """
+    return (
+        val is NA or is_null_datetimelike(val, inat_is_null=False) or is_decimal_na(val)
+    )
+
+
+def is_decimal_na(val: object) -> bool:
+    """
+    Check whether `val` is a decimal.Decimal NaN, e.g. Decimal("NaN").
+    """
+    return isinstance(val, Decimal) and val != val
+
+
+# cpdef bint checknull_old(object val):
+#     """
+#     Return a boolean describing whether the input is NA-like, defined
+#     here as any of:
+#      - None
+#      - nan
+#      - INF
+#      - NEGINF
+#      - NaT
+#      - np.datetime64 representation of NaT
+#      - np.timedelta64 representation of NaT
+
+#     Parameters
+#     ----------
+#     val : object
+
+#     Returns
+#     -------
+#     result : bool
+
+#     Notes
+#     -----
+#     The difference between `checknull` and `checknull_old` is that `checknull`
+#     does *not* consider INF or NEGINF to be NA.
+#     """
+#     if checknull(val):
+#         return True
+#     elif util.is_float_object(val) or util.is_complex_object(val):
+#         return val == INF or val == NEGINF
+#     return False
+
+
+# @cython.wraparound(False)
+# @cython.boundscheck(False)
+# cpdef ndarray[uint8_t] isnaobj(ndarray arr):
+#     """
+#     Return boolean mask denoting which elements of a 1-D array are na-like,
+#     according to the criteria defined in `checknull`:
+#      - None
+#      - nan
+#      - NaT
+#      - np.datetime64 representation of NaT
+#      - np.timedelta64 representation of NaT
+
+#     Parameters
+#     ----------
+#     arr : ndarray
+
+#     Returns
+#     -------
+#     result : ndarray (dtype=np.bool_)
+#     """
+#     cdef:
+#         Py_ssize_t i, n
+#         object val
+#         ndarray[uint8_t] result
+
+#     assert arr.ndim == 1, "'arr' must be 1-D."
+ +# n = len(arr) +# result = np.empty(n, dtype=np.uint8) +# for i in range(n): +# val = arr[i] +# result[i] = checknull(val) +# return result.view(np.bool_) + + +# @cython.wraparound(False) +# @cython.boundscheck(False) +# def isnaobj_old(arr: ndarray) -> ndarray: +# """ +# Return boolean mask denoting which elements of a 1-D array are na-like, +# defined as being any of: +# - None +# - nan +# - INF +# - NEGINF +# - NaT +# - NA + +# Parameters +# ---------- +# arr : ndarray + +# Returns +# ------- +# result : ndarray (dtype=np.bool_) +# """ +# cdef: +# Py_ssize_t i, n +# object val +# ndarray[uint8_t] result + +# assert arr.ndim == 1, "'arr' must be 1-D." + +# n = len(arr) +# result = np.zeros(n, dtype=np.uint8) +# for i in range(n): +# val = arr[i] +# result[i] = ( +# checknull(val) +# or util.is_float_object(val) and (val == INF or val == NEGINF) +# ) +# return result.view(np.bool_) + + +# @cython.wraparound(False) +# @cython.boundscheck(False) +# def isnaobj2d(arr: ndarray) -> ndarray: +# """ +# Return boolean mask denoting which elements of a 2-D array are na-like, +# according to the criteria defined in `checknull`: +# - None +# - nan +# - NaT +# - np.datetime64 representation of NaT +# - np.timedelta64 representation of NaT + +# Parameters +# ---------- +# arr : ndarray + +# Returns +# ------- +# result : ndarray (dtype=np.bool_) + +# Notes +# ----- +# The difference between `isnaobj2d` and `isnaobj2d_old` is that `isnaobj2d` +# does *not* consider INF or NEGINF to be NA. +# """ +# cdef: +# Py_ssize_t i, j, n, m +# object val +# ndarray[uint8_t, ndim=2] result + +# assert arr.ndim == 2, "'arr' must be 2-D." + +# n, m = (arr).shape +# result = np.zeros((n, m), dtype=np.uint8) +# for i in range(n): +# for j in range(m): +# val = arr[i, j] +# if checknull(val): +# result[i, j] = 1 +# return result.view(np.bool_) + + +# @cython.wraparound(False) +# @cython.boundscheck(False) +# def isnaobj2d_old(arr: ndarray) -> ndarray: +# """ +# Return boolean mask denoting which elements of a 2-D array are na-like, +# according to the criteria defined in `checknull_old`: +# - None +# - nan +# - INF +# - NEGINF +# - NaT +# - np.datetime64 representation of NaT +# - np.timedelta64 representation of NaT + +# Parameters +# ---------- +# arr : ndarray + +# Returns +# ------- +# ndarray (dtype=np.bool_) + +# Notes +# ----- +# The difference between `isnaobj2d` and `isnaobj2d_old` is that `isnaobj2d` +# does *not* consider INF or NEGINF to be NA. +# """ +# cdef: +# Py_ssize_t i, j, n, m +# object val +# ndarray[uint8_t, ndim=2] result + +# assert arr.ndim == 2, "'arr' must be 2-D." 
+ +# n, m = (arr).shape +# result = np.zeros((n, m), dtype=np.uint8) +# for i in range(n): +# for j in range(m): +# val = arr[i, j] +# if checknull_old(val): +# result[i, j] = 1 +# return result.view(np.bool_) + + +# def isposinf_scalar(val: object) -> bool: +# return util.is_float_object(val) and val == INF + + +# def isneginf_scalar(val: object) -> bool: +# return util.is_float_object(val) and val == NEGINF + + +# cdef inline bint is_null_datetime64(v): +# # determine if we have a null for a datetime (or integer versions), +# # excluding np.timedelta64('nat') +# if checknull_with_nat(v): +# return True +# elif util.is_datetime64_object(v): +# return get_datetime64_value(v) == NPY_NAT +# return False + + +# cdef inline bint is_null_timedelta64(v): +# # determine if we have a null for a timedelta (or integer versions), +# # excluding np.datetime64('nat') +# if checknull_with_nat(v): +# return True +# elif util.is_timedelta64_object(v): +# return get_timedelta64_value(v) == NPY_NAT +# return False + + +# cdef bint checknull_with_nat_and_na(object obj): +# # See GH#32214 +# return checknull_with_nat(obj) or obj is C_NA + + +# # ----------------------------------------------------------------------------- +# # Implementation of NA singleton + + +# def _create_binary_propagating_op(name, is_divmod=False): + +# def method(self, other): +# if (other is C_NA or isinstance(other, str) +# or isinstance(other, (numbers.Number, np.bool_)) +# or isinstance(other, np.ndarray) and not other.shape): +# # Need the other.shape clause to handle NumPy scalars, +# # since we do a setitem on `out` below, which +# # won't work for NumPy scalars. +# if is_divmod: +# return NA, NA +# else: +# return NA + +# elif isinstance(other, np.ndarray): +# out = np.empty(other.shape, dtype=object) +# out[:] = NA + +# if is_divmod: +# return out, out.copy() +# else: +# return out + +# return NotImplemented + +# method.__name__ = name +# return method + + +# def _create_unary_propagating_op(name): +# def method(self): +# return NA + +# method.__name__ = name +# return method + + +# cdef class C_NAType: +# pass + + +# class NAType(C_NAType): +# """ +# NA ("not available") missing value indicator. + +# .. warning:: + +# Experimental: the behaviour of NA can still change without warning. + +# .. versionadded:: 1.0.0 + +# The NA singleton is a missing value indicator defined by pandas. It is +# used in certain new extension dtypes (currently the "string" dtype). 
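+
+#     A few illustrative interactions: the singleton propagates through
+#     arithmetic and follows Kleene logic for boolean operations.
+
+#     >>> pd.NA + 1
+#     <NA>
+#     >>> True | pd.NA
+#     True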
+# """ + +# _instance = None + +# def __new__(cls, *args, **kwargs): +# if NAType._instance is None: +# NAType._instance = C_NAType.__new__(cls, *args, **kwargs) +# return NAType._instance + +# def __repr__(self) -> str: +# return "" + +# def __format__(self, format_spec) -> str: +# try: +# return self.__repr__().__format__(format_spec) +# except ValueError: +# return self.__repr__() + +# def __bool__(self): +# raise TypeError("boolean value of NA is ambiguous") + +# def __hash__(self): +# # GH 30013: Ensure hash is large enough to avoid hash collisions with integers +# exponent = 31 if is_32bit else 61 +# return 2 ** exponent - 1 + +# def __reduce__(self): +# return "NA" + +# # Binary arithmetic and comparison ops -> propagate + +# __add__ = _create_binary_propagating_op("__add__") +# __radd__ = _create_binary_propagating_op("__radd__") +# __sub__ = _create_binary_propagating_op("__sub__") +# __rsub__ = _create_binary_propagating_op("__rsub__") +# __mul__ = _create_binary_propagating_op("__mul__") +# __rmul__ = _create_binary_propagating_op("__rmul__") +# __matmul__ = _create_binary_propagating_op("__matmul__") +# __rmatmul__ = _create_binary_propagating_op("__rmatmul__") +# __truediv__ = _create_binary_propagating_op("__truediv__") +# __rtruediv__ = _create_binary_propagating_op("__rtruediv__") +# __floordiv__ = _create_binary_propagating_op("__floordiv__") +# __rfloordiv__ = _create_binary_propagating_op("__rfloordiv__") +# __mod__ = _create_binary_propagating_op("__mod__") +# __rmod__ = _create_binary_propagating_op("__rmod__") +# __divmod__ = _create_binary_propagating_op("__divmod__", is_divmod=True) +# __rdivmod__ = _create_binary_propagating_op("__rdivmod__", is_divmod=True) +# # __lshift__ and __rshift__ are not implemented + +# __eq__ = _create_binary_propagating_op("__eq__") +# __ne__ = _create_binary_propagating_op("__ne__") +# __le__ = _create_binary_propagating_op("__le__") +# __lt__ = _create_binary_propagating_op("__lt__") +# __gt__ = _create_binary_propagating_op("__gt__") +# __ge__ = _create_binary_propagating_op("__ge__") + +# # Unary ops + +# __neg__ = _create_unary_propagating_op("__neg__") +# __pos__ = _create_unary_propagating_op("__pos__") +# __abs__ = _create_unary_propagating_op("__abs__") +# __invert__ = _create_unary_propagating_op("__invert__") + +# # pow has special +# def __pow__(self, other): +# if other is C_NA: +# return NA +# elif isinstance(other, (numbers.Number, np.bool_)): +# if other == 0: +# # returning positive is correct for +/- 0. 
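+#                 # (Anything raised to the zeroth power is 1, so NA ** 0
+#                 # need not propagate.)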
+# return type(other)(1) +# else: +# return NA +# elif isinstance(other, np.ndarray): +# return np.where(other == 0, other.dtype.type(1), NA) + +# return NotImplemented + +# def __rpow__(self, other): +# if other is C_NA: +# return NA +# elif isinstance(other, (numbers.Number, np.bool_)): +# if other == 1: +# return other +# else: +# return NA +# elif isinstance(other, np.ndarray): +# return np.where(other == 1, other, NA) +# return NotImplemented + +# # Logical ops using Kleene logic + +# def __and__(self, other): +# if other is False: +# return False +# elif other is True or other is C_NA: +# return NA +# return NotImplemented + +# __rand__ = __and__ + +# def __or__(self, other): +# if other is True: +# return True +# elif other is False or other is C_NA: +# return NA +# return NotImplemented + +# __ror__ = __or__ + +# def __xor__(self, other): +# if other is False or other is True or other is C_NA: +# return NA +# return NotImplemented + +# __rxor__ = __xor__ + +# __array_priority__ = 1000 +# _HANDLED_TYPES = (np.ndarray, numbers.Number, str, np.bool_) + +# def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): +# types = self._HANDLED_TYPES + (NAType,) +# for x in inputs: +# if not isinstance(x, types): +# return NotImplemented + +# if method != "__call__": +# raise ValueError(f"ufunc method '{method}' not supported for NA") +# result = maybe_dispatch_ufunc_to_dunder_op( +# self, ufunc, method, *inputs, **kwargs +# ) +# if result is NotImplemented: +# # For a NumPy ufunc that's not a binop, like np.logaddexp +# index = [i for i, x in enumerate(inputs) if x is NA][0] +# result = np.broadcast_arrays(*inputs)[index] +# if result.ndim == 0: +# result = result.item() +# if ufunc.nout > 1: +# result = (NA,) * ufunc.nout + +# return result + + +# C_NA = NAType() # C-visible +# NA = C_NA # Python-visible diff --git a/pandas/_libs_numba/tslibs/__init__.py b/pandas/_libs_numba/tslibs/__init__.py new file mode 100644 index 0000000000000..6b9f2d32acd5c --- /dev/null +++ b/pandas/_libs_numba/tslibs/__init__.py @@ -0,0 +1,67 @@ +# __all__ = [ +# "dtypes", +# "localize_pydatetime", +# "NaT", +# "NaTType", +# "iNaT", +# "nat_strings", +# "is_null_datetimelike", +# "OutOfBoundsDatetime", +# "OutOfBoundsTimedelta", +# "IncompatibleFrequency", +# "Period", +# "Resolution", +# "Timedelta", +# "normalize_i8_timestamps", +# "is_date_array_normalized", +# "dt64arr_to_periodarr", +# "delta_to_nanoseconds", +# "ints_to_pydatetime", +# "ints_to_pytimedelta", +# "get_resolution", +# "Timestamp", +# "tz_convert_from_utc_single", +# "to_offset", +# "Tick", +# "BaseOffset", +# "tz_compare", +# ] + +# from pandas._libs.tslibs import dtypes +# from pandas._libs.tslibs.conversion import ( +# OutOfBoundsTimedelta, +# localize_pydatetime, +# ) +# from pandas._libs.tslibs.dtypes import Resolution +# from pandas._libs.tslibs.nattype import ( +# NaT, +# NaTType, +# iNaT, +# is_null_datetimelike, +# nat_strings, +# ) +# from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime +# from pandas._libs.tslibs.offsets import ( +# BaseOffset, +# Tick, +# to_offset, +# ) +# from pandas._libs.tslibs.period import ( +# IncompatibleFrequency, +# Period, +# ) +# from pandas._libs.tslibs.timedeltas import ( +# Timedelta, +# delta_to_nanoseconds, +# ints_to_pytimedelta, +# ) +# from pandas._libs.tslibs.timestamps import Timestamp +# from pandas._libs.tslibs.timezones import tz_compare +# from pandas._libs.tslibs.tzconversion import tz_convert_from_utc_single +# from pandas._libs.tslibs.vectorized import ( +# 
dt64arr_to_periodarr,
+# get_resolution,
+# ints_to_pydatetime,
+# is_date_array_normalized,
+# normalize_i8_timestamps,
+# )
diff --git a/pandas/_libs_numba/tslibs/util.py b/pandas/_libs_numba/tslibs/util.py
new file mode 100644
index 0000000000000..ecad25c3e0d81
--- /dev/null
+++ b/pandas/_libs_numba/tslibs/util.py
@@ -0,0 +1,223 @@
+import numpy as np
+
+# from cpython.object cimport PyTypeObject
+
+
+# cdef extern from *:
+# """
+# PyObject* char_to_string(const char* data) {
+# return PyUnicode_FromString(data);
+# }
+# """
+# object char_to_string(const char* data)
+
+
+# cdef extern from "Python.h":
+# # Note: importing extern-style allows us to declare these as nogil
+# # functions, whereas `from cpython cimport` does not.
+# bint PyUnicode_Check(object obj) nogil
+# bint PyBool_Check(object obj) nogil
+# bint PyFloat_Check(object obj) nogil
+# bint PyComplex_Check(object obj) nogil
+# bint PyObject_TypeCheck(object obj, PyTypeObject* type) nogil
+
+# # Note that the following functions can potentially raise an exception; thus
+# # they cannot be declared 'nogil'. Also, PyUnicode_AsUTF8AndSize() can
+# # potentially allocate memory in the unlikely case that the underlying
+# # unicode object was stored as non-utf8 and utf8 wasn't requested before.
+# const char* PyUnicode_AsUTF8AndSize(object obj,
+# Py_ssize_t* length) except NULL
+
+# from numpy cimport (
+# float64_t,
+# int64_t,
+# )
+
+
+# cdef extern from "numpy/arrayobject.h":
+# PyTypeObject PyFloatingArrType_Type
+
+# cdef extern from "numpy/ndarrayobject.h":
+# PyTypeObject PyTimedeltaArrType_Type
+# PyTypeObject PyDatetimeArrType_Type
+# PyTypeObject PyComplexFloatingArrType_Type
+# PyTypeObject PyBoolArrType_Type
+
+# bint PyArray_IsIntegerScalar(obj) nogil
+# bint PyArray_Check(obj) nogil
+
+# cdef extern from "numpy/npy_common.h":
+# int64_t NPY_MIN_INT64
+
+
+# cdef inline int64_t get_nat():
+# return NPY_MIN_INT64
+
+
+# --------------------------------------------------------------------
+# Type Checking
+
+
+def is_integer_object(val: object) -> bool:
+    """
+    Cython equivalent of
+
+    `isinstance(val, (int, long, np.integer)) and not isinstance(val, bool)`
+
+    Parameters
+    ----------
+    val : object
+
+    Returns
+    -------
+    bool
+    """
+    return (
+        not isinstance(val, bool)
+        and isinstance(val, (int, np.integer))
+        and not is_timedelta64_object(val)
+    )
+
+
+# cdef inline bint is_float_object(object obj) nogil:
+# """
+# Cython equivalent of `isinstance(val, (float, np.float_))`
+
+# Parameters
+# ----------
+# val : object
+
+# Returns
+# -------
+# is_float : bool
+# """
+# return (PyFloat_Check(obj) or
+# (PyObject_TypeCheck(obj, &PyFloatingArrType_Type)))
+
+
+# cdef inline bint is_complex_object(object obj) nogil:
+# """
+# Cython equivalent of `isinstance(val, (complex, np.complex_))`
+
+# Parameters
+# ----------
+# val : object
+
+# Returns
+# -------
+# is_complex : bool
+# """
+# return (PyComplex_Check(obj) or
+# PyObject_TypeCheck(obj, &PyComplexFloatingArrType_Type))
+
+
+# cdef inline bint is_bool_object(object obj) nogil:
+# """
+# Cython equivalent of `isinstance(val, (bool, np.bool_))`
+
+# Parameters
+# ----------
+# val : object
+
+# Returns
+# -------
+# is_bool : bool
+# """
+# return (PyBool_Check(obj) or
+# PyObject_TypeCheck(obj, &PyBoolArrType_Type))
+
+
+# cdef inline bint is_real_number_object(object obj) nogil:
+# return is_bool_object(obj) or is_integer_object(obj) or is_float_object(obj)
+
+
+def is_timedelta64_object(val: object) -> bool:
+    """
+    Cython equivalent of `isinstance(val, np.timedelta64)`
+
+    Parameters
+    ----------
+    val : object
+
+    Returns
+    -------
+    bool
+    """
+    return isinstance(val, np.timedelta64)
+
+
+# cdef inline bint is_datetime64_object(object obj) nogil:
+# """
+# Cython equivalent of `isinstance(val, np.datetime64)`
+
+# Parameters
+# ----------
+# val : object
+
+# Returns
+# -------
+# is_datetime64 : bool
+# """
+# return PyObject_TypeCheck(obj, &PyDatetimeArrType_Type)
+
+
+# cdef inline bint is_array(object val):
+# """
+# Cython equivalent of `isinstance(val, np.ndarray)`
+
+# Parameters
+# ----------
+# val : object
+
+# Returns
+# -------
+# is_ndarray : bool
+# """
+# return PyArray_Check(val)
+
+
+# cdef inline bint is_nan(object val):
+# """
+# Check if val is a Not-A-Number float or complex, including
+# float('NaN') and np.nan.
+
+# Parameters
+# ----------
+# val : object
+
+# Returns
+# -------
+# is_nan : bool
+# """
+# cdef float64_t fval
+# if is_float_object(val):
+# fval = val
+# return fval != fval
+# return is_complex_object(val) and val != val
+
+
+# cdef inline const char* get_c_string_buf_and_size(str py_string,
+# Py_ssize_t *length) except NULL:
+# """
+# Extract the internal char* buffer of unicode or bytes object `py_string`,
+# saving the length of this internal buffer in `length`.
+
+# Notes
+# -----
+# The Python object owns the memory; the returned char* must not be freed.
+# `length` can be NULL if the buffer length is not needed.
+
+# Parameters
+# ----------
+# py_string : str
+# length : Py_ssize_t*
+
+# Returns
+# -------
+# buf : const char*
+# """
+# return PyUnicode_AsUTF8AndSize(py_string, length)
+
+
+# cdef inline const char* get_c_string(str py_string) except NULL:
+# return get_c_string_buf_and_size(py_string, NULL)
diff --git a/pandas/_libs_numba/util.py b/pandas/_libs_numba/util.py
new file mode 100644
index 0000000000000..56239126279ff
--- /dev/null
+++ b/pandas/_libs_numba/util.py
@@ -0,0 +1,50 @@
+# cimport numpy as cnp
+# from numpy cimport ndarray
+
+from pandas._libs_numba.tslibs.util import * # noqa
+
+# cdef extern from "numpy/ndarraytypes.h":
+# void PyArray_CLEARFLAGS(ndarray arr, int flags) nogil
+
+
+# cdef extern from "numpy/arrayobject.h":
+# enum:
+# NPY_ARRAY_C_CONTIGUOUS
+# NPY_ARRAY_F_CONTIGUOUS
+
+
+# cdef extern from "src/headers/stdint.h":
+# enum: UINT8_MAX
+# enum: UINT16_MAX
+# enum: UINT32_MAX
+# enum: UINT64_MAX
+# enum: INT8_MIN
+# enum: INT8_MAX
+# enum: INT16_MIN
+# enum: INT16_MAX
+# enum: INT32_MAX
+# enum: INT32_MIN
+# enum: INT64_MAX
+# enum: INT64_MIN
+
+
+# ctypedef fused numeric:
+# cnp.int8_t
+# cnp.int16_t
+# cnp.int32_t
+# cnp.int64_t
+
+# cnp.uint8_t
+# cnp.uint16_t
+# cnp.uint32_t
+# cnp.uint64_t
+
+# cnp.float32_t
+# cnp.float64_t
+
+
+# cdef inline void set_array_not_contiguous(ndarray ao) nogil:
+# # Numpy>=1.8-compliant equivalent to:
+# # ao->flags &= ~(NPY_ARRAY_C_CONTIGUOUS | NPY_ARRAY_F_CONTIGUOUS);
+# PyArray_CLEARFLAGS(ao,
+# (NPY_ARRAY_C_CONTIGUOUS | NPY_ARRAY_F_CONTIGUOUS))
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 15f54c11be0a0..4d8631283e0ab 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -8,6 +8,7 @@
 from textwrap import dedent
 from typing import (
     TYPE_CHECKING,
+    Any,
     Dict,
     Optional,
     Tuple,
@@ -28,6 +29,7 @@
     iNaT,
     lib,
 )
+from pandas._libs_numba import algos as algos_numba
 from pandas._typing import (
     AnyArrayLike,
     ArrayLike,
@@ -1305,7 +1307,7 @@ def compute(self, method: str) -> Series:
         narr = len(arr)
         n = min(n, narr)

-        kth_val = algos.kth_smallest(arr.copy(), n - 1)
+        kth_val =
algos_numba.kth_smallest(arr.copy(), n - 1) (ns,) = np.nonzero(arr <= kth_val) inds = ns[arr[ns].argsort(kind="mergesort")] @@ -1634,7 +1636,7 @@ def diff(arr, n: int, axis: int = 0, stacklevel=3): """ n = int(n) - na = np.nan + na: Any = np.nan dtype = arr.dtype is_bool = is_bool_dtype(dtype) @@ -1666,9 +1668,9 @@ def diff(arr, n: int, axis: int = 0, stacklevel=3): is_timedelta = False if needs_i8_conversion(arr.dtype): - dtype = np.int64 - arr = arr.view("i8") - na = iNaT + dtype = np.dtype("timedelta64[ns]") + arr = getattr(arr, "_data", arr) + na = None is_timedelta = True elif is_bool: @@ -1698,10 +1700,11 @@ def diff(arr, n: int, axis: int = 0, stacklevel=3): na_indexer[axis] = slice(None, n) if n >= 0 else slice(n, None) out_arr[tuple(na_indexer)] = na - if arr.dtype.name in _diff_special: + if arr.dtype.name in _diff_special or is_timedelta: + assert isinstance(arr, np.ndarray), type(arr) # TODO: can diff_2d dtype specialization troubles be fixed by defining # out_arr inside diff_2d? - algos.diff_2d(arr, out_arr, n, axis, datetimelike=is_timedelta) + algos_numba.diff_2d(arr, out_arr, n, axis) else: # To keep mypy happy, _res_indexer is a list while res_indexer is # a tuple, ditto for lag_indexer. @@ -1715,9 +1718,6 @@ def diff(arr, n: int, axis: int = 0, stacklevel=3): out_arr[res_indexer] = op(arr[res_indexer], arr[lag_indexer]) - if is_timedelta: - out_arr = out_arr.view("timedelta64[ns]") - if orig_ndim == 1: out_arr = out_arr[:, 0] return out_arr diff --git a/pandas/core/array_algos/take.py b/pandas/core/array_algos/take.py index ba1b2a0f0e76e..efbb628479f83 100644 --- a/pandas/core/array_algos/take.py +++ b/pandas/core/array_algos/take.py @@ -13,6 +13,7 @@ algos as libalgos, lib, ) +from pandas._libs_numba import algos as libalgos_numba from pandas._typing import ArrayLike from pandas.core.dtypes.cast import maybe_promote @@ -354,27 +355,31 @@ def wrapper( _take_1d_dict = { - ("int8", "int8"): libalgos.take_1d_int8_int8, - ("int8", "int32"): libalgos.take_1d_int8_int32, - ("int8", "int64"): libalgos.take_1d_int8_int64, - ("int8", "float64"): libalgos.take_1d_int8_float64, - ("int16", "int16"): libalgos.take_1d_int16_int16, - ("int16", "int32"): libalgos.take_1d_int16_int32, - ("int16", "int64"): libalgos.take_1d_int16_int64, - ("int16", "float64"): libalgos.take_1d_int16_float64, - ("int32", "int32"): libalgos.take_1d_int32_int32, - ("int32", "int64"): libalgos.take_1d_int32_int64, - ("int32", "float64"): libalgos.take_1d_int32_float64, - ("int64", "int64"): libalgos.take_1d_int64_int64, - ("int64", "float64"): libalgos.take_1d_int64_float64, - ("float32", "float32"): libalgos.take_1d_float32_float32, - ("float32", "float64"): libalgos.take_1d_float32_float64, - ("float64", "float64"): libalgos.take_1d_float64_float64, - ("object", "object"): libalgos.take_1d_object_object, - ("bool", "bool"): _view_wrapper(libalgos.take_1d_bool_bool, np.uint8, np.uint8), - ("bool", "object"): _view_wrapper(libalgos.take_1d_bool_object, np.uint8, None), + ("int8", "int8"): libalgos_numba.take_1d_int8_int8, + ("int8", "int32"): libalgos_numba.take_1d_int8_int32, + ("int8", "int64"): libalgos_numba.take_1d_int8_int64, + ("int8", "float64"): libalgos_numba.take_1d_int8_float64, + ("int16", "int16"): libalgos_numba.take_1d_int16_int16, + ("int16", "int32"): libalgos_numba.take_1d_int16_int32, + ("int16", "int64"): libalgos_numba.take_1d_int16_int64, + ("int16", "float64"): libalgos_numba.take_1d_int16_float64, + ("int32", "int32"): libalgos_numba.take_1d_int32_int32, + ("int32", "int64"): 
libalgos_numba.take_1d_int32_int64, + ("int32", "float64"): libalgos_numba.take_1d_int32_float64, + ("int64", "int64"): libalgos_numba.take_1d_int64_int64, + ("int64", "float64"): libalgos_numba.take_1d_int64_float64, + ("float32", "float32"): libalgos_numba.take_1d_float32_float32, + ("float32", "float64"): libalgos_numba.take_1d_float32_float64, + ("float64", "float64"): libalgos_numba.take_1d_float64_float64, + ("object", "object"): libalgos_numba.take_1d_object_object, + ("bool", "bool"): _view_wrapper( + libalgos_numba.take_1d_bool_bool, np.uint8, np.uint8 + ), + ("bool", "object"): _view_wrapper( + libalgos_numba.take_1d_bool_object, np.uint8, None + ), ("datetime64[ns]", "datetime64[ns]"): _view_wrapper( - libalgos.take_1d_int64_int64, np.int64, np.int64, np.int64 + libalgos_numba.take_1d_int64_int64, np.int64, np.int64, np.int64 ), } diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 769ae52744c74..5b6b3fb778cfd 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -24,10 +24,10 @@ from pandas._libs import ( NaT, - algos as libalgos, hashtable as htable, ) from pandas._libs.lib import no_default +from pandas._libs_numba import algos as libalgos from pandas._typing import ( ArrayLike, Dtype, @@ -49,6 +49,7 @@ from pandas.core.dtypes.common import ( ensure_int64, ensure_object, + ensure_platform_int, is_categorical_dtype, is_datetime64_dtype, is_dict_like, @@ -533,7 +534,7 @@ def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike: # error: Incompatible types in assignment (expression has type "ndarray", # variable has type "Categorical") result = take_nd( # type: ignore[assignment] - new_cats, libalgos.ensure_platform_int(self._codes) + new_cats, ensure_platform_int(self._codes) ) return result diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 0900688e04374..b79ebfad380d6 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -22,10 +22,7 @@ import numpy as np -from pandas._libs import ( - algos, - lib, -) +from pandas._libs import lib from pandas._libs.tslibs import ( BaseOffset, IncompatibleFrequency, @@ -44,6 +41,7 @@ round_nsint64, ) from pandas._libs.tslibs.timestamps import integer_op_not_supported +from pandas._libs_numba import algos from pandas._typing import ( ArrayLike, DatetimeLikeScalar, @@ -1030,11 +1028,11 @@ def _generate_range( @property def _is_monotonic_increasing(self) -> bool: - return algos.is_monotonic(self.asi8, timelike=True)[0] + return algos.is_monotonic(self._data)[0] @property def _is_monotonic_decreasing(self) -> bool: - return algos.is_monotonic(self.asi8, timelike=True)[1] + return algos.is_monotonic(self._data)[1] @property def _is_unique(self) -> bool: diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 32ea82d9c0402..e2739c4487e06 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -14,9 +14,9 @@ from pandas._libs import ( Interval, Period, - algos, ) from pandas._libs.tslibs import conversion +from pandas._libs_numba import algos from pandas._typing import ( ArrayLike, DtypeObj, diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 8c2cff21c114e..ee4518903ee0d 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -1,6 +1,8 @@ """ missing types & inference """ +from __future__ import annotations + from decimal import Decimal from functools import partial @@ -15,6 +17,7 @@ Period, 
iNaT, ) +import pandas._libs_numba.missing as libmissing_numba from pandas._typing import ( ArrayLike, DtypeObj, @@ -54,7 +57,7 @@ INF_AS_NA = False -def isna(obj): +def isna(obj: object): """ Detect missing values for an array-like object. @@ -137,7 +140,7 @@ def isna(obj): isnull = isna -def _isna(obj, inf_as_na: bool = False): +def _isna(obj: object, inf_as_na: bool = False): """ Detect missing values, treating None, NaN or NA as null. Infinite values will also be treated as null if inf_as_na is True. @@ -157,7 +160,7 @@ def _isna(obj, inf_as_na: bool = False): if inf_as_na: return libmissing.checknull_old(obj) else: - return libmissing.checknull(obj) + return libmissing_numba.checknull(obj) # hack (for now) because MI registers as ndarray elif isinstance(obj, ABCMultiIndex): raise NotImplementedError("isna is not defined for MultiIndex") diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index e8b83af16254a..9ce6a60b17d6b 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -27,7 +27,6 @@ import numpy as np from pandas._libs import ( - algos as libalgos, index as libindex, lib, ) @@ -42,6 +41,7 @@ Timestamp, tz_compare, ) +from pandas._libs_numba import algos as libalgos from pandas._typing import ( AnyArrayLike, ArrayLike, diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 244fcb9f49ec6..d040a6aff6df2 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -23,11 +23,11 @@ from pandas._config import get_option from pandas._libs import ( - algos as libalgos, index as libindex, lib, ) from pandas._libs.hashtable import duplicated_int64 +from pandas._libs_numba import algos as libalgos from pandas._typing import ( AnyArrayLike, DtypeObj, @@ -1583,9 +1583,7 @@ def is_monotonic_increasing(self) -> bool: if all(level.is_monotonic for level in self.levels): # If each level is sorted, we can operate on the codes directly. GH27495 - return libalgos.is_lexsorted( - [x.astype("int64", copy=False) for x in self.codes] - ) + return libalgos.is_lexsorted(self.codes) # reversed() because lexsort() wants the most significant key last. values = [ @@ -3822,9 +3820,8 @@ def isin(self, values, level=None) -> np.ndarray: def _lexsort_depth(codes: List[np.ndarray], nlevels: int) -> int: """Count depth (up to a maximum of `nlevels`) with which codes are lexsorted.""" - int64_codes = [ensure_int64(level_codes) for level_codes in codes] for k in range(nlevels, 0, -1): - if libalgos.is_lexsorted(int64_codes[:k]): + if libalgos.is_lexsorted(codes[:k]): return k return 0 diff --git a/pandas/core/missing.py b/pandas/core/missing.py index c2193056cc974..c58e746ba0fdf 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -3,10 +3,7 @@ """ from __future__ import annotations -from functools import ( - partial, - wraps, -) +from functools import partial from typing import ( TYPE_CHECKING, Any, @@ -14,19 +11,15 @@ Optional, Set, Union, - cast, ) import numpy as np -from pandas._libs import ( - algos, - lib, -) +from pandas._libs import lib +from pandas._libs_numba import algos from pandas._typing import ( ArrayLike, Axis, - F, ) from pandas.compat._optional import import_optional_dependency @@ -252,7 +245,7 @@ def interpolate_1d( ) # default limit is unlimited GH #16282 - limit = algos.validate_limit(nobs=None, limit=limit) + algos.validate_limit(limit=limit) # These are sets of index pointers to invalid values... i.e. {0, 1, etc... 
all_nans = set(np.flatnonzero(invalid))
@@ -679,61 +672,25 @@ def interpolate_2d(
     return result


-def _fillna_prep(values, mask: Optional[np.ndarray] = None) -> np.ndarray:
-    # boilerplate for _pad_1d, _backfill_1d, _pad_2d, _backfill_2d
-
+def _pad_1d(
+    values: np.ndarray,
+    limit: int | None = None,
+    mask: np.ndarray | None = None,
+) -> tuple[np.ndarray, np.ndarray]:
     if mask is None:
         mask = isna(values)
-
-    mask = mask.view(np.uint8)
-    return mask
-
-
-def _datetimelike_compat(func: F) -> F:
-    """
-    Wrapper to handle datetime64 and timedelta64 dtypes.
-    """
-
-    @wraps(func)
-    def new_func(values, limit=None, mask=None):
-        if needs_i8_conversion(values.dtype):
-            if mask is None:
-                # This needs to occur before casting to int64
-                mask = isna(values)
-
-            result, mask = func(values.view("i8"), limit=limit, mask=mask)
-            return result.view(values.dtype), mask
-
-        return func(values, limit=limit, mask=mask)
-
-    return cast(F, new_func)
-
-
-@_datetimelike_compat
-def _pad_1d(
-    values: np.ndarray,
-    limit: int | None = None,
-    mask: np.ndarray | None = None,
-) -> tuple[np.ndarray, np.ndarray]:
-    mask = _fillna_prep(values, mask)
     algos.pad_inplace(values, mask, limit=limit)
     return values, mask


-@_datetimelike_compat
 def _backfill_1d(
     values: np.ndarray,
     limit: int | None = None,
     mask: np.ndarray | None = None,
 ) -> tuple[np.ndarray, np.ndarray]:
-    mask = _fillna_prep(values, mask)
-    algos.backfill_inplace(values, mask, limit=limit)
-    return values, mask
+    # a backfill is a forward fill run on the reversed array
+    _, new_mask = _pad_1d(
+        values[::-1], limit, mask[::-1] if mask is not None else None
+    )
+    # _pad_1d computed its mask on the reversed values, so flip it back
+    return values, (mask if mask is not None else new_mask[::-1])


-@_datetimelike_compat
 def _pad_2d(values, limit=None, mask=None):
-    mask = _fillna_prep(values, mask)
+    if mask is None:
+        mask = isna(values)

     if np.all(values.shape):
         algos.pad_2d_inplace(values, mask, limit=limit)
@@ -743,16 +700,11 @@ def _pad_2d(values, limit=None, mask=None):
     return values, mask


-@_datetimelike_compat
 def _backfill_2d(values, limit=None, mask=None):
-    mask = _fillna_prep(values, mask)
-
-    if np.all(values.shape):
-        algos.backfill_2d_inplace(values, mask, limit=limit)
-    else:
-        # for test coverage
-        pass
-    return values, mask
+    _, new_mask = _pad_2d(
+        values[:, ::-1], limit, mask[:, ::-1] if mask is not None else None
+    )
+    # flip the computed mask back to match the original column order
+    return values, (mask if mask is not None else new_mask[:, ::-1])


 _fill_methods = {"pad": _pad_1d, "backfill": _backfill_1d}
diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py
index 3aa4d26f7dc8f..12dab0041d4d1 100644
--- a/pandas/core/sorting.py
+++ b/pandas/core/sorting.py
@@ -18,11 +18,11 @@
 import numpy as np

 from pandas._libs import (
-    algos,
     hashtable,
     lib,
 )
 from pandas._libs.hashtable import unique_label_indices
+from pandas._libs_numba import algos
 from pandas._typing import IndexKeyFunc

 from pandas.core.dtypes.common import (
diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py
index 11bb554a0dc5a..3f92d66c36ca9 100644
--- a/pandas/tests/api/test_api.py
+++ b/pandas/tests/api/test_api.py
@@ -192,6 +192,7 @@ class TestPDApi(Base):
         "_hashtable",
         "_lib",
         "_libs",
+        "_libs_numba",
         "_np_version_under1p17",
         "_np_version_under1p18",
         "_is_numpy_dev",
diff --git a/pandas/tests/apply/test_frame_transform.py b/pandas/tests/apply/test_frame_transform.py
index 1a12cbff47092..7a626ce6312c5 100644
--- a/pandas/tests/apply/test_frame_transform.py
+++ b/pandas/tests/apply/test_frame_transform.py
@@ -162,8 +162,6 @@ def test_transform_method_name(method):
 frame_kernels_raise = [x for x in frame_transform_kernels if x not in wont_fail]

-# mypy doesn't allow adding lists of different types
-# https://github.com/python/mypy/issues/5492
@pytest.mark.parametrize("op", [*frame_kernels_raise, lambda x: x + 1]) def test_transform_bad_dtype(op, frame_or_series): # GH 35964 diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index c8df18ddaeebe..4778fdfa0cb99 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -9,6 +9,7 @@ algos as libalgos, hashtable as ht, ) +from pandas._libs_numba import algos as libalgos_numba from pandas.compat import np_array_datetime64_compat import pandas.util._test_decorators as td @@ -2112,14 +2113,14 @@ def test_is_lexsorted(): ), ] - assert not libalgos.is_lexsorted(failure) + assert not libalgos_numba.is_lexsorted(failure) def test_groupsort_indexer(): a = np.random.randint(0, 1000, 100).astype(np.int64) b = np.random.randint(0, 1000, 100).astype(np.int64) - result = libalgos.groupsort_indexer(a, 1000)[0] + result = libalgos_numba.groupsort_indexer(a, 1000)[0] # need to use a stable sort # np.argsort returns int, groupsort_indexer @@ -2133,7 +2134,7 @@ def test_groupsort_indexer(): # np.lexsort returns int, groupsort_indexer # always returns intp key = a * 1000 + b - result = libalgos.groupsort_indexer(key, 1000000)[0] + result = libalgos_numba.groupsort_indexer(key, 1000000)[0] expected = np.lexsort((b, a)) expected = expected.astype(np.intp) @@ -2195,7 +2196,7 @@ def test_infinity_against_nan(): def test_ensure_platform_int(): arr = np.arange(100, dtype=np.intp) - result = libalgos.ensure_platform_int(arr) + result = libalgos_numba.ensure_platform_int(arr) assert result is arr diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index c5b875b8f027e..22a4379764137 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -3,7 +3,6 @@ import numpy as np -from pandas._libs.algos import unique_deltas from pandas._libs.tslibs import ( Timestamp, tzconversion, @@ -26,6 +25,7 @@ to_offset, ) from pandas._libs.tslibs.parsing import get_rule_month +from pandas._libs_numba.algos import unique_deltas from pandas.util._decorators import cache_readonly from pandas.core.dtypes.common import ( diff --git a/setup.cfg b/setup.cfg index a0b6a0cdfc260..bcd83f3056f35 100644 --- a/setup.cfg +++ b/setup.cfg @@ -76,6 +76,7 @@ ignore = W504, # line break after binary operator E402, # module level import not at top of file E731, # do not assign a lambda expression, use a def + E741, # ambiguous variable name S001 # found modulo formatter (incorrect picks up mod operations) exclude = doc/sphinxext/*.py, @@ -162,7 +163,7 @@ directory = coverage_html_report # To be kept consistent with "Import Formatting" section in contributing.rst [isort] known_pre_libs = pandas._config -known_pre_core = pandas._libs,pandas._typing,pandas.util._*,pandas.compat,pandas.errors +known_pre_core = pandas._libs,pandas._libs_numba,pandas._typing,pandas.util._*,pandas.compat,pandas.errors known_dtypes = pandas.core.dtypes known_post_core = pandas.tseries,pandas.io,pandas.plotting sections = FUTURE,STDLIB,THIRDPARTY,PRE_LIBS,PRE_CORE,DTYPES,FIRSTPARTY,POST_CORE,LOCALFOLDER
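
As a quick recap of the pattern the new kernels in pandas/_libs_numba/algos.py follow (kth_smallest, diff_2d, the take_1d_* family, pad_inplace, is_lexsorted, groupsort_indexer, unique_deltas, ...), a minimal numba forward-fill kernel in the spirit of the pad_inplace that _pad_1d relies on above might look like the sketch below. This is an illustrative sketch only, not a copy of the implementation in pandas/_libs_numba/algos.py; it assumes a 1-D values array and a boolean mask that flags the missing slots.

import numba
import numpy as np

@numba.njit
def pad_inplace(values, mask, limit=None):
    # Forward fill in place: overwrite each masked slot with the most
    # recent unmasked value, filling at most `limit` consecutive slots.
    # NOTE: illustrative sketch; the PR's real kernel may differ.
    N = len(values)
    if N == 0:
        return
    lim = N if limit is None else limit
    fill_count = 0
    val = values[0]
    for i in range(N):
        if mask[i]:
            if fill_count >= lim:
                continue
            fill_count += 1
            values[i] = val
        else:
            fill_count = 0
            val = values[i]

arr = np.array([1.0, np.nan, np.nan, 4.0, np.nan])
pad_inplace(arr, np.isnan(arr), limit=1)
# arr is now [1.0, 1.0, nan, 4.0, 4.0]: at most one slot per gap is filled

Because the loop body carries only scalars, numba specializes and compiles it per input dtype at call time, which is the role the per-dtype generated Cython kernels (take_1d_int8_int32 and friends) played at build time.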