Implement BandedDot Op #1416

Open
jessegrabowski wants to merge 25 commits into main
Changes from 22 commits

Commits
bbf3141
Naive implementation, do not merge
jessegrabowski May 23, 2025
2282161
Implement suggestions
jessegrabowski May 23, 2025
ae8eff6
Simplify perf test
jessegrabowski May 23, 2025
b2f68a5
float32 compat in tests
jessegrabowski May 23, 2025
e64d4d3
Remove np.pad
jessegrabowski May 23, 2025
c979e9d
set dtype correctly
jessegrabowski May 23, 2025
f1066a9
fix signature, add infer_shape
jessegrabowski May 23, 2025
0302fac
micro-optimizations
jessegrabowski May 23, 2025
f47d88b
Rename b to x, matching BLAS docs
jessegrabowski May 24, 2025
157345c
Add numba dispatch for banded_dot
jessegrabowski May 24, 2025
7d109b9
Eliminate extra copy in numba impl
jessegrabowski May 24, 2025
c18f095
Create `A_banded` as F-contiguous array
jessegrabowski May 24, 2025
607a871
Remove benchmark
jessegrabowski May 24, 2025
f6f12aa
Don't cache numba function
jessegrabowski May 24, 2025
e8fe5e3
all hail mypy
jessegrabowski May 24, 2025
5344c27
set INCX by strides
jessegrabowski May 24, 2025
31e9a29
relax tolerance of float32 test
jessegrabowski May 24, 2025
0505c57
Add suggestions
jessegrabowski May 25, 2025
2b5c51d
Test strides
jessegrabowski May 25, 2025
519c933
Add L_op
jessegrabowski May 25, 2025
5754f93
*remove* type hints to make mypy happy
jessegrabowski May 25, 2025
481814f
Remove order argument from numba A_to_banded
jessegrabowski May 25, 2025
30fece4
Incorporate feedback
jessegrabowski May 25, 2025
4bd259c
Adjust numba test
jessegrabowski May 25, 2025
497721e
Remove more useful type information for mypy
jessegrabowski May 25, 2025
Files changed
2 changes: 1 addition & 1 deletion pytensor/link/numba/dispatch/basic.py
@@ -75,7 +75,7 @@ def numba_njit(*args, fastmath=None, **kwargs):
message=(
"(\x1b\\[1m)*" # ansi escape code for bold text
"Cannot cache compiled function "
'"(numba_funcified_fgraph|store_core_outputs|cholesky|solve|solve_triangular|cho_solve|lu_factor)" '
'"(numba_funcified_fgraph|store_core_outputs|cholesky|solve|solve_triangular|cho_solve|lu_factor|banded_dot)" '
"as it uses dynamic globals"
),
category=NumbaWarning,
64 changes: 64 additions & 0 deletions pytensor/link/numba/dispatch/linalg/_BLAS.py
@@ -0,0 +1,64 @@
import ctypes

from numba.core.extending import get_cython_function_address
from numba.np.linalg import ensure_blas, ensure_lapack, get_blas_kind

from pytensor.link.numba.dispatch.linalg._LAPACK import (
_get_float_pointer_for_dtype,
_ptr_int,
)


def _get_blas_ptr_and_ptr_type(dtype, name):
d = get_blas_kind(dtype)
func_name = f"{d}{name}"
float_pointer = _get_float_pointer_for_dtype(d)
lapack_ptr = get_cython_function_address("scipy.linalg.cython_blas", func_name)

return lapack_ptr, float_pointer


class _BLAS:
"""
Functions to return type signatures for wrapped BLAS functions.

Here we are specifically concerned with BLAS functions exposed by scipy, and not used by numpy.

Patterned after https://github.com/numba/numba/blob/bd7ebcfd4b850208b627a3f75d4706000be36275/numba/np/linalg.py#L74
"""

def __init__(self):
ensure_lapack()
ensure_blas()

@classmethod
def numba_xgbmv(cls, dtype):
"""
xGBMV performs one of the following matrix operations:

y = alpha * A @ x + beta * y, or y = alpha * A.T @ x + beta * y

Where alpha and beta are scalars, x and y are vectors, and A is a band matrix with kl sub-diagonals and ku
super-diagonals.
"""

blas_ptr, float_pointer = _get_blas_ptr_and_ptr_type(dtype, "gbmv")

functype = ctypes.CFUNCTYPE(
None,
_ptr_int, # TRANS
_ptr_int, # M
_ptr_int, # N
_ptr_int, # KL
_ptr_int, # KU
float_pointer, # ALPHA
float_pointer, # A
_ptr_int, # LDA
float_pointer, # X
_ptr_int, # INCX
float_pointer, # BETA
float_pointer, # Y
_ptr_int, # INCY
)

return functype(blas_ptr)
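For reference, a standalone sketch (editorial, not part of the diff) of the operation this wrapper exposes, shown through SciPy's Python-level gbmv; the bidiagonal example and its hand-packed band storage are illustrative only.

import numpy as np
from scipy import linalg

# gbmv computes y = alpha * A @ x + beta * y, with A supplied in band storage.
# Here A is upper-bidiagonal (kl = 0 sub-diagonals, ku = 1 super-diagonal).
n, kl, ku = 3, 0, 1
A = np.array([[1.0, 4.0, 0.0],
              [0.0, 2.0, 5.0],
              [0.0, 0.0, 3.0]])
A_banded = np.array([[0.0, 4.0, 5.0],   # super-diagonal, left-padded
                     [1.0, 2.0, 3.0]])  # main diagonal
x = np.ones(n)
y = np.full(n, 10.0)

gbmv = linalg.get_blas_funcs("gbmv", (A_banded, x))
out = gbmv(m=n, n=n, kl=kl, ku=ku, alpha=2.0, a=A_banded, x=x, beta=0.5, y=y)
np.testing.assert_allclose(out, 2.0 * A @ x + 0.5 * y)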
95 changes: 95 additions & 0 deletions pytensor/link/numba/dispatch/linalg/dot/banded.py
@@ -0,0 +1,95 @@
from collections.abc import Callable
from typing import Any

import numpy as np
from numba import njit as numba_njit
from numba.core.extending import overload
from numba.np.linalg import ensure_blas, ensure_lapack
from scipy import linalg

from pytensor.link.numba.dispatch.linalg._BLAS import _BLAS
from pytensor.link.numba.dispatch.linalg._LAPACK import (
_get_underlying_float,
val_to_int_ptr,
)
from pytensor.link.numba.dispatch.linalg.utils import _check_scipy_linalg_matrix


@numba_njit(inline="always")
def A_to_banded(A: np.ndarray, kl: int, ku: int) -> np.ndarray:
m, n = A.shape

# This matrix is built backwards then transposed to get it into Fortran order
# (order="F" is not allowed in Numba land)
A_banded = np.zeros((n, kl + ku + 1), dtype=A.dtype).T

for i, k in enumerate(range(ku, -kl - 1, -1)):
if k >= 0:
A_banded[i, k:] = np.diag(A, k=k)

else:
A_banded[i, : n + k] = np.diag(A, k=k)

return A_banded


def _dot_banded(A: np.ndarray, x: np.ndarray, kl: int, ku: int) -> Any:
"""
Thin wrapper around gbmv. This code will only be called if njit is disabled globally
(e.g. during testing)
"""
fn = linalg.get_blas_funcs("gbmv", (A, x))
m, n = A.shape
A_banded = A_to_banded(A, kl=kl, ku=ku)

return fn(m=m, n=n, kl=kl, ku=ku, alpha=1, a=A_banded, x=x)


@overload(_dot_banded)
def dot_banded_impl(
A: np.ndarray, x: np.ndarray, kl: int, ku: int
) -> Callable[[np.ndarray, np.ndarray, int, int], np.ndarray]:
ensure_lapack()
ensure_blas()
_check_scipy_linalg_matrix(A, "dot_banded")
dtype = A.dtype
w_type = _get_underlying_float(dtype)
numba_gbmv = _BLAS().numba_xgbmv(dtype)

def impl(A: np.ndarray, x: np.ndarray, kl: int, ku: int) -> np.ndarray:
m, n = A.shape

A_banded = A_to_banded(A, kl=kl, ku=ku)

TRANS = val_to_int_ptr(ord("N"))
M = val_to_int_ptr(m)
N = val_to_int_ptr(n)
LDA = val_to_int_ptr(A_banded.shape[0])

KL = val_to_int_ptr(kl)
KU = val_to_int_ptr(ku)

ALPHA = np.array(1.0, dtype=dtype)
INCX = val_to_int_ptr(x.strides[0] // x.itemsize)
ricardoV94 (Member) · May 24, 2025
Please test non-unit positive and negative strides for x. In the C Gemv, for instance, we need to point to the last memory position when the stride is negative.

ricardoV94 (Member) · May 24, 2025
You can, but need not, also test strides for A and y. Since we're creating them ourselves now, we know they're always correct. But once we split the Op we will need to worry about those as well.

jessegrabowski (Member, Author)
Let me know what you think about the way I'm testing strides now, and I can expand it if it's adequate.

jessegrabowski (Member, Author)
Unresolving this because the negative stride tests are failing.

ricardoV94 (Member) · May 27, 2025
If it's like the C BLAS, when you have negative strides you have to point to the end of the numpy array (x[-1]). BLAS wants to know where the block of memory starts, even if it iterates in reverse, but numpy points to the end of that block when the array has negative strides.

Member
// gemv expects pointers to the beginning of memory arrays,
// but numpy provides a pointer to the first element,
// so when the stride is negative, we need to get the last one.
if (Sx < 0)
    x_data += (Nz0 - 1) * Sx;
if (Sy < 0)
    y_data += (Nz1 - 1) * Sy;
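A standalone illustration (editorial, not from the PR discussion) of the pointer adjustment described above; copying to a contiguous buffer is shown only as a possible fallback, not as the PR's fix.

import numpy as np

base = np.arange(5.0)            # contiguous block; addresses increase left to right
x = base[::-1]                   # negative-stride view; x[0] is base[4]

# numpy's data pointer refers to the view's first element (the highest address here),
# while BLAS wants the lowest address of the block it will walk through in reverse.
start_of_block = x.ctypes.data + (x.shape[0] - 1) * x.strides[0]
assert start_of_block == base.ctypes.data

# Copying is a simpler (if slower) fallback: the copy has a positive unit stride.
x_contig = np.ascontiguousarray(x)
assert x_contig.strides[0] == x_contig.itemsize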

BETA = np.array(0.0, dtype=dtype)
Y = np.empty(m, dtype=dtype)
INCY = val_to_int_ptr(1)

numba_gbmv(

TRANS,
M,
N,
KL,
KU,
ALPHA.view(w_type).ctypes,
A_banded.view(w_type).ctypes,
LDA,
x.view(w_type).ctypes,
INCX,
BETA.view(w_type).ctypes,
Y.view(w_type).ctypes,
INCY,
)

return Y

return impl
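As a standalone reference (editorial, not part of the diff), the band-storage layout built by A_to_banded above, and by BandedDot.perform below, looks like this for a small tridiagonal matrix:

import numpy as np

A = np.array([[1.0, 5.0, 0.0, 0.0],
              [8.0, 2.0, 6.0, 0.0],
              [0.0, 9.0, 3.0, 7.0],
              [0.0, 0.0, 10.0, 4.0]])
kl = ku = 1
n = A.shape[1]

# Same packing loop as in the diff: row i holds diagonal k = ku - i.
A_banded = np.zeros((kl + ku + 1, n))
for i, k in enumerate(range(ku, -kl - 1, -1)):
    if k >= 0:
        A_banded[i, k:] = np.diag(A, k=k)
    else:
        A_banded[i, : n + k] = np.diag(A, k=k)

# A_banded is now:
# [[ 0.  5.  6.  7.]   <- super-diagonal (k = +1), left-padded
#  [ 1.  2.  3.  4.]   <- main diagonal  (k =  0)
#  [ 8.  9. 10.  0.]]  <- sub-diagonal   (k = -1), right-padded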
18 changes: 18 additions & 0 deletions pytensor/link/numba/dispatch/slinalg.py
@@ -11,6 +11,7 @@
_pivot_to_permutation,
)
from pytensor.link.numba.dispatch.linalg.decomposition.lu_factor import _lu_factor
from pytensor.link.numba.dispatch.linalg.dot.banded import _dot_banded
from pytensor.link.numba.dispatch.linalg.solve.cholesky import _cho_solve
from pytensor.link.numba.dispatch.linalg.solve.general import _solve_gen
from pytensor.link.numba.dispatch.linalg.solve.posdef import _solve_psd
@@ -19,6 +20,7 @@
from pytensor.link.numba.dispatch.linalg.solve.tridiagonal import _solve_tridiagonal
from pytensor.tensor.slinalg import (
LU,
BandedDot,
BlockDiagonal,
Cholesky,
CholeskySolve,
@@ -311,3 +313,19 @@
)

return cho_solve


@numba_funcify.register(BandedDot)
def numba_funcify_BandedDot(op, node, **kwargs):
kl = op.lower_diags
ku = op.upper_diags
dtype = node.inputs[0].dtype

if dtype in complex_dtypes:
raise NotImplementedError(_COMPLEX_DTYPE_NOT_SUPPORTED_MSG.format(op=op))

@numba_njit(cache=False)
def banded_dot(A, x):
return _dot_banded(A, x, kl=kl, ku=ku)

return banded_dot
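A minimal usage sketch (editorial, not part of the diff) that exercises the dispatcher registered above; it assumes a PyTensor build with the numba backend installed.

import numpy as np
import pytensor
import pytensor.tensor as pt
from pytensor.tensor.slinalg import banded_dot

A = pt.matrix("A", dtype="float64")
x = pt.vector("x", dtype="float64")
y = banded_dot(A, x, lower_diags=1, upper_diags=1)

# mode="NUMBA" routes compilation through numba_funcify, hitting the new dispatcher.
fn = pytensor.function([A, x], y, mode="NUMBA")

A_val = np.diag(np.full(5, 2.0)) + np.diag(np.ones(4), 1) + np.diag(np.ones(4), -1)
x_val = np.arange(5.0)
np.testing.assert_allclose(fn(A_val, x_val), A_val @ x_val)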
89 changes: 89 additions & 0 deletions pytensor/tensor/slinalg.py
@@ -6,10 +6,12 @@

import numpy as np
import scipy.linalg as scipy_linalg
from numpy import zeros
from numpy.exceptions import ComplexWarning

import pytensor
import pytensor.tensor as pt
from pytensor import Variable
from pytensor.gradient import DisconnectedType
from pytensor.graph.basic import Apply
from pytensor.graph.op import Op
@@ -1669,6 +1671,92 @@
return _block_diagonal_matrix(*matrices)


class BandedDot(Op):
Member
Put in blas.py?

Member
I saw your message, fine

jessegrabowski (Member, Author)
You mean in pytensor.tensor.blas? I can do that if you think it's better

__props__ = ("lower_diags", "upper_diags")
gufunc_signature = "(m,n),(n)->(m)"

def __init__(self, lower_diags, upper_diags):
self.lower_diags = lower_diags
self.upper_diags = upper_diags

def make_node(self, A, x):
if A.ndim != 2:
raise TypeError("A must be a 2D tensor")

if x.ndim != 1:
raise TypeError("x must be a 1D tensor")

A = as_tensor_variable(A)
x = as_tensor_variable(x)
Member
Raise ValueError for non-core ndims

out_dtype = pytensor.scalar.upcast(A.dtype, x.dtype)
Member
Wrong for integers / should raise. Also reject complex?

jessegrabowski (Member, Author)
I copied this from other make_node implementations in slinalg (eigvalsh, eigvalsh grad, solve lyapunov stuff). What's the right way to upcast here?

Member
The right way is to predict what scipy outputs. Some Ops are lazy and just call scipy with a minimal input case to find out the output type. I don't love that.
Which makes me wonder: I guess numba / a direct call to xgbmv doesn't work with integer arrays, so we may need to cast/raise?

jessegrabowski (Member, Author)
What does JAX do on integer inputs?
Also, it's not that onerous to just try every combination of input pairs on the scipy function, write it in a dictionary, and just look it up. Is that too crazy?

Member
> What does JAX do on integer inputs?
No idea; cast them to float or call a dot function that works on integers?
> Also it's not that onerous to just try every combination of input pairs on the scipy function, write it in a dictionary, and just look it up. Is that too crazy?
I think it's a bit crazy. You could add a function with lru_cache on the dtypes that tries it and stores the result. Most combinations will never be needed, and we don't want to do it at import time.
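For concreteness, a sketch of that lru_cache idea (editorial; the helper name is hypothetical and not part of the PR):

from functools import lru_cache

import numpy as np
from scipy import linalg


@lru_cache(maxsize=None)
def _gbmv_output_dtype(a_dtype: str, x_dtype: str) -> np.dtype:
    # Probe gbmv once per dtype pair and memoize what scipy actually returns,
    # instead of hard-coding an upcast rule or building a full table at import time.
    A = np.ones((1, 1), dtype=a_dtype)  # 1x1 "banded" matrix with kl = ku = 0
    x = np.ones(1, dtype=x_dtype)
    fn = linalg.get_blas_funcs("gbmv", (A, x))
    return fn(m=1, n=1, kl=0, ku=0, alpha=1, a=A, x=x).dtype

# Integer inputs resolve to a float flavor, since gbmv has no integer kernels.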

output = x.type.clone(dtype=out_dtype)()

return pytensor.graph.basic.Apply(self, [A, x], [output])

def infer_shape(self, fgraph, nodes, shapes):
A_shape, _ = shapes
return [(A_shape[0],)]

def perform(self, node, inputs, outputs_storage):
A, x = inputs
m, n = A.shape

kl = self.lower_diags
ku = self.upper_diags

A_banded = zeros((kl + ku + 1, n), dtype=A.dtype, order="F")

for i, k in enumerate(range(ku, -kl - 1, -1)):
if k >= 0:
A_banded[i, k:] = np.diag(A, k=k)
else:
A_banded[i, : n + k] = np.diag(A, k=k)

fn = scipy_linalg.get_blas_funcs("gbmv", dtype=A.dtype)
outputs_storage[0][0] = fn(m=m, n=n, kl=kl, ku=ku, alpha=1, a=A_banded, x=x)

def L_op(self, inputs, outputs, output_grads) -> list[Variable]:
# This is exactly the same as the usual gradient of a matrix-vector product, except that the banded structure
# is exploited.
A, x = inputs
(G_bar,) = output_grads

A_bar = pt.outer(G_bar, x.T)
x_bar = banded_dot(
A.T, G_bar, lower_diags=self.upper_diags, upper_diags=self.lower_diags  # bands swap under transposition
)

return [A_bar, x_bar]


def banded_dot(A: TensorLike, x: TensorLike, lower_diags: int, upper_diags: int):
"""
Specialized matrix-vector multiplication for cases when A is a banded matrix

The structure of A is not checked at runtime: any data outside the specified diagonals is simply ignored, which
will give incorrect results if A is not actually a banded matrix.

Unlike dot, this function is only valid if x is a vector.

Parameters
----------
A: TensorLike
Matrix to perform banded dot on.
x: TensorLike
Vector to perform banded dot on.
lower_diags: int
Number of nonzero lower diagonals of A
upper_diags: int
Number of nonzero upper diagonals of A

Returns
-------
out: Tensor
The matrix-vector product A @ x
"""
return Blockwise(BandedDot(lower_diags, upper_diags))(A, x)


__all__ = [
"cholesky",
"solve",
@@ -1683,4 +1771,5 @@
"lu",
"lu_factor",
"lu_solve",
"banded_dot",
]
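An editorial sketch (not part of the diff) of checking the L_op above numerically with pytensor.gradient.verify_grad:

import numpy as np
from pytensor.gradient import verify_grad
from pytensor.tensor.slinalg import banded_dot

rng = np.random.default_rng(42)
kl = ku = 1
n = 6

# Zero everything outside the band so the Op's structural assumption holds.
A_val = np.triu(np.tril(rng.normal(size=(n, n)), k=ku), k=-kl)
x_val = rng.normal(size=n)

# Compares the symbolic gradient against a finite-difference approximation.
verify_grad(
    lambda A, x: banded_dot(A, x, lower_diags=kl, upper_diags=ku),
    [A_val, x_val],
    rng=rng,
)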
39 changes: 39 additions & 0 deletions tests/link/numba/test_slinalg.py
@@ -15,8 +15,10 @@
LUFactor,
Solve,
SolveTriangular,
banded_dot,
)
from tests.link.numba.test_basic import compare_numba_and_py, numba_inplace_mode
from tests.tensor.test_slinalg import _make_banded_A


pytestmark = pytest.mark.filterwarnings("error")
@@ -720,3 +722,40 @@ def test_lu_solve(b_func, b_shape: tuple[int, ...], trans: bool, overwrite_b: bool):

# Can never destroy non-contiguous inputs
np.testing.assert_allclose(b_val_not_contig, b_val)


@pytest.mark.parametrize("stride", [1, 2, -1], ids=lambda x: f"stride={x}")
def test_banded_dot(stride):
rng = np.random.default_rng()

A_val = _make_banded_A(rng.normal(size=(10, 10)), kl=1, ku=1).astype(config.floatX)

x_shape = (10 * abs(stride),)
x_val = rng.normal(size=x_shape).astype(config.floatX)
x_val = x_val[::stride]

A = pt.tensor("A", shape=A_val.shape, dtype=A_val.dtype)
x = pt.tensor("x", shape=x_val.shape, dtype=x_val.dtype)

output = banded_dot(A, x, upper_diags=1, lower_diags=1)

compare_numba_and_py(
[A, x],
output,
test_inputs=[A_val, x_val],
inplace=True,
numba_mode=numba_inplace_mode,
eval_obj_mode=False,
)

# Test non-contiguous x input
x_val = rng.normal(size=(20,)).astype(config.floatX)[::2]

compare_numba_and_py(
[A, x],
output,
test_inputs=[A_val, x_val],
inplace=True,
numba_mode=numba_inplace_mode,
eval_obj_mode=False,
)