From 1fa802c32044a0551e1b28e7ae3f902fc0f8605f Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 11 Sep 2023 21:04:08 -0400 Subject: [PATCH 01/18] ENH: Add numba engine to df.apply --- pandas/core/_numba/extensions.py | 448 +++++++++++++++++++++++++ pandas/core/apply.py | 117 ++++++- pandas/tests/apply/test_frame_apply.py | 6 +- 3 files changed, 567 insertions(+), 4 deletions(-) create mode 100644 pandas/core/_numba/extensions.py diff --git a/pandas/core/_numba/extensions.py b/pandas/core/_numba/extensions.py new file mode 100644 index 0000000000000..1627b4fc3987f --- /dev/null +++ b/pandas/core/_numba/extensions.py @@ -0,0 +1,448 @@ +""" +Utility classes/functions to let numba recognize +pandas Index/Series/DataFrame + +Mostly vendored from https://github.com/numba/numba/blob/main/numba/tests/pdlike_usecase.py +""" + +from __future__ import annotations + +import operator + +import numba +from numba.core import ( + cgutils, + types, +) +from numba.core.datamodel import models +from numba.core.extending import ( + NativeValue, + box, + lower_builtin, + make_attribute_wrapper, + overload, + overload_attribute, + overload_method, + register_model, + type_callable, + typeof_impl, + unbox, +) +from numba.core.imputils import impl_ret_borrowed +import numpy as np + +from pandas.core.indexes.base import Index +from pandas.core.indexing import _iLocIndexer +from pandas.core.series import Series + + +# TODO: Range index support +# (not passing an index to series constructor doesn't work) +class IndexType(types.Buffer): + """ + The type class for Index objects. + """ + + def __init__(self, dtype, layout, pyclass) -> None: + self.pyclass = pyclass + super().__init__(dtype, 1, layout) + + @property + def key(self): + return self.pyclass, self.dtype, self.layout + + @property + def as_array(self): + return types.Array(self.dtype, 1, self.layout) + + def copy(self, dtype=None, ndim: int = 1, layout=None): + assert ndim == 1 + if dtype is None: + dtype = self.dtype + layout = layout or self.layout + return type(self)(dtype, layout, self.pyclass) + + +class SeriesType(types.ArrayCompatible): + """ + The type class for Series objects. + """ + + def __init__(self, dtype, index, namety) -> None: + assert isinstance(index, IndexType) + self.dtype = dtype + self.index = index + self.values = types.Array(self.dtype, 1, "C") + self.namety = namety + name = f"series({dtype}, {index}, {namety})" + super().__init__(name) + + @property + def key(self): + return self.dtype, self.index, self.namety + + @property + def as_array(self): + return self.values + + def copy(self, dtype=None, ndim: int = 1, layout: str = "C"): + assert ndim == 1 + assert layout == "C" + if dtype is None: + dtype = self.dtype + return type(self)(dtype, self.index, self.namety) + + +@typeof_impl.register(Index) +def typeof_index(val, c): + """ + This will assume that only strings are in object dtype + index. 
+ (you should check this before this gets lowered down to numba) + """ + arrty = typeof_impl(val._data, c) + assert arrty.ndim == 1 + return IndexType(arrty.dtype, arrty.layout, type(val)) + + +@typeof_impl.register(Series) +def typeof_series(val, c): + index = typeof_impl(val.index, c) + arrty = typeof_impl(val.values, c) + namety = typeof_impl(val.name, c) + assert arrty.ndim == 1 + assert arrty.layout == "C" + return SeriesType(arrty.dtype, index, namety) + + +@type_callable(Series) +def type_series_constructor(context): + def typer(data, index, name=None): + if isinstance(index, IndexType) and isinstance(data, types.Array): + # assert data.layout == "C" + assert data.ndim == 1 + if name is None: + name = types.intp + return SeriesType(data.dtype, index, name) + + return typer + + +@type_callable(Index) +def type_index_constructor(context): + def typer(data, hashmap=None): + if isinstance(data, types.Array): + assert data.layout == "C" + assert data.ndim == 1 + assert hashmap is None or isinstance(hashmap, types.DictType) + return IndexType(data.dtype, layout=data.layout, pyclass=Index) + + return typer + + +# Backend extensions for Index and Series and Frame +@register_model(IndexType) +class IndexModel(models.StructModel): + def __init__(self, dmm, fe_type) -> None: + members = [ + ("data", fe_type.as_array), + # This is an attempt to emulate our hashtable code with a numba + # typed dict + # It maps from values in the index to their integer positions in the array + ("hashmap", types.DictType(fe_type.dtype, types.intp)), + ] + models.StructModel.__init__(self, dmm, fe_type, members) + + +@register_model(SeriesType) +class SeriesModel(models.StructModel): + def __init__(self, dmm, fe_type) -> None: + members = [ + ("index", fe_type.index), + ("values", fe_type.as_array), + ("name", fe_type.namety), + ] + models.StructModel.__init__(self, dmm, fe_type, members) + + +make_attribute_wrapper(IndexType, "data", "_data") +make_attribute_wrapper(IndexType, "hashmap", "hashmap") + +make_attribute_wrapper(SeriesType, "index", "index") +make_attribute_wrapper(SeriesType, "values", "values") +make_attribute_wrapper(SeriesType, "name", "name") + + +@lower_builtin(Series, types.Array, IndexType) +def pdseries_constructor(context, builder, sig, args): + data, index = args + series = cgutils.create_struct_proxy(sig.return_type)(context, builder) + series.index = index + series.values = data + series.name = context.get_constant(types.intp, 0) + return impl_ret_borrowed(context, builder, sig.return_type, series._getvalue()) + + +@lower_builtin(Series, types.Array, IndexType, types.intp) +@lower_builtin(Series, types.Array, IndexType, types.float64) +@lower_builtin(Series, types.Array, IndexType, types.unicode_type) +def pdseries_constructor(context, builder, sig, args): + data, index, name = args + series = cgutils.create_struct_proxy(sig.return_type)(context, builder) + series.index = index + series.values = data + series.name = name + return impl_ret_borrowed(context, builder, sig.return_type, series._getvalue()) + + +@lower_builtin(Index, types.Array, types.DictType) +def index_constructor_2arg(context, builder, sig, args): + (data, hashmap) = args + index = cgutils.create_struct_proxy(sig.return_type)(context, builder) + + index.data = data + index.hashmap = hashmap + return impl_ret_borrowed(context, builder, sig.return_type, index._getvalue()) + + +@lower_builtin(Index, types.Array) +def index_constructor_1arg(context, builder, sig, args): + from numba.typed import Dict + + key_type = 
sig.return_type.dtype + value_type = types.intp + + def index_impl(data): + return Index(data, Dict.empty(key_type, value_type)) + + return context.compile_internal(builder, index_impl, sig, args) + + +@unbox(IndexType) +def unbox_index(typ, obj, c): + """ + Convert a Index object to a native structure. + + If it is object dtype, we'll attempt to cast it to one of + numpy's string dtypes. + (you are responsible for validating that the Index contains only + strings if its object type before lowering it to numba) + """ + data_obj = c.pyapi.object_getattr_string(obj, "_data") + index = cgutils.create_struct_proxy(typ)(c.context, c.builder) + index.data = c.unbox(typ.as_array, data_obj).value + typed_dict_obj = c.pyapi.unserialize(c.pyapi.serialize_object(numba.typed.Dict)) + # Create an empty typed dict in numba + # equiv of numba.typed.Dict.empty(typ.dtype, types.intp) + arr_type_obj = c.pyapi.unserialize(c.pyapi.serialize_object(typ.dtype)) + intp_type_obj = c.pyapi.unserialize(c.pyapi.serialize_object(types.intp)) + hashmap_obj = c.pyapi.call_method( + typed_dict_obj, "empty", (arr_type_obj, intp_type_obj) + ) + index.hashmap = c.unbox(types.DictType(typ.dtype, types.intp), hashmap_obj).value + + # Decrefs + c.pyapi.decref(data_obj) + c.pyapi.decref(arr_type_obj) + c.pyapi.decref(intp_type_obj) + c.pyapi.decref(typed_dict_obj) + + return NativeValue(index._getvalue()) + + +@unbox(SeriesType) +def unbox_series(typ, obj, c): + """ + Convert a Series object to a native structure. + """ + index_obj = c.pyapi.object_getattr_string(obj, "index") + values_obj = c.pyapi.object_getattr_string(obj, "values") + name_obj = c.pyapi.object_getattr_string(obj, "name") + + series = cgutils.create_struct_proxy(typ)(c.context, c.builder) + series.index = c.unbox(typ.index, index_obj).value + series.values = c.unbox(typ.values, values_obj).value + series.name = c.unbox(typ.namety, name_obj).value + + # Decrefs + c.pyapi.decref(index_obj) + c.pyapi.decref(values_obj) + c.pyapi.decref(name_obj) + + return NativeValue(series._getvalue()) + + +@box(IndexType) +def box_index(typ, val, c): + """ + Convert a native index structure to a Index object. + + If our native index is of a numpy string dtype, we'll cast it to + object. + """ + # First build a Numpy array object, then wrap it in a Index + index = cgutils.create_struct_proxy(typ)(c.context, c.builder, value=val) + + # TODO: preserve the original class for the index + # Also need preserve the name of the Index + + # class_obj = c.pyapi.unserialize(c.pyapi.serialize_object(typ.pyclass)) + class_obj = c.pyapi.unserialize(c.pyapi.serialize_object(Index)) + array_obj = c.box(typ.as_array, index.data) + # this is basically Index._simple_new(array_obj, name_obj) in python + index_obj = c.pyapi.call_method(class_obj, "_simple_new", (array_obj,)) + + # Decrefs + c.pyapi.decref(class_obj) + c.pyapi.decref(array_obj) + return index_obj + + +@box(SeriesType) +def box_series(typ, val, c): + """ + Convert a native series structure to a Series object. + """ + series = cgutils.create_struct_proxy(typ)(c.context, c.builder, value=val) + class_obj = c.pyapi.unserialize(c.pyapi.serialize_object(Series)) + index_obj = c.box(typ.index, series.index) + array_obj = c.box(typ.as_array, series.values) + name_obj = c.box(typ.namety, series.name) + true_obj = c.pyapi.unserialize(c.pyapi.serialize_object(True)) + # TODO: Is borrowing none here safe? 
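+    # (borrow_none() returns a borrowed reference to Py_None; it fills the
+    # dtype and copy arguments in the call below)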
+ # This is equivalent of pd.Series(data=array_obj, index=index_obj, dtype=None, name=name_obj, copy=None, fastpath=True) + series_obj = c.pyapi.call_function_objargs( + class_obj, + ( + array_obj, + index_obj, + c.pyapi.borrow_none(), + name_obj, + c.pyapi.borrow_none(), + true_obj, + ), + ) + + # Decrefs + c.pyapi.decref(class_obj) + c.pyapi.decref(index_obj) + c.pyapi.decref(array_obj) + c.pyapi.decref(name_obj) + c.pyapi.decref(true_obj) + + return series_obj + + +# Add common series reductions + + +def generate_series_reduction(reduction, reduction_method): + @overload_method(SeriesType, reduction) + def series_reduction(series): + def series_reduction_impl(series): + return reduction_method(series.values) + + return series_reduction_impl + + return series_reduction + + +series_reductions = [ + ("sum", np.sum), + ("mean", np.mean), + ("std", np.std), + ("var", np.var), +] +for reduction, reduction_method in series_reductions: + generate_series_reduction(reduction, reduction_method) + + +# get_loc on Index +@overload_method(IndexType, "get_loc") +def index_get_loc(index, item): + def index_get_loc_impl(index, item): + # Initialize the hash table if not initalized + if len(index.hashmap) == 0: + for i, val in enumerate(index._data): + index.hashmap[val] = i + return index.hashmap[item] + + return index_get_loc_impl + + +# Indexing for Series + + +@overload(operator.getitem) +def series_indexing(series, item): + if isinstance(series, SeriesType): + + def series_getitem(series, item): + loc = series.index.get_loc(item) + return series.iloc[loc] + + return series_getitem + + +class IlocType(types.Type): + def __init__(self, obj_type) -> None: + self.obj_type = obj_type + name = f"iLocIndexer({obj_type})" + super().__init__(name=name) + + @property + def key(self): + return self.obj_type + + +@typeof_impl.register(_iLocIndexer) +def typeof_iloc(val, c): + objtype = typeof_impl(val.obj, c) + return IlocType(objtype) + + +@type_callable(_iLocIndexer) +def type_iloc_constructor(context): + def typer(obj): + if isinstance(obj, SeriesType): + return IlocType(obj) + + return typer + + +@lower_builtin(_iLocIndexer, SeriesType) +def iloc_constructor(context, builder, sig, args): + (obj,) = args + iloc_indexer = cgutils.create_struct_proxy(sig.return_type)(context, builder) + iloc_indexer.obj = obj + return impl_ret_borrowed( + context, builder, sig.return_type, iloc_indexer._getvalue() + ) + + +@register_model(IlocType) +class ILocModel(models.StructModel): + def __init__(self, dmm, fe_type) -> None: + members = [("obj", fe_type.obj_type)] + models.StructModel.__init__(self, dmm, fe_type, members) + + +make_attribute_wrapper(IlocType, "obj", "obj") + + +@overload_attribute(SeriesType, "iloc") +def series_iloc(series): + def get(series): + return _iLocIndexer(series) + + return get + + +@overload(operator.getitem) +def iloc_getitem(iloc_indexer, i): + if isinstance(iloc_indexer, IlocType): + + def getitem_impl(iloc_indexer, i): + return iloc_indexer.obj.values[i] + + return getitem_impl diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 4d6dd8f4fd577..a7264940be16f 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -2,6 +2,7 @@ import abc from collections import defaultdict +import functools from functools import partial import inspect from typing import ( @@ -29,6 +30,7 @@ NDFrameT, npt, ) +from pandas.compat._optional import import_optional_dependency from pandas.errors import SpecificationError from pandas.util._decorators import cache_readonly from 
pandas.util._exceptions import find_stack_level @@ -777,6 +779,15 @@ def result_columns(self) -> Index: def series_generator(self) -> Generator[Series, None, None]: pass + @property + @abc.abstractmethod + def generate_numba_apply_func(self) -> Callable[[npt.NDarray], dict[int, Any]]: + pass + + @abc.abstractmethod + def apply_with_numba(self): + pass + @abc.abstractmethod def wrap_results_for_axis( self, results: ResType, res_index: Index @@ -956,7 +967,13 @@ def apply_broadcast(self, target: DataFrame) -> DataFrame: return result def apply_standard(self): - results, res_index = self.apply_series_generator() + if self.engine == "python": + results, res_index = self.apply_series_generator() + else: + results, res_index = self.apply_series_numba() + + # print(results) + # print(res_index) # wrap results return self.wrap_results(results, res_index) @@ -980,6 +997,10 @@ def apply_series_generator(self) -> tuple[ResType, Index]: return results, res_index + def apply_series_numba(self): + results = self.apply_with_numba() + return results, self.result_index + def wrap_results(self, results: ResType, res_index: Index) -> DataFrame | Series: from pandas import Series @@ -1019,6 +1040,45 @@ class FrameRowApply(FrameApply): def series_generator(self) -> Generator[Series, None, None]: return (self.obj._ixs(i, axis=1) for i in range(len(self.columns))) + @staticmethod + @functools.cache + def generate_numba_apply_func( + func, nogil=True, nopython=True, parallel=False + ) -> Callable[[npt.NDarray], dict[int, Any]]: + from pandas import Series + + numba = import_optional_dependency("numba") + + jitted_udf = numba.extending.register_jitable(func) + + @numba.jit(nogil=nogil, nopython=nopython, parallel=parallel) + def numba_func(values, col_names, df_index): + results = {} + for j in range(values.shape[1]): + # Create the series + # TODO: No need for the str call? 
+ # Need to adapt types to accept UnicodeCharSeq in Series constructor + ser = Series(values[:, j], index=df_index, name=str(col_names[j])) + + results[j] = jitted_udf(ser) + + return results + + return numba_func + + def apply_with_numba(self) -> dict[int, Any]: + nb_func = self.generate_numba_apply_func(self.func, **self.engine_kwargs) + col_names_values = self.columns._data + if col_names_values.dtype == object: + if not lib.is_string_array(col_names_values): + raise ValueError( + "The numba engine only supports using string or numeric column names" + ) + col_names_values = col_names_values.astype("U") + df_index = self.obj.index + + return nb_func(self.values, col_names_values, df_index) + @property def result_index(self) -> Index: return self.columns @@ -1102,6 +1162,61 @@ def series_generator(self) -> Generator[Series, None, None]: object.__setattr__(ser, "_name", name) yield ser + @staticmethod + @functools.cache + def generate_numba_apply_func( + func, nogil=True, nopython=True, parallel=False + ) -> Callable[[npt.NDArray, npt.NDArray, npt.NDArray], dict[int, Any]]: + # Unused import just to register the extensions + + from pandas import ( + Index, + Series, + ) + + numba = import_optional_dependency("numba") + + jitted_udf = numba.extending.register_jitable(func) + + @numba.jit(nogil=nogil, nopython=nopython, parallel=parallel) + def numba_func(values, col_names, index_values): + results = {} + col_names_index = Index(col_names) + for i in range(values.shape[0]): + # Create the series + # TODO: values corrupted without the copy + ser = Series( + values[i].copy(), index=col_names_index, name=index_values[i] + ) + + results[i] = jitted_udf(ser) + + return results + + return numba_func + + def apply_with_numba(self) -> dict[int, Any]: + nb_func = self.generate_numba_apply_func(self.func, **self.engine_kwargs) + + # Unpack the index and repack it inside the jitted numba function + # This is since if we have object dtype and strings we want to convert + # to a numpy string dtype (and our regular index doesn't support numpy string dtypes) + col_names_values = self.columns._data + if col_names_values.dtype == object: + if not lib.is_string_array(col_names_values): + raise ValueError( + "The numba engine only supports using string or numeric column names" + ) + col_names_values = col_names_values.astype("U") + index_values = self.obj.index.values + + # Convert from numba dict to regular dict + # Our isinstance checks in the df constructor don't pass for numbas typed dict + result_nb_dict = nb_func(self.values, col_names_values, index_values) + result_keys, result_values = result_nb_dict.keys(), result_nb_dict.values() + return dict(zip(result_keys, result_values)) + # return dict(nb_func(self.values, col_names_values, index_values)) + @property def result_index(self) -> Index: return self.index diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index 3a3f73a68374b..d541886cecb09 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -94,14 +94,14 @@ def test_apply_empty(func): assert result.empty -def test_apply_float_frame(float_frame): +def test_apply_float_frame(float_frame, engine): no_rows = float_frame[:0] - result = no_rows.apply(lambda x: x.mean()) + result = no_rows.apply(lambda x: x.mean(), engine=engine) expected = Series(np.nan, index=float_frame.columns) tm.assert_series_equal(result, expected) no_cols = float_frame.loc[:, []] - result = no_cols.apply(lambda x: x.mean(), axis=1) + result = 
no_cols.apply(lambda x: x.mean(), axis=1, engine=engine) expected = Series(np.nan, index=float_frame.index) tm.assert_series_equal(result, expected) From 0ac544d750daae64377940ff6d95c5a1909cdcc0 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Wed, 13 Sep 2023 22:45:47 -0400 Subject: [PATCH 02/18] complete? --- pandas/core/_numba/extensions.py | 99 ++++++++++++++++++++++++++------ pandas/core/apply.py | 50 ++++++++-------- 2 files changed, 108 insertions(+), 41 deletions(-) diff --git a/pandas/core/_numba/extensions.py b/pandas/core/_numba/extensions.py index 1627b4fc3987f..d80f93e8b75b2 100644 --- a/pandas/core/_numba/extensions.py +++ b/pandas/core/_numba/extensions.py @@ -63,7 +63,7 @@ def copy(self, dtype=None, ndim: int = 1, layout=None): return type(self)(dtype, layout, self.pyclass) -class SeriesType(types.ArrayCompatible): +class SeriesType(types.Type): """ The type class for Series objects. """ @@ -150,6 +150,10 @@ def __init__(self, dmm, fe_type) -> None: # typed dict # It maps from values in the index to their integer positions in the array ("hashmap", types.DictType(fe_type.dtype, types.intp)), + # Pointer to the Index object this was created from, or that it + # boxes to + # https://numba.discourse.group/t/qst-how-to-cache-the-boxing-of-an-object/2128/2?u=lithomas1 + ("parent", types.pyobject) ] models.StructModel.__init__(self, dmm, fe_type, members) @@ -195,8 +199,20 @@ def pdseries_constructor(context, builder, sig, args): return impl_ret_borrowed(context, builder, sig.return_type, series._getvalue()) -@lower_builtin(Index, types.Array, types.DictType) +@lower_builtin(Index, types.Array, types.DictType, types.pyobject) def index_constructor_2arg(context, builder, sig, args): + (data, hashmap, parent) = args + index = cgutils.create_struct_proxy(sig.return_type)(context, builder) + + index.data = data + index.hashmap = hashmap + index.parent = parent + return impl_ret_borrowed(context, builder, sig.return_type, index._getvalue()) + +@lower_builtin(Index, types.Array, types.DictType) +def index_constructor_2arg_parent(context, builder, sig, args): + # Basically same as index_constructor_1arg, but also lets you specify the + # parent object (data, hashmap) = args index = cgutils.create_struct_proxy(sig.return_type)(context, builder) @@ -230,9 +246,11 @@ def unbox_index(typ, obj, c): """ data_obj = c.pyapi.object_getattr_string(obj, "_data") index = cgutils.create_struct_proxy(typ)(c.context, c.builder) + # If we see an object array, assume its been validated as only containing strings + # We still need to do the conversion though index.data = c.unbox(typ.as_array, data_obj).value typed_dict_obj = c.pyapi.unserialize(c.pyapi.serialize_object(numba.typed.Dict)) - # Create an empty typed dict in numba + # Create an empty typed dict in numba for the hasmap for indexing # equiv of numba.typed.Dict.empty(typ.dtype, types.intp) arr_type_obj = c.pyapi.unserialize(c.pyapi.serialize_object(typ.dtype)) intp_type_obj = c.pyapi.unserialize(c.pyapi.serialize_object(types.intp)) @@ -240,6 +258,8 @@ def unbox_index(typ, obj, c): typed_dict_obj, "empty", (arr_type_obj, intp_type_obj) ) index.hashmap = c.unbox(types.DictType(typ.dtype, types.intp), hashmap_obj).value + # Set the parent for speedy boxing. 
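+    # (keeping a reference to the original Index object lets box_index hand
+    # back that same object later instead of rebuilding it)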
+ index.parent = obj # Decrefs c.pyapi.decref(data_obj) @@ -283,19 +303,36 @@ def box_index(typ, val, c): # First build a Numpy array object, then wrap it in a Index index = cgutils.create_struct_proxy(typ)(c.context, c.builder, value=val) - # TODO: preserve the original class for the index - # Also need preserve the name of the Index - - # class_obj = c.pyapi.unserialize(c.pyapi.serialize_object(typ.pyclass)) - class_obj = c.pyapi.unserialize(c.pyapi.serialize_object(Index)) - array_obj = c.box(typ.as_array, index.data) - # this is basically Index._simple_new(array_obj, name_obj) in python - index_obj = c.pyapi.call_method(class_obj, "_simple_new", (array_obj,)) - - # Decrefs - c.pyapi.decref(class_obj) - c.pyapi.decref(array_obj) - return index_obj + res = cgutils.alloca_once_value(c.builder, index.parent) + + # Does parent exist? + # (it means already boxed once, or Index same as original df.index or df.columns) + # xref https://github.com/numba/numba/blob/596e8a55334cc46854e3192766e643767bd7c934/numba/core/boxing.py#L593C17-L593C17 + with c.builder.if_else(cgutils.is_not_null(c.builder, index.parent)) as (has_parent, otherwise): + with has_parent: + c.pyapi.incref(index.parent) + with otherwise: + # TODO: preserve the original class for the index + # Also need preserve the name of the Index + # class_obj = c.pyapi.unserialize(c.pyapi.serialize_object(typ.pyclass)) + class_obj = c.pyapi.unserialize(c.pyapi.serialize_object(Index)) + array_obj = c.box(typ.as_array, index.data) + if isinstance(typ.dtype, types.UnicodeCharSeq): + # We converted to numpy string dtype, convert back + # to object since _simple_new won't do that for uss + object_str_obj = c.pyapi.unserialize(c.pyapi.serialize_object("object")) + array_obj = c.pyapi.call_method(array_obj, "astype", (object_str_obj,)) + c.pyapi.decref(object_str_obj) + # this is basically Index._simple_new(array_obj, name_obj) in python + index_obj = c.pyapi.call_method(class_obj, "_simple_new", (array_obj,)) + index.parent = index_obj + c.pyapi.print_object(index.parent) + c.builder.store(index_obj, res) + + # Decrefs + c.pyapi.decref(class_obj) + c.pyapi.decref(array_obj) + return c.builder.load(res) @box(SeriesType) @@ -333,7 +370,8 @@ def box_series(typ, val, c): return series_obj -# Add common series reductions +# Add common series reductions (e.g. mean, sum), +# and also add common binops (e.g. add, sub, mul, div) def generate_series_reduction(reduction, reduction_method): @@ -347,6 +385,23 @@ def series_reduction_impl(series): return series_reduction +def generate_series_binop(binop): + @overload(binop) + def series_binop(series1, value): + if isinstance(series1, SeriesType): + if isinstance(value, SeriesType): + def series_binop_impl(series1, series2): + # TODO: Check index matching? 
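+                    # (the op is applied to the underlying ndarrays; the
+                    # result reuses the left operand's index and name)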
+ return Series(binop(series1.values, series2.values), series1.index, series1.name) + return series_binop_impl + else: + def series_binop_impl(series1, value): + return Series(binop(series1.values, value), series1.index, series1.name) + return series_binop_impl + + return series_binop + + series_reductions = [ ("sum", np.sum), ("mean", np.mean), @@ -356,6 +411,16 @@ def series_reduction_impl(series): for reduction, reduction_method in series_reductions: generate_series_reduction(reduction, reduction_method) +series_binops = [ + operator.add, + operator.sub, + operator.mul, + operator.truediv +] + +for binop in series_binops: + generate_series_binop(binop) + # get_loc on Index @overload_method(IndexType, "get_loc") diff --git a/pandas/core/apply.py b/pandas/core/apply.py index b1cc6fa79b819..12e879f053477 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -771,7 +771,7 @@ def __init__( if by_row is not False and by_row != "compat": raise ValueError(f"by_row={by_row} not allowed") self.engine = engine - self.engine_kwargs = engine_kwargs + self.engine_kwargs = {} if engine_kwargs is None else engine_kwargs super().__init__( obj, func, raw, result_type, by_row=by_row, args=args, kwargs=kwargs ) @@ -826,11 +826,6 @@ def values(self): def apply(self) -> DataFrame | Series: """compute the results""" - if self.engine == "numba" and not self.raw: - raise ValueError( - "The numba engine in DataFrame.apply can only be used when raw=True" - ) - # dispatch to handle list-like or dict-like if is_list_like(self.func): return self.apply_list_or_dict_like() @@ -1009,9 +1004,6 @@ def apply_standard(self): else: results, res_index = self.apply_series_numba() - # print(results) - # print(res_index) - # wrap results return self.wrap_results(results, res_index) @@ -1083,6 +1075,10 @@ def generate_numba_apply_func( func, nogil=True, nopython=True, parallel=False ) -> Callable[[npt.NDarray], dict[int, Any]]: from pandas import Series + # Dummy import just to make the extensions loaded in + # This isn't an entrypoint since we don't want users + # using Series/DF in numba code outside of apply + from pandas.core._numba.extensions import SeriesType numba = import_optional_dependency("numba") @@ -1096,9 +1092,7 @@ def numba_func(values, col_names, df_index): # TODO: No need for the str call? 
# Need to adapt types to accept UnicodeCharSeq in Series constructor ser = Series(values[:, j], index=df_index, name=str(col_names[j])) - results[j] = jitted_udf(ser) - return results return numba_func @@ -1114,7 +1108,7 @@ def apply_with_numba(self) -> dict[int, Any]: col_names_values = col_names_values.astype("U") df_index = self.obj.index - return nb_func(self.values, col_names_values, df_index) + return dict(nb_func(self.values, col_names_values, df_index)) @property def result_index(self) -> Index: @@ -1204,7 +1198,10 @@ def series_generator(self) -> Generator[Series, None, None]: def generate_numba_apply_func( func, nogil=True, nopython=True, parallel=False ) -> Callable[[npt.NDArray, npt.NDArray, npt.NDArray], dict[int, Any]]: - # Unused import just to register the extensions + # Dummy import just to make the extensions loaded in + # This isn't an entrypoint since we don't want users + # using Series/DF in numba code outside of apply + from pandas.core._numba.extensions import SeriesType from pandas import ( Index, @@ -1216,16 +1213,15 @@ def generate_numba_apply_func( jitted_udf = numba.extending.register_jitable(func) @numba.jit(nogil=nogil, nopython=nopython, parallel=parallel) - def numba_func(values, col_names, index_values): + def numba_func(values, col_names_index, index_values): results = {} - col_names_index = Index(col_names) + #col_names_index = Index(col_names) for i in range(values.shape[0]): # Create the series # TODO: values corrupted without the copy ser = Series( values[i].copy(), index=col_names_index, name=index_values[i] ) - results[i] = jitted_udf(ser) return results @@ -1235,22 +1231,28 @@ def numba_func(values, col_names, index_values): def apply_with_numba(self) -> dict[int, Any]: nb_func = self.generate_numba_apply_func(self.func, **self.engine_kwargs) - # Unpack the index and repack it inside the jitted numba function - # This is since if we have object dtype and strings we want to convert - # to a numpy string dtype (and our regular index doesn't support numpy string dtypes) - col_names_values = self.columns._data - if col_names_values.dtype == object: - if not lib.is_string_array(col_names_values): + # Since numpy/numba doesn't support object array of stringswell + # we'll do a sketchy thing where if index._data is object + # we convert to string and directly set index._data to that, + # setting it back after we call the function + fixed_obj_dtype = False + orig_data = self.columns._data + if self.columns._data.dtype == object: + if not lib.is_string_array(self.columns._data): raise ValueError( "The numba engine only supports using string or numeric column names" ) - col_names_values = col_names_values.astype("U") + # Remember to set this back!!! 
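+            # (numba cannot handle object-dtype ndarrays, so the string labels
+            # are temporarily converted to a numpy unicode 'U' array)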
+ self.columns._data = self.columns._data.astype("U") + fixed_obj_dtype = True index_values = self.obj.index.values # Convert from numba dict to regular dict # Our isinstance checks in the df constructor don't pass for numbas typed dict - result_nb_dict = nb_func(self.values, col_names_values, index_values) + result_nb_dict = nb_func(self.values, self.columns, index_values) result_keys, result_values = result_nb_dict.keys(), result_nb_dict.values() + if fixed_obj_dtype: + self.columns._data = orig_data return dict(zip(result_keys, result_values)) # return dict(nb_func(self.values, col_names_values, index_values)) From 31b9e20d42213875acf003d5943026f626fcfdca Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Tue, 19 Sep 2023 18:49:14 -0400 Subject: [PATCH 03/18] wip: pass tests --- pandas/core/_numba/extensions.py | 29 ++++--- pandas/core/apply.py | 32 ++++++-- pandas/tests/apply/test_frame_apply.py | 107 +++++++++++++++---------- 3 files changed, 107 insertions(+), 61 deletions(-) diff --git a/pandas/core/_numba/extensions.py b/pandas/core/_numba/extensions.py index d80f93e8b75b2..a23d63b6cdec2 100644 --- a/pandas/core/_numba/extensions.py +++ b/pandas/core/_numba/extensions.py @@ -153,7 +153,7 @@ def __init__(self, dmm, fe_type) -> None: # Pointer to the Index object this was created from, or that it # boxes to # https://numba.discourse.group/t/qst-how-to-cache-the-boxing-of-an-object/2128/2?u=lithomas1 - ("parent", types.pyobject) + ("parent", types.pyobject), ] models.StructModel.__init__(self, dmm, fe_type, members) @@ -209,6 +209,7 @@ def index_constructor_2arg(context, builder, sig, args): index.parent = parent return impl_ret_borrowed(context, builder, sig.return_type, index._getvalue()) + @lower_builtin(Index, types.Array, types.DictType) def index_constructor_2arg_parent(context, builder, sig, args): # Basically same as index_constructor_1arg, but also lets you specify the @@ -308,7 +309,10 @@ def box_index(typ, val, c): # Does parent exist? # (it means already boxed once, or Index same as original df.index or df.columns) # xref https://github.com/numba/numba/blob/596e8a55334cc46854e3192766e643767bd7c934/numba/core/boxing.py#L593C17-L593C17 - with c.builder.if_else(cgutils.is_not_null(c.builder, index.parent)) as (has_parent, otherwise): + with c.builder.if_else(cgutils.is_not_null(c.builder, index.parent)) as ( + has_parent, + otherwise, + ): with has_parent: c.pyapi.incref(index.parent) with otherwise: @@ -390,13 +394,23 @@ def generate_series_binop(binop): def series_binop(series1, value): if isinstance(series1, SeriesType): if isinstance(value, SeriesType): + def series_binop_impl(series1, series2): # TODO: Check index matching? 
- return Series(binop(series1.values, series2.values), series1.index, series1.name) + return Series( + binop(series1.values, series2.values), + series1.index, + series1.name, + ) + return series_binop_impl else: + def series_binop_impl(series1, value): - return Series(binop(series1.values, value), series1.index, series1.name) + return Series( + binop(series1.values, value), series1.index, series1.name + ) + return series_binop_impl return series_binop @@ -411,12 +425,7 @@ def series_binop_impl(series1, value): for reduction, reduction_method in series_reductions: generate_series_reduction(reduction, reduction_method) -series_binops = [ - operator.add, - operator.sub, - operator.mul, - operator.truediv -] +series_binops = [operator.add, operator.sub, operator.mul, operator.truediv] for binop in series_binops: generate_series_binop(binop) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 12e879f053477..725d7bf430b5d 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -603,6 +603,12 @@ def apply_list_or_dict_like(self) -> DataFrame | Series: result: Series, DataFrame, or None Result when self.func is a list-like or dict-like, None otherwise. """ + + if self.engine == "numba": + raise NotImplementedError( + "The 'numba' engine doesn't support list-like/dict likes of callables yet." + ) + if self.axis == 1 and isinstance(self.obj, ABCDataFrame): return self.obj.T.apply(self.func, 0, args=self.args, **self.kwargs).T @@ -828,6 +834,10 @@ def apply(self) -> DataFrame | Series: # dispatch to handle list-like or dict-like if is_list_like(self.func): + if self.engine == "numba": + raise NotImplementedError( + "the 'numba' engine doesn't support lists of callables yet" + ) return self.apply_list_or_dict_like() # all empty @@ -836,10 +846,18 @@ def apply(self) -> DataFrame | Series: # string dispatch if isinstance(self.func, str): + if self.engine == "numba": + raise NotImplementedError( + "the 'numba' engine doesn't support using a string as the callable function" + ) return self.apply_str() # ufunc elif isinstance(self.func, np.ufunc): + if self.engine == "numba": + raise NotImplementedError( + "the 'numba' engine doesn't support using a numpy ufunc as the callable function" + ) with np.errstate(all="ignore"): results = self.obj._mgr.apply("apply", func=self.func) # _constructor will retain self.index and self.columns @@ -847,6 +865,10 @@ def apply(self) -> DataFrame | Series: # broadcasting if self.result_type == "broadcast": + if self.engine == "numba": + raise NotImplementedError( + "the 'numba' engine doesn't support result_type='broadcast'" + ) return self.apply_broadcast(self.obj) # one axis empty @@ -1075,10 +1097,10 @@ def generate_numba_apply_func( func, nogil=True, nopython=True, parallel=False ) -> Callable[[npt.NDarray], dict[int, Any]]: from pandas import Series + # Dummy import just to make the extensions loaded in # This isn't an entrypoint since we don't want users # using Series/DF in numba code outside of apply - from pandas.core._numba.extensions import SeriesType numba = import_optional_dependency("numba") @@ -1201,12 +1223,8 @@ def generate_numba_apply_func( # Dummy import just to make the extensions loaded in # This isn't an entrypoint since we don't want users # using Series/DF in numba code outside of apply - from pandas.core._numba.extensions import SeriesType - from pandas import ( - Index, - Series, - ) + from pandas import Series numba = import_optional_dependency("numba") @@ -1215,7 +1233,7 @@ def generate_numba_apply_func( @numba.jit(nogil=nogil, 
nopython=nopython, parallel=parallel) def numba_func(values, col_names_index, index_values): results = {} - #col_names_index = Index(col_names) + # col_names_index = Index(col_names) for i in range(values.shape[0]): # Create the series # TODO: values corrupted without the copy diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index c62cb33c60d94..9d6e585755031 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -25,28 +25,31 @@ def engine(request): return request.param -def test_apply(float_frame): +def test_apply(float_frame, engine, request): + if engine == "numba": + mark = pytest.mark.xfail(reason="numba engine not supporting numpy ufunc yet") + request.node.add_marker(mark) with np.errstate(all="ignore"): # ufunc result = np.sqrt(float_frame["A"]) - expected = float_frame.apply(np.sqrt)["A"] + expected = float_frame.apply(np.sqrt, engine=engine)["A"] tm.assert_series_equal(result, expected) # aggregator - result = float_frame.apply(np.mean)["A"] + result = float_frame.apply(np.mean, engine=engine)["A"] expected = np.mean(float_frame["A"]) assert result == expected d = float_frame.index[0] - result = float_frame.apply(np.mean, axis=1) + result = float_frame.apply(np.mean, axis=1, engine=engine) expected = np.mean(float_frame.xs(d)) assert result[d] == expected assert result.index is float_frame.index @pytest.mark.parametrize("axis", [0, 1]) -def test_apply_args(float_frame, axis): - result = float_frame.apply(lambda x, y: x + y, axis, args=(1,)) +def test_apply_args(float_frame, axis, engine): + result = float_frame.apply(lambda x, y: x + y, axis, args=(1,), engine=engine) expected = float_frame + 1 tm.assert_frame_equal(result, expected) @@ -93,11 +96,11 @@ def test_apply_mixed_datetimelike(): @pytest.mark.parametrize("func", [np.sqrt, np.mean]) -def test_apply_empty(func): +def test_apply_empty(func, engine=engine): # empty empty_frame = DataFrame() - result = empty_frame.apply(func) + result = empty_frame.apply(func, engine=engine) assert result.empty @@ -113,10 +116,10 @@ def test_apply_float_frame(float_frame, engine): tm.assert_series_equal(result, expected) -def test_apply_empty_except_index(): +def test_apply_empty_except_index(engine): # GH 2476 expected = DataFrame(index=["a"]) - result = expected.apply(lambda x: x["a"], axis=1) + result = expected.apply(lambda x: x["a"], axis=1, engine=engine) tm.assert_frame_equal(result, expected) @@ -320,12 +323,6 @@ def test_apply_empty_infer_type(ax, func, raw, axis, engine, request): test_res = func(np.array([], dtype="f8")) is_reduction = not isinstance(test_res, np.ndarray) - if engine == "numba" and raw is False: - mark = pytest.mark.xfail( - reason="numba engine only supports raw=True at the moment" - ) - request.node.add_marker(mark) - result = df.apply(func, axis=axis, engine=engine, raw=raw) if is_reduction: agg_axis = df._get_agg_axis(axis) @@ -980,9 +977,12 @@ def test_result_type_shorter_list(int_frame_const_col): tm.assert_frame_equal(result, expected) -def test_result_type_broadcast(int_frame_const_col): +def test_result_type_broadcast(int_frame_const_col, request): # result_type should be consistent no matter which # path we take in the code + if engine == "numba": + mark = pytest.mark.xfail(reason="numba engine doesn't support list return") + request.node.add_marker(mark) df = int_frame_const_col # broadcast result result = df.apply(lambda x: [1, 2, 3], axis=1, result_type="broadcast") @@ -990,35 +990,54 @@ def 
test_result_type_broadcast(int_frame_const_col): tm.assert_frame_equal(result, expected) -def test_result_type_broadcast_series_func(int_frame_const_col): +def test_result_type_broadcast_series_func(int_frame_const_col, engine, request): # result_type should be consistent no matter which # path we take in the code + if engine == "numba": + mark = pytest.mark.xfail( + reason="numba Series constructor only support ndarrays not list data" + ) + request.node.add_marker(mark) df = int_frame_const_col columns = ["other", "col", "names"] result = df.apply( - lambda x: Series([1, 2, 3], index=columns), axis=1, result_type="broadcast" + lambda x: Series([1, 2, 3], index=columns), + axis=1, + result_type="broadcast", + engine=engine, ) expected = df.copy() tm.assert_frame_equal(result, expected) -def test_result_type_series_result(int_frame_const_col): +def test_result_type_series_result(int_frame_const_col, engine, request): # result_type should be consistent no matter which # path we take in the code + if engine == "numba": + mark = pytest.mark.xfail( + reason="numba Series constructor only support ndarrays not list data" + ) + request.node.add_marker(mark) df = int_frame_const_col # series result - result = df.apply(lambda x: Series([1, 2, 3], index=x.index), axis=1) + result = df.apply(lambda x: Series([1, 2, 3], index=x.index), axis=1, engine=engine) expected = df.copy() tm.assert_frame_equal(result, expected) -def test_result_type_series_result_other_index(int_frame_const_col): +def test_result_type_series_result_other_index(int_frame_const_col, engine, request): # result_type should be consistent no matter which # path we take in the code + + if engine == "numba": + mark = pytest.mark.xfail( + reason="no support in numba Series constructor for list of columns" + ) + request.node.add_marker(mark) df = int_frame_const_col # series result with other index columns = ["other", "col", "names"] - result = df.apply(lambda x: Series([1, 2, 3], index=columns), axis=1) + result = df.apply(lambda x: Series([1, 2, 3], index=columns), axis=1, engine=engine) expected = df.copy() expected.columns = columns tm.assert_frame_equal(result, expected) @@ -1378,25 +1397,34 @@ def f(x, a, b, c=3): @pytest.mark.parametrize("num_cols", [2, 3, 5]) -def test_frequency_is_original(num_cols): +def test_frequency_is_original(num_cols, engine, request): # GH 22150 + if engine == "numba": + mark = pytest.mark.xfail(reason="numba engine only supports numeric indices") + request.node.add_marker(mark) index = pd.DatetimeIndex(["1950-06-30", "1952-10-24", "1953-05-29"]) original = index.copy() - df = DataFrame(1, index=index, columns=range(num_cols)) + df = DataFrame(1, index=index, columns=range(num_cols), engine=engine) df.apply(lambda x: x) assert index.freq == original.freq -def test_apply_datetime_tz_issue(): +def test_apply_datetime_tz_issue(engine, request): # GH 29052 + if engine == "numba": + mark = pytest.mark.xfail( + reason="numba engine doesn't support non-numeric indexes" + ) + request.node.add_marker(mark) + timestamps = [ Timestamp("2019-03-15 12:34:31.909000+0000", tz="UTC"), Timestamp("2019-03-15 12:34:34.359000+0000", tz="UTC"), Timestamp("2019-03-15 12:34:34.660000+0000", tz="UTC"), ] df = DataFrame(data=[0, 1, 2], index=timestamps) - result = df.apply(lambda x: x.name, axis=1) + result = df.apply(lambda x: x.name, axis=1, engine=engine) expected = Series(index=timestamps, data=timestamps) tm.assert_series_equal(result, expected) @@ -1459,10 +1487,10 @@ def test_apply_empty_list_reduce(): 
tm.assert_series_equal(result, expected) -def test_apply_no_suffix_index(): +def test_apply_no_suffix_index(engine): # GH36189 pdf = DataFrame([[4, 9]] * 3, columns=["A", "B"]) - result = pdf.apply(["sum", lambda x: x.sum(), lambda x: x.sum()]) + result = pdf.apply(["sum", lambda x: x.sum(), lambda x: x.sum()], engine=engine) expected = DataFrame( {"A": [12, 12, 12], "B": [27, 27, 27]}, index=["sum", "", ""] ) @@ -1511,10 +1539,12 @@ def sum_div2(s): tm.assert_frame_equal(result, expected) -def test_apply_getitem_axis_1(): +def test_apply_getitem_axis_1(engine): # GH 13427 df = DataFrame({"a": [0, 1, 2], "b": [1, 2, 3]}) - result = df[["a", "a"]].apply(lambda x: x.iloc[0] + x.iloc[1], axis=1) + result = df[["a", "a"]].apply( + lambda x: x.iloc[0] + x.iloc[1], axis=1, engine=engine + ) expected = Series([0, 2, 4]) tm.assert_series_equal(result, expected) @@ -1554,10 +1584,10 @@ def test_apply_type(): tm.assert_series_equal(result, expected) -def test_apply_on_empty_dataframe(): +def test_apply_on_empty_dataframe(engine): # GH 39111 df = DataFrame({"a": [1, 2], "b": [3, 0]}) - result = df.head(0).apply(lambda x: max(x["a"], x["b"]), axis=1) + result = df.head(0).apply(lambda x: max(x["a"], x["b"]), axis=1, engine=engine) expected = Series([], dtype=np.float64) tm.assert_series_equal(result, expected) @@ -1655,14 +1685,3 @@ def test_agg_dist_like_and_nonunique_columns(): result = df.agg({"A": "count"}) expected = df["A"].count() tm.assert_series_equal(result, expected) - - -def test_numba_unsupported(): - df = DataFrame( - {"A": [None, 2, 3], "B": [1.0, np.nan, 3.0], "C": ["foo", None, "bar"]} - ) - with pytest.raises( - ValueError, - match="The numba engine in DataFrame.apply can only be used when raw=True", - ): - df.apply(lambda x: x, engine="numba", raw=False) From 55df7ad3e0b7da6cd056803381b2e937606bc82c Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sun, 24 Sep 2023 12:39:08 -0400 Subject: [PATCH 04/18] fix existing tests --- pandas/core/apply.py | 20 ++++++++++++++++---- pandas/tests/apply/test_frame_apply.py | 16 ++++++++++++---- pandas/tests/apply/test_numba.py | 0 3 files changed, 28 insertions(+), 8 deletions(-) create mode 100644 pandas/tests/apply/test_numba.py diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 8bb5e8efa16c7..b550e576587df 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -123,6 +123,8 @@ def __init__( result_type: str | None, *, by_row: Literal[False, "compat", "_compat"] = "compat", + engine: str = "python", + engine_kwargs: dict[str, bool] | None = None, args, kwargs, ) -> None: @@ -135,6 +137,9 @@ def __init__( self.args = args or () self.kwargs = kwargs or {} + self.engine = engine + self.engine_kwargs = {} if engine_kwargs is None else engine_kwargs + if result_type not in [None, "reduce", "broadcast", "expand"]: raise ValueError( "invalid value for result_type, must be one " @@ -777,10 +782,16 @@ def __init__( ) -> None: if by_row is not False and by_row != "compat": raise ValueError(f"by_row={by_row} not allowed") - self.engine = engine - self.engine_kwargs = {} if engine_kwargs is None else engine_kwargs super().__init__( - obj, func, raw, result_type, by_row=by_row, args=args, kwargs=kwargs + obj, + func, + raw, + result_type, + by_row=by_row, + engine=engine, + engine_kwargs=engine_kwargs, + args=args, + kwargs=kwargs, ) # --------------------------------------------------------------- @@ -1108,6 +1119,7 @@ def generate_numba_apply_func( # Dummy import just to make the extensions 
loaded in # This isn't an entrypoint since we don't want users # using Series/DF in numba code outside of apply + from pandas.core._numba.extensions import SeriesType # noqa: F401 numba = import_optional_dependency("numba") @@ -1231,8 +1243,8 @@ def generate_numba_apply_func( # Dummy import just to make the extensions loaded in # This isn't an entrypoint since we don't want users # using Series/DF in numba code outside of apply - from pandas import Series + from pandas.core._numba.extensions import SeriesType # noqa: F401 numba = import_optional_dependency("numba") diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index 474cc862e3ff8..8767c7f0f45d1 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -49,7 +49,10 @@ def test_apply(float_frame, engine, request): @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("raw", [True, False]) -def test_apply_args(float_frame, axis, raw, engine): +def test_apply_args(float_frame, axis, raw, engine, request): + if engine == "numba": + mark = pytest.mark.xfail(reason="numba engine doesn't support args") + request.node.add_marker(mark) result = float_frame.apply( lambda x, y: x + y, axis, args=(1,), raw=raw, engine=engine ) @@ -1407,8 +1410,8 @@ def test_frequency_is_original(num_cols, engine, request): request.node.add_marker(mark) index = pd.DatetimeIndex(["1950-06-30", "1952-10-24", "1953-05-29"]) original = index.copy() - df = DataFrame(1, index=index, columns=range(num_cols), engine=engine) - df.apply(lambda x: x) + df = DataFrame(1, index=index, columns=range(num_cols)) + df.apply(lambda x: x, engine=engine) assert index.freq == original.freq @@ -1490,8 +1493,13 @@ def test_apply_empty_list_reduce(): tm.assert_series_equal(result, expected) -def test_apply_no_suffix_index(engine): +def test_apply_no_suffix_index(engine, request): # GH36189 + if engine == "numba": + mark = pytest.mark.xfail( + reason="numba engine doesn't support list-likes/dict-like callables" + ) + request.node.add_marker(mark) pdf = DataFrame([[4, 9]] * 3, columns=["A", "B"]) result = pdf.apply(["sum", lambda x: x.sum(), lambda x: x.sum()], engine=engine) expected = DataFrame( diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py new file mode 100644 index 0000000000000..e69de29bb2d1d From 3c89b0f68c841cad8da7b2bd064ff26c61d543c1 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 25 Sep 2023 09:57:13 -0400 Subject: [PATCH 05/18] go for green --- pandas/core/_numba/extensions.py | 33 +++++++++++---- pandas/core/apply.py | 59 +++++++++++++++------------ pyright_reportGeneralTypeIssues.json | 1 + scripts/validate_unwanted_patterns.py | 2 + 4 files changed, 62 insertions(+), 33 deletions(-) diff --git a/pandas/core/_numba/extensions.py b/pandas/core/_numba/extensions.py index a23d63b6cdec2..b6d0534368110 100644 --- a/pandas/core/_numba/extensions.py +++ b/pandas/core/_numba/extensions.py @@ -1,3 +1,6 @@ +# Disable type checking for this module since numba's internals +# are not typed, and we use numba's internals via its extension API +# mypy: ignore-errors """ Utility classes/functions to let numba recognize pandas Index/Series/DataFrame @@ -38,14 +41,18 @@ # TODO: Range index support # (not passing an index to series constructor doesn't work) -class IndexType(types.Buffer): +class IndexType(types.Type): """ The type class for Index objects. 
""" - def __init__(self, dtype, layout, pyclass) -> None: + def __init__(self, dtype, layout, pyclass: any) -> None: self.pyclass = pyclass - super().__init__(dtype, 1, layout) + name = f"index({dtype}, {layout})" + self.dtype = dtype + self.layout = layout + super().__init__(name) + # super().__init__(dtype, 1, layout) @property def key(self): @@ -190,7 +197,7 @@ def pdseries_constructor(context, builder, sig, args): @lower_builtin(Series, types.Array, IndexType, types.intp) @lower_builtin(Series, types.Array, IndexType, types.float64) @lower_builtin(Series, types.Array, IndexType, types.unicode_type) -def pdseries_constructor(context, builder, sig, args): +def pdseries_constructor_with_name(context, builder, sig, args): data, index, name = args series = cgutils.create_struct_proxy(sig.return_type)(context, builder) series.index = index @@ -351,7 +358,9 @@ def box_series(typ, val, c): name_obj = c.box(typ.namety, series.name) true_obj = c.pyapi.unserialize(c.pyapi.serialize_object(True)) # TODO: Is borrowing none here safe? - # This is equivalent of pd.Series(data=array_obj, index=index_obj, dtype=None, name=name_obj, copy=None, fastpath=True) + # This is equivalent of + # pd.Series(data=array_obj, index=index_obj, dtype=None, + # name=name_obj, copy=None, fastpath=True) series_obj = c.pyapi.call_function_objargs( class_obj, ( @@ -435,7 +444,7 @@ def series_binop_impl(series1, value): @overload_method(IndexType, "get_loc") def index_get_loc(index, item): def index_get_loc_impl(index, item): - # Initialize the hash table if not initalized + # Initialize the hash table if not initialized if len(index.hashmap) == 0: for i, val in enumerate(index._data): index.hashmap[val] = i @@ -444,7 +453,7 @@ def index_get_loc_impl(index, item): return index_get_loc_impl -# Indexing for Series +# Indexing for Series/Index @overload(operator.getitem) @@ -458,6 +467,16 @@ def series_getitem(series, item): return series_getitem +@overload(operator.getitem) +def index_indexing(index, idx): + if isinstance(index, IndexType): + + def index_getitem(index, idx): + return index._data[idx] + + return index_getitem + + class IlocType(types.Type): def __init__(self, obj_type) -> None: self.obj_type = obj_type diff --git a/pandas/core/apply.py b/pandas/core/apply.py index b550e576587df..b79399af6af29 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -812,9 +812,11 @@ def result_columns(self) -> Index: def series_generator(self) -> Generator[Series, None, None]: pass - @property + @functools.cache @abc.abstractmethod - def generate_numba_apply_func(self) -> Callable[[npt.NDarray], dict[int, Any]]: + def generate_numba_apply_func( + func, nogil=True, nopython=True, parallel=False + ) -> Callable[[npt.NDArray, Index, Index], dict[int, Any]]: pass @abc.abstractmethod @@ -1113,7 +1115,7 @@ def series_generator(self) -> Generator[Series, None, None]: @functools.cache def generate_numba_apply_func( func, nogil=True, nopython=True, parallel=False - ) -> Callable[[npt.NDarray], dict[int, Any]]: + ) -> Callable[[npt.NDArray, Index, Index], dict[int, Any]]: from pandas import Series # Dummy import just to make the extensions loaded in @@ -1130,8 +1132,6 @@ def numba_func(values, col_names, df_index): results = {} for j in range(values.shape[1]): # Create the series - # TODO: No need for the str call? 
- # Need to adapt types to accept UnicodeCharSeq in Series constructor ser = Series(values[:, j], index=df_index, name=str(col_names[j])) results[j] = jitted_udf(ser) return results @@ -1139,18 +1139,27 @@ def numba_func(values, col_names, df_index): return numba_func def apply_with_numba(self) -> dict[int, Any]: - nb_func = self.generate_numba_apply_func(self.func, **self.engine_kwargs) - col_names_values = self.columns._data - if col_names_values.dtype == object: - if not lib.is_string_array(col_names_values): + nb_func = self.generate_numba_apply_func( + cast(Callable, self.func), **self.engine_kwargs + ) + orig_values = self.columns.to_numpy() + fixed_cols = False + if orig_values.dtype == object: + if not lib.is_string_array(orig_values): raise ValueError( "The numba engine only supports " "using string or numeric column names" ) - col_names_values = col_names_values.astype("U") + col_names_values = orig_values.astype("U") + # Remember to set this back! + self.columns._data = col_names_values + fixed_cols = True df_index = self.obj.index - return dict(nb_func(self.values, col_names_values, df_index)) + res = dict(nb_func(self.values, self.columns, df_index)) + if fixed_cols: + self.columns._data = orig_values + return res @property def result_index(self) -> Index: @@ -1239,7 +1248,7 @@ def series_generator(self) -> Generator[Series, None, None]: @functools.cache def generate_numba_apply_func( func, nogil=True, nopython=True, parallel=False - ) -> Callable[[npt.NDArray, npt.NDArray, npt.NDArray], dict[int, Any]]: + ) -> Callable[[npt.NDArray, Index, Index], dict[int, Any]]: # Dummy import just to make the extensions loaded in # This isn't an entrypoint since we don't want users # using Series/DF in numba code outside of apply @@ -1251,15 +1260,12 @@ def generate_numba_apply_func( jitted_udf = numba.extending.register_jitable(func) @numba.jit(nogil=nogil, nopython=nopython, parallel=parallel) - def numba_func(values, col_names_index, index_values): + def numba_func(values, col_names_index, index): results = {} - # col_names_index = Index(col_names) for i in range(values.shape[0]): # Create the series # TODO: values corrupted without the copy - ser = Series( - values[i].copy(), index=col_names_index, name=index_values[i] - ) + ser = Series(values[i].copy(), index=col_names_index, name=index[i]) results[i] = jitted_udf(ser) return results @@ -1267,33 +1273,34 @@ def numba_func(values, col_names_index, index_values): return numba_func def apply_with_numba(self) -> dict[int, Any]: - nb_func = self.generate_numba_apply_func(self.func, **self.engine_kwargs) + nb_func = self.generate_numba_apply_func( + cast(Callable, self.func), **self.engine_kwargs + ) # Since numpy/numba doesn't support object array of stringswell # we'll do a sketchy thing where if index._data is object # we convert to string and directly set index._data to that, # setting it back after we call the function fixed_obj_dtype = False - orig_data = self.columns._data + orig_data = self.columns.to_numpy() if self.columns._data.dtype == object: - if not lib.is_string_array(self.columns._data): + if not lib.is_string_array(orig_data): raise ValueError( "The numba engine only supports " "using string or numeric column names" ) # Remember to set this back!!! 
- self.columns._data = self.columns._data.astype("U") + self.columns._data = orig_data.astype("U") fixed_obj_dtype = True - index_values = self.obj.index.values # Convert from numba dict to regular dict # Our isinstance checks in the df constructor don't pass for numbas typed dict - result_nb_dict = nb_func(self.values, self.columns, index_values) - result_keys, result_values = result_nb_dict.keys(), result_nb_dict.values() + res = dict(nb_func(self.values, self.columns, self.obj.index)) + if fixed_obj_dtype: self.columns._data = orig_data - return dict(zip(result_keys, result_values)) - # return dict(nb_func(self.values, col_names_values, index_values)) + + return res @property def result_index(self) -> Index: diff --git a/pyright_reportGeneralTypeIssues.json b/pyright_reportGeneralTypeIssues.json index cad43632930ba..c059b9c589ecd 100644 --- a/pyright_reportGeneralTypeIssues.json +++ b/pyright_reportGeneralTypeIssues.json @@ -16,6 +16,7 @@ "pandas/_testing/__init__.py", "pandas/_testing/_io.py", + "pandas/core/_numba/extensions.py", "pandas/core/_numba/kernels/sum_.py", "pandas/core/_numba/kernels/var_.py", "pandas/compat/pickle_compat.py", diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py index d765d7bc7dcb9..6e6251425928d 100755 --- a/scripts/validate_unwanted_patterns.py +++ b/scripts/validate_unwanted_patterns.py @@ -51,6 +51,8 @@ "_chained_assignment_msg", "_chained_assignment_method_msg", "_version_meson", + # The numba extensions need this to mock the iloc object + "_iLocIndexer", # TODO(3.0): GH#55043 - remove upon removal of ArrayManager "_get_option", } From 1418d3e743620a3a40659ace96fad6375b29a613 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 25 Sep 2023 13:46:51 -0400 Subject: [PATCH 06/18] fix checks? --- pandas/core/_numba/extensions.py | 10 +++++----- pandas/core/apply.py | 1 + 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/core/_numba/extensions.py b/pandas/core/_numba/extensions.py index b6d0534368110..da3ca86628b6b 100644 --- a/pandas/core/_numba/extensions.py +++ b/pandas/core/_numba/extensions.py @@ -387,11 +387,11 @@ def box_series(typ, val, c): # and also add common binops (e.g. 
add, sub, mul, div) -def generate_series_reduction(reduction, reduction_method): - @overload_method(SeriesType, reduction) +def generate_series_reduction(ser_reduction, ser_method): + @overload_method(SeriesType, ser_reduction) def series_reduction(series): def series_reduction_impl(series): - return reduction_method(series.values) + return ser_method(series.values) return series_reduction_impl @@ -436,8 +436,8 @@ def series_binop_impl(series1, value): series_binops = [operator.add, operator.sub, operator.mul, operator.truediv] -for binop in series_binops: - generate_series_binop(binop) +for ser_binop in series_binops: + generate_series_binop(ser_binop) # get_loc on Index diff --git a/pandas/core/apply.py b/pandas/core/apply.py index b79399af6af29..e9bed6d56a9a0 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -812,6 +812,7 @@ def result_columns(self) -> Index: def series_generator(self) -> Generator[Series, None, None]: pass + @staticmethod @functools.cache @abc.abstractmethod def generate_numba_apply_func( From c143c677280fc5c9d455019d29f7daaa67453185 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 25 Sep 2023 14:21:09 -0400 Subject: [PATCH 07/18] fix pyright --- pandas/core/apply.py | 5 +++++ pyproject.toml | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index e9bed6d56a9a0..73a76e9121c2e 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -1,3 +1,8 @@ +# pyright: reportUnusedImport=false +# Disabled since there's no way to do an ignore for both pyright +# and ruff, and ruff should be sufficient +# (The reason we need this is because the import of the numba extensions is unused +# but is necessary to register the extensions) from __future__ import annotations import abc diff --git a/pyproject.toml b/pyproject.toml index 4e1c77413efda..929376b42e78e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -735,7 +735,7 @@ pythonVersion = "3.11" typeCheckingMode = "basic" useLibraryCodeForTypes = false include = ["pandas", "typings"] -exclude = ["pandas/tests", "pandas/io/clipboard", "pandas/util/version"] +exclude = ["pandas/tests", "pandas/io/clipboard", "pandas/util/version", "pandas/core/_numba/extensions.py"] # enable subset of "strict" reportDuplicateImport = true reportInconsistentConstructor = true From 0d827c4e2b009e6fc9b49b7ddb38ee3555a60dd7 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Thu, 28 Sep 2023 16:45:24 -0400 Subject: [PATCH 08/18] update docs --- pandas/core/_numba/extensions.py | 2 -- pandas/core/frame.py | 6 ++++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/_numba/extensions.py b/pandas/core/_numba/extensions.py index da3ca86628b6b..9eb232b29b8bd 100644 --- a/pandas/core/_numba/extensions.py +++ b/pandas/core/_numba/extensions.py @@ -52,7 +52,6 @@ def __init__(self, dtype, layout, pyclass: any) -> None: self.dtype = dtype self.layout = layout super().__init__(name) - # super().__init__(dtype, 1, layout) @property def key(self): @@ -126,7 +125,6 @@ def typeof_series(val, c): def type_series_constructor(context): def typer(data, index, name=None): if isinstance(index, IndexType) and isinstance(data, types.Array): - # assert data.layout == "C" assert data.ndim == 1 if name is None: name = types.intp diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3e32a6d93b023..7931bf232c75a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ 
-10001,6 +10001,10 @@ def apply( - nogil (release the GIL inside the JIT compiled function) - parallel (try to apply the function in parallel over the DataFrame) + Note: Due to limitations within numba/how pandas interfaces with numba, + you should only use this if raw=True + + Note: The numba compiler only supports a subset of valid Python/numpy operations. @@ -10010,8 +10014,6 @@ def apply( `_ in numba to learn what you can or cannot use in the passed function. - As of right now, the numba engine can only be used with raw=True. - .. versionadded:: 2.2.0 engine_kwargs : dict From f4e80a6a73ad908c1cfa3847f59667c6c164df19 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Fri, 29 Sep 2023 17:51:46 -0400 Subject: [PATCH 09/18] eliminate a blank line --- pandas/core/frame.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 681caef6c74fe..033f8e6d913c8 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10054,7 +10054,6 @@ def apply( Note: Due to limitations within numba/how pandas interfaces with numba, you should only use this if raw=True - Note: The numba compiler only supports a subset of valid Python/numpy operations. From 21e2186488c533e821dd9c03698de31d8897b676 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Fri, 6 Oct 2023 21:02:18 -0400 Subject: [PATCH 10/18] update from code review + more tests --- pandas/core/_numba/extensions.py | 36 ++++++++--- pandas/core/apply.py | 86 ++++++++++++++++++++------ pandas/tests/apply/conftest.py | 12 ++++ pandas/tests/apply/test_frame_apply.py | 22 +++---- pandas/tests/apply/test_numba.py | 64 +++++++++++++++++++ 5 files changed, 183 insertions(+), 37 deletions(-) diff --git a/pandas/core/_numba/extensions.py b/pandas/core/_numba/extensions.py index 9eb232b29b8bd..9bcb27b964141 100644 --- a/pandas/core/_numba/extensions.py +++ b/pandas/core/_numba/extensions.py @@ -13,10 +13,8 @@ import operator import numba -from numba.core import ( - cgutils, - types, -) +from numba import types +from numba.core import cgutils from numba.core.datamodel import models from numba.core.extending import ( NativeValue, @@ -40,7 +38,7 @@ # TODO: Range index support -# (not passing an index to series constructor doesn't work) +# (this currently lowers OK, but does not round-trip) class IndexType(types.Type): """ The type class for Index objects. @@ -149,6 +147,7 @@ def typer(data, hashmap=None): @register_model(IndexType) class IndexModel(models.StructModel): def __init__(self, dmm, fe_type) -> None: + # We don't want the numpy string scalar type in our hashmap members = [ ("data", fe_type.as_array), # This is an attempt to emulate our hashtable code with a numba @@ -240,6 +239,25 @@ def index_impl(data): return context.compile_internal(builder, index_impl, sig, args) +# Helper to convert the unicodecharseq (numpy string scalar) into a unicode_type +# (regular string) + + +def maybe_cast_str(x): + # Dummy function that numba can overload + pass + + +@overload(maybe_cast_str) +def maybe_cast_str_impl(x): + """Converts numba UnicodeCharSeq (numpy string scalar) -> unicode type (string). 
+ Is a no-op for other types.""" + if isinstance(x, types.UnicodeCharSeq): + return lambda x: str(x) + else: + return lambda x: x + + @unbox(IndexType) def unbox_index(typ, obj, c): """ @@ -426,8 +444,12 @@ def series_binop_impl(series1, value): series_reductions = [ ("sum", np.sum), ("mean", np.mean), - ("std", np.std), - ("var", np.var), + # Disabled due to discrepancies between numba std. dev + # and pandas std. dev (no way to specify dof) + # ("std", np.std), + # ("var", np.var), + ("min", np.min), + ("max", np.max), ] for reduction, reduction_method in series_reductions: generate_series_reduction(reduction, reduction_method) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 73a76e9121c2e..71c60fd60ad8a 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -1075,6 +1075,14 @@ def apply_series_generator(self) -> tuple[ResType, Index]: return results, res_index def apply_series_numba(self): + if self.engine_kwargs.get("parallel", False): + raise NotImplementedError( + "Parallel apply is not supported when raw=False and engine='numba'" + ) + if not self.obj.index.is_unique or not self.columns.is_unique: + raise NotImplementedError( + "The index/columns must be unique when raw=False and engine='numba'" + ) results = self.apply_with_numba() return results, self.result_index @@ -1128,6 +1136,7 @@ def generate_numba_apply_func( # This isn't an entrypoint since we don't want users # using Series/DF in numba code outside of apply from pandas.core._numba.extensions import SeriesType # noqa: F401 + from pandas.core._numba.extensions import maybe_cast_str numba = import_optional_dependency("numba") @@ -1138,7 +1147,9 @@ def numba_func(values, col_names, df_index): results = {} for j in range(values.shape[1]): # Create the series - ser = Series(values[:, j], index=df_index, name=str(col_names[j])) + ser = Series( + values[:, j], index=df_index, name=maybe_cast_str(col_names[j]) + ) results[j] = jitted_udf(ser) return results @@ -1148,23 +1159,40 @@ def apply_with_numba(self) -> dict[int, Any]: nb_func = self.generate_numba_apply_func( cast(Callable, self.func), **self.engine_kwargs ) - orig_values = self.columns.to_numpy() - fixed_cols = False - if orig_values.dtype == object: - if not lib.is_string_array(orig_values): + # Since numpy/numba doesn't support object array of stringswell + # we'll do a sketchy thing where if index._data is object + # we convert to string and directly set index._data to that, + # setting it back after we call the function + fixed_obj_colnames = False + orig_cols = self.columns.to_numpy() + if self.columns._data.dtype == object: + if not lib.is_string_array(orig_cols): raise ValueError( "The numba engine only supports " "using string or numeric column names" ) - col_names_values = orig_values.astype("U") - # Remember to set this back! - self.columns._data = col_names_values - fixed_cols = True + # Remember to set this back!!! + self.columns._data = orig_cols.astype("U") + fixed_obj_colnames = True + + fixed_obj_index = False + orig_index = self.index.to_numpy() + if self.obj.index._data.dtype == object: + if not lib.is_string_array(orig_index): + raise ValueError( + "The numba engine only supports " + "using string or numeric index values" + ) + # Remember to set this back!!! 
+ self.obj.index._data = orig_index.astype("U") + fixed_obj_index = True df_index = self.obj.index res = dict(nb_func(self.values, self.columns, df_index)) - if fixed_cols: - self.columns._data = orig_values + if fixed_obj_colnames: + self.columns._data = orig_cols + if fixed_obj_index: + self.obj.index._data = orig_index return res @property @@ -1260,6 +1288,7 @@ def generate_numba_apply_func( # using Series/DF in numba code outside of apply from pandas import Series from pandas.core._numba.extensions import SeriesType # noqa: F401 + from pandas.core._numba.extensions import maybe_cast_str numba = import_optional_dependency("numba") @@ -1271,7 +1300,11 @@ def numba_func(values, col_names_index, index): for i in range(values.shape[0]): # Create the series # TODO: values corrupted without the copy - ser = Series(values[i].copy(), index=col_names_index, name=index[i]) + ser = Series( + values[i].copy(), + index=col_names_index, + name=maybe_cast_str(index[i]), + ) results[i] = jitted_udf(ser) return results @@ -1287,24 +1320,39 @@ def apply_with_numba(self) -> dict[int, Any]: # we'll do a sketchy thing where if index._data is object # we convert to string and directly set index._data to that, # setting it back after we call the function - fixed_obj_dtype = False - orig_data = self.columns.to_numpy() + fixed_obj_colnames = False + orig_cols = self.columns.to_numpy() if self.columns._data.dtype == object: - if not lib.is_string_array(orig_data): + if not lib.is_string_array(orig_cols): raise ValueError( "The numba engine only supports " "using string or numeric column names" ) # Remember to set this back!!! - self.columns._data = orig_data.astype("U") - fixed_obj_dtype = True + self.columns._data = orig_cols.astype("U") + fixed_obj_colnames = True + + fixed_obj_index = False + orig_index = self.index.to_numpy() + if self.obj.index._data.dtype == object: + if not lib.is_string_array(orig_index): + raise ValueError( + "The numba engine only supports " + "using string or numeric index values" + ) + # Remember to set this back!!! 
+ self.obj.index._data = orig_index.astype("U") + fixed_obj_index = True # Convert from numba dict to regular dict # Our isinstance checks in the df constructor don't pass for numbas typed dict res = dict(nb_func(self.values, self.columns, self.obj.index)) - if fixed_obj_dtype: - self.columns._data = orig_data + if fixed_obj_colnames: + self.columns._data = orig_cols + + if fixed_obj_index: + self.obj.index._data = orig_index return res diff --git a/pandas/tests/apply/conftest.py b/pandas/tests/apply/conftest.py index b68c6235cb0b8..7ed9fc88c3aea 100644 --- a/pandas/tests/apply/conftest.py +++ b/pandas/tests/apply/conftest.py @@ -16,3 +16,15 @@ def int_frame_const_col(): columns=["A", "B", "C"], ) return df + + +@pytest.fixture(params=["python", "numba"]) +def engine(request): + if request.param == "numba": + pytest.importorskip("numba") + return request.param + + +@pytest.fixture(params=[0, 1]) +def apply_axis(request): + return request.param diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index 8767c7f0f45d1..a3a95ccd75064 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -18,13 +18,6 @@ from pandas.tests.frame.common import zip_frames -@pytest.fixture(params=["python", "numba"]) -def engine(request): - if request.param == "numba": - pytest.importorskip("numba") - return request.param - - def test_apply(float_frame, engine, request): if engine == "numba": mark = pytest.mark.xfail(reason="numba engine not supporting numpy ufunc yet") @@ -102,7 +95,7 @@ def test_apply_mixed_datetimelike(): @pytest.mark.parametrize("func", [np.sqrt, np.mean]) -def test_apply_empty(func, engine=engine): +def test_apply_empty(func, engine): # empty empty_frame = DataFrame() @@ -983,7 +976,7 @@ def test_result_type_shorter_list(int_frame_const_col): tm.assert_frame_equal(result, expected) -def test_result_type_broadcast(int_frame_const_col, request): +def test_result_type_broadcast(int_frame_const_col, request, engine): # result_type should be consistent no matter which # path we take in the code if engine == "numba": @@ -991,7 +984,9 @@ def test_result_type_broadcast(int_frame_const_col, request): request.node.add_marker(mark) df = int_frame_const_col # broadcast result - result = df.apply(lambda x: [1, 2, 3], axis=1, result_type="broadcast") + result = df.apply( + lambda x: [1, 2, 3], axis=1, result_type="broadcast", engine=engine + ) expected = df.copy() tm.assert_frame_equal(result, expected) @@ -1550,8 +1545,13 @@ def sum_div2(s): tm.assert_frame_equal(result, expected) -def test_apply_getitem_axis_1(engine): +def test_apply_getitem_axis_1(engine, request): # GH 13427 + if engine == "numba": + mark = pytest.mark.xfail( + reason="numba engine not supporting duplicate index values" + ) + request.node.add_marker(mark) df = DataFrame({"a": [0, 1, 2], "b": [1, 2, 3]}) result = df[["a", "a"]].apply( lambda x: x.iloc[0] + x.iloc[1], axis=1, engine=engine diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py index e69de29bb2d1d..9e05b5316dc15 100644 --- a/pandas/tests/apply/test_numba.py +++ b/pandas/tests/apply/test_numba.py @@ -0,0 +1,64 @@ +import numpy as np +import pytest + +from pandas import DataFrame +import pandas._testing as tm + + +def test_numba_vs_python_noop(float_frame, apply_axis): + func = lambda x: x + result = float_frame.apply(func, engine="numba", axis=apply_axis) + expected = float_frame.apply(func, engine="python", axis=apply_axis) + tm.assert_frame_equal(result, 
expected) + + +def test_numba_vs_python_indexing(float_frame): + row_func = lambda x: x["A"] + result = float_frame.apply(row_func, engine="numba", axis=1) + expected = float_frame.apply(row_func, engine="python", axis=1) + tm.assert_series_equal(result, expected) + + row_func = lambda x: x["ZqgszYBfuL"] # This is a label in the index + result = float_frame.apply(row_func, engine="numba", axis=0) + expected = float_frame.apply(row_func, engine="python", axis=0) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "reduction", + [lambda x: x.mean(), lambda x: x.min(), lambda x: x.max(), lambda x: x.sum()], +) +def test_numba_vs_python_reductions(float_frame, reduction, apply_axis): + result = float_frame.apply(reduction, engine="numba", axis=apply_axis) + expected = float_frame.apply(reduction, engine="python", axis=apply_axis) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("colnames", [[1, 2, 3], [1.0, 2.0, 3.0]]) +def test_numba_numeric_colnames(colnames): + # Check that numeric column names lower properly and can be indxed on + df = DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), columns=colnames) + first_col = colnames[0] + f = lambda x: x[first_col] # Get the first column + result = df.apply(f, engine="numba", axis=1) + expected = df.apply(f, engine="python", axis=1) + tm.assert_series_equal(result, expected) + + +def test_numba_parallel_unsupported(float_frame): + f = lambda x: x + with pytest.raises( + NotImplementedError, + match="Parallel apply is not supported when raw=False and engine='numba'", + ): + float_frame.apply(f, engine="numba", engine_kwargs={"parallel": True}) + + +def test_numba_nonunique_unsupported(): + f = lambda x: x + df = DataFrame({"a": [1, 2], "b": [1, 2]}) + with pytest.raises( + NotImplementedError, + match="The index/columns must be unique when raw=False and engine='numba'", + ): + df.apply(f, engine="numba", engine_kwargs={"parallel": True}) From ba1d0e0b31767928c6e95e7353325bdd8a262d79 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Tue, 10 Oct 2023 10:02:30 -0400 Subject: [PATCH 11/18] fix failing tests --- pandas/tests/apply/test_numba.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py index 9e05b5316dc15..dd186bb8a57bb 100644 --- a/pandas/tests/apply/test_numba.py +++ b/pandas/tests/apply/test_numba.py @@ -1,7 +1,10 @@ import numpy as np import pytest -from pandas import DataFrame +from pandas import ( + DataFrame, + Index, +) import pandas._testing as tm @@ -54,11 +57,11 @@ def test_numba_parallel_unsupported(float_frame): float_frame.apply(f, engine="numba", engine_kwargs={"parallel": True}) -def test_numba_nonunique_unsupported(): +def test_numba_nonunique_unsupported(apply_axis): f = lambda x: x - df = DataFrame({"a": [1, 2], "b": [1, 2]}) + df = DataFrame({"a": [1, 2]}, index=Index(["a", "a"])) with pytest.raises( NotImplementedError, match="The index/columns must be unique when raw=False and engine='numba'", ): - df.apply(f, engine="numba", engine_kwargs={"parallel": True}) + df.apply(f, engine="numba", axis=apply_axis) From 088d27f25e8a892e0aa3ce9336540ce40e4ae7fe Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Thu, 12 Oct 2023 11:16:28 -0400 Subject: [PATCH 12/18] Simplify w/ context manager --- pandas/core/_numba/extensions.py | 39 ++++++++++++--- pandas/core/apply.py | 83 
++++++-------------------------- 2 files changed, 45 insertions(+), 77 deletions(-) diff --git a/pandas/core/_numba/extensions.py b/pandas/core/_numba/extensions.py index 9bcb27b964141..4e1f59c6d4e96 100644 --- a/pandas/core/_numba/extensions.py +++ b/pandas/core/_numba/extensions.py @@ -10,6 +10,7 @@ from __future__ import annotations +from contextlib import contextmanager import operator import numba @@ -32,11 +33,34 @@ from numba.core.imputils import impl_ret_borrowed import numpy as np +from pandas._libs import lib + from pandas.core.indexes.base import Index from pandas.core.indexing import _iLocIndexer from pandas.core.series import Series +# Helper function to hack around fact that Index casts numpy string dtype to object +# +# Idea is to set an attribute on a Index called _numba_data +# that is the original data, or the object data casted to numpy string dtype, +# with a context manager that is unset afterwards +@contextmanager +def set_numba_data(index: Index): + numba_data = index._data + if numba_data.dtype == object: + if not lib.is_string_array(numba_data): + raise ValueError( + "The numba engine only supports using string or numeric column names" + ) + numba_data = numba_data.astype("U") + try: + index._numba_data = numba_data + yield index + finally: + del index._numba_data + + # TODO: Range index support # (this currently lowers OK, but does not round-trip) class IndexType(types.Type): @@ -104,7 +128,8 @@ def typeof_index(val, c): index. (you should check this before this gets lowered down to numba) """ - arrty = typeof_impl(val._data, c) + # arrty = typeof_impl(val._data, c) + arrty = typeof_impl(val._numba_data, c) assert arrty.ndim == 1 return IndexType(arrty.dtype, arrty.layout, type(val)) @@ -263,18 +288,17 @@ def unbox_index(typ, obj, c): """ Convert a Index object to a native structure. - If it is object dtype, we'll attempt to cast it to one of - numpy's string dtypes. 
- (you are responsible for validating that the Index contains only - strings if its object type before lowering it to numba) + Note: Object dtype is not allowed here """ - data_obj = c.pyapi.object_getattr_string(obj, "_data") + # data_obj = c.pyapi.object_getattr_string(obj, "_data") + data_obj = c.pyapi.object_getattr_string(obj, "_numba_data") + # data_obj = c.pyapi.object_getattr_string(obj, "_numba_data") index = cgutils.create_struct_proxy(typ)(c.context, c.builder) # If we see an object array, assume its been validated as only containing strings # We still need to do the conversion though index.data = c.unbox(typ.as_array, data_obj).value typed_dict_obj = c.pyapi.unserialize(c.pyapi.serialize_object(numba.typed.Dict)) - # Create an empty typed dict in numba for the hasmap for indexing + # Create an empty typed dict in numba for the hashmap for indexing # equiv of numba.typed.Dict.empty(typ.dtype, types.intp) arr_type_obj = c.pyapi.unserialize(c.pyapi.serialize_object(typ.dtype)) intp_type_obj = c.pyapi.unserialize(c.pyapi.serialize_object(types.intp)) @@ -353,7 +377,6 @@ def box_index(typ, val, c): # this is basically Index._simple_new(array_obj, name_obj) in python index_obj = c.pyapi.call_method(class_obj, "_simple_new", (array_obj,)) index.parent = index_obj - c.pyapi.print_object(index.parent) c.builder.store(index_obj, res) # Decrefs diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 71c60fd60ad8a..06d3e8834c821 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -1156,43 +1156,17 @@ def numba_func(values, col_names, df_index): return numba_func def apply_with_numba(self) -> dict[int, Any]: + from pandas.core._numba.extensions import set_numba_data + nb_func = self.generate_numba_apply_func( cast(Callable, self.func), **self.engine_kwargs ) - # Since numpy/numba doesn't support object array of stringswell - # we'll do a sketchy thing where if index._data is object - # we convert to string and directly set index._data to that, - # setting it back after we call the function - fixed_obj_colnames = False - orig_cols = self.columns.to_numpy() - if self.columns._data.dtype == object: - if not lib.is_string_array(orig_cols): - raise ValueError( - "The numba engine only supports " - "using string or numeric column names" - ) - # Remember to set this back!!! - self.columns._data = orig_cols.astype("U") - fixed_obj_colnames = True - - fixed_obj_index = False - orig_index = self.index.to_numpy() - if self.obj.index._data.dtype == object: - if not lib.is_string_array(orig_index): - raise ValueError( - "The numba engine only supports " - "using string or numeric index values" - ) - # Remember to set this back!!! 
- self.obj.index._data = orig_index.astype("U") - fixed_obj_index = True - df_index = self.obj.index - - res = dict(nb_func(self.values, self.columns, df_index)) - if fixed_obj_colnames: - self.columns._data = orig_cols - if fixed_obj_index: - self.obj.index._data = orig_index + # Convert from numba dict to regular dict + # Our isinstance checks in the df constructor don't pass for numbas typed dict + with set_numba_data(self.obj.index) as index, set_numba_data( + self.columns + ) as columns: + res = dict(nb_func(self.values, columns, index)) return res @property @@ -1312,47 +1286,18 @@ def numba_func(values, col_names_index, index): return numba_func def apply_with_numba(self) -> dict[int, Any]: + from pandas.core._numba.extensions import set_numba_data + nb_func = self.generate_numba_apply_func( cast(Callable, self.func), **self.engine_kwargs ) - # Since numpy/numba doesn't support object array of stringswell - # we'll do a sketchy thing where if index._data is object - # we convert to string and directly set index._data to that, - # setting it back after we call the function - fixed_obj_colnames = False - orig_cols = self.columns.to_numpy() - if self.columns._data.dtype == object: - if not lib.is_string_array(orig_cols): - raise ValueError( - "The numba engine only supports " - "using string or numeric column names" - ) - # Remember to set this back!!! - self.columns._data = orig_cols.astype("U") - fixed_obj_colnames = True - - fixed_obj_index = False - orig_index = self.index.to_numpy() - if self.obj.index._data.dtype == object: - if not lib.is_string_array(orig_index): - raise ValueError( - "The numba engine only supports " - "using string or numeric index values" - ) - # Remember to set this back!!! - self.obj.index._data = orig_index.astype("U") - fixed_obj_index = True - # Convert from numba dict to regular dict # Our isinstance checks in the df constructor don't pass for numbas typed dict - res = dict(nb_func(self.values, self.columns, self.obj.index)) - - if fixed_obj_colnames: - self.columns._data = orig_cols - - if fixed_obj_index: - self.obj.index._data = orig_index + with set_numba_data(self.obj.index) as index, set_numba_data( + self.columns + ) as columns: + res = dict(nb_func(self.values, columns, index)) return res From 60539a1fa677a4f41068f0fd8fe4f1a277169084 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Thu, 12 Oct 2023 14:28:03 -0400 Subject: [PATCH 13/18] skip if no numba --- pandas/tests/apply/test_numba.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py index dd186bb8a57bb..9b89bfdad5b51 100644 --- a/pandas/tests/apply/test_numba.py +++ b/pandas/tests/apply/test_numba.py @@ -1,12 +1,16 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas import ( DataFrame, Index, ) import pandas._testing as tm +pytestmark = td.skip_if_no("numba") + def test_numba_vs_python_noop(float_frame, apply_axis): func = lambda x: x From 76538d696ae5a4ed19418b134b0ed66037e60a19 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Thu, 12 Oct 2023 14:35:50 -0400 Subject: [PATCH 14/18] simplify more --- pandas/core/_numba/extensions.py | 9 --------- pandas/core/apply.py | 29 ++++++++--------------------- 2 files changed, 8 insertions(+), 30 deletions(-) diff --git a/pandas/core/_numba/extensions.py b/pandas/core/_numba/extensions.py index 4e1f59c6d4e96..ebe2a752a12f7 100644 --- 
a/pandas/core/_numba/extensions.py +++ b/pandas/core/_numba/extensions.py @@ -266,8 +266,6 @@ def index_impl(data): # Helper to convert the unicodecharseq (numpy string scalar) into a unicode_type # (regular string) - - def maybe_cast_str(x): # Dummy function that numba can overload pass @@ -290,9 +288,7 @@ def unbox_index(typ, obj, c): Note: Object dtype is not allowed here """ - # data_obj = c.pyapi.object_getattr_string(obj, "_data") data_obj = c.pyapi.object_getattr_string(obj, "_numba_data") - # data_obj = c.pyapi.object_getattr_string(obj, "_numba_data") index = cgutils.create_struct_proxy(typ)(c.context, c.builder) # If we see an object array, assume its been validated as only containing strings # We still need to do the conversion though @@ -396,7 +392,6 @@ def box_series(typ, val, c): array_obj = c.box(typ.as_array, series.values) name_obj = c.box(typ.namety, series.name) true_obj = c.pyapi.unserialize(c.pyapi.serialize_object(True)) - # TODO: Is borrowing none here safe? # This is equivalent of # pd.Series(data=array_obj, index=index_obj, dtype=None, # name=name_obj, copy=None, fastpath=True) @@ -424,8 +419,6 @@ def box_series(typ, val, c): # Add common series reductions (e.g. mean, sum), # and also add common binops (e.g. add, sub, mul, div) - - def generate_series_reduction(ser_reduction, ser_method): @overload_method(SeriesType, ser_reduction) def series_reduction(series): @@ -497,8 +490,6 @@ def index_get_loc_impl(index, item): # Indexing for Series/Index - - @overload(operator.getitem) def series_indexing(series, item): if isinstance(series, SeriesType): diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 06d3e8834c821..ace37c678b639 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -1,8 +1,3 @@ -# pyright: reportUnusedImport=false -# Disabled since there's no way to do an ignore for both pyright -# and ruff, and ruff should be sufficient -# (The reason we need this is because the import of the numba extensions is unused -# but is necessary to register the extensions) from __future__ import annotations import abc @@ -1130,16 +1125,13 @@ def series_generator(self) -> Generator[Series, None, None]: def generate_numba_apply_func( func, nogil=True, nopython=True, parallel=False ) -> Callable[[npt.NDArray, Index, Index], dict[int, Any]]: + numba = import_optional_dependency("numba") from pandas import Series - # Dummy import just to make the extensions loaded in - # This isn't an entrypoint since we don't want users - # using Series/DF in numba code outside of apply - from pandas.core._numba.extensions import SeriesType # noqa: F401 + # Import helper from extensions to cast string object -> np strings + # Note: This also has the side effect of loading our numba extensions from pandas.core._numba.extensions import maybe_cast_str - numba = import_optional_dependency("numba") - jitted_udf = numba.extending.register_jitable(func) @numba.jit(nogil=nogil, nopython=nopython, parallel=parallel) @@ -1156,11 +1148,11 @@ def numba_func(values, col_names, df_index): return numba_func def apply_with_numba(self) -> dict[int, Any]: - from pandas.core._numba.extensions import set_numba_data - nb_func = self.generate_numba_apply_func( cast(Callable, self.func), **self.engine_kwargs ) + from pandas.core._numba.extensions import set_numba_data + # Convert from numba dict to regular dict # Our isinstance checks in the df constructor don't pass for numbas typed dict with set_numba_data(self.obj.index) as index, set_numba_data( @@ -1257,15 +1249,10 @@ def 
series_generator(self) -> Generator[Series, None, None]: def generate_numba_apply_func( func, nogil=True, nopython=True, parallel=False ) -> Callable[[npt.NDArray, Index, Index], dict[int, Any]]: - # Dummy import just to make the extensions loaded in - # This isn't an entrypoint since we don't want users - # using Series/DF in numba code outside of apply + numba = import_optional_dependency("numba") from pandas import Series - from pandas.core._numba.extensions import SeriesType # noqa: F401 from pandas.core._numba.extensions import maybe_cast_str - numba = import_optional_dependency("numba") - jitted_udf = numba.extending.register_jitable(func) @numba.jit(nogil=nogil, nopython=nopython, parallel=parallel) @@ -1286,12 +1273,12 @@ def numba_func(values, col_names_index, index): return numba_func def apply_with_numba(self) -> dict[int, Any]: - from pandas.core._numba.extensions import set_numba_data - nb_func = self.generate_numba_apply_func( cast(Callable, self.func), **self.engine_kwargs ) + from pandas.core._numba.extensions import set_numba_data + # Convert from numba dict to regular dict # Our isinstance checks in the df constructor don't pass for numbas typed dict with set_numba_data(self.obj.index) as index, set_numba_data( From cca34f937e0de1ca298965679e97468a9c399657 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Thu, 12 Oct 2023 18:40:00 -0400 Subject: [PATCH 15/18] specify dtypes --- pandas/tests/apply/test_numba.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py index 9b89bfdad5b51..5ae57f18d8467 100644 --- a/pandas/tests/apply/test_numba.py +++ b/pandas/tests/apply/test_numba.py @@ -44,7 +44,9 @@ def test_numba_vs_python_reductions(float_frame, reduction, apply_axis): @pytest.mark.parametrize("colnames", [[1, 2, 3], [1.0, 2.0, 3.0]]) def test_numba_numeric_colnames(colnames): # Check that numeric column names lower properly and can be indxed on - df = DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), columns=colnames) + df = DataFrame( + np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.int64), columns=colnames + ) first_col = colnames[0] f = lambda x: x[first_col] # Get the first column result = df.apply(f, engine="numba", axis=1) From f86024fbe094a0942e4474d60309130543670687 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 16 Oct 2023 16:57:11 -0400 Subject: [PATCH 16/18] address code review --- pandas/core/apply.py | 4 ++++ pandas/tests/apply/test_numba.py | 18 +++++++++++------- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index ace37c678b639..c8569fff564ba 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -1134,6 +1134,8 @@ def generate_numba_apply_func( jitted_udf = numba.extending.register_jitable(func) + # Currently the parallel argument doesn't get passed through here + # (it's disabled) since the dicts in numba aren't thread-safe. @numba.jit(nogil=nogil, nopython=nopython, parallel=parallel) def numba_func(values, col_names, df_index): results = {} @@ -1258,6 +1260,8 @@ def generate_numba_apply_func( @numba.jit(nogil=nogil, nopython=nopython, parallel=parallel) def numba_func(values, col_names_index, index): results = {} + # Currently the parallel argument doesn't get passed through here + # (it's disabled) since the dicts in numba aren't thread-safe. 
for i in range(values.shape[0]): # Create the series # TODO: values corrupted without the copy diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py index 5ae57f18d8467..ba317b2a9fc1b 100644 --- a/pandas/tests/apply/test_numba.py +++ b/pandas/tests/apply/test_numba.py @@ -19,15 +19,19 @@ def test_numba_vs_python_noop(float_frame, apply_axis): tm.assert_frame_equal(result, expected) -def test_numba_vs_python_indexing(float_frame): - row_func = lambda x: x["A"] - result = float_frame.apply(row_func, engine="numba", axis=1) - expected = float_frame.apply(row_func, engine="python", axis=1) +def test_numba_vs_python_indexing(): + frame = DataFrame( + {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7.0, 8.0, 9.0]}, + index=Index(["A", "B", "C"]), + ) + row_func = lambda x: x["c"] + result = frame.apply(row_func, engine="numba", axis=1) + expected = frame.apply(row_func, engine="python", axis=1) tm.assert_series_equal(result, expected) - row_func = lambda x: x["ZqgszYBfuL"] # This is a label in the index - result = float_frame.apply(row_func, engine="numba", axis=0) - expected = float_frame.apply(row_func, engine="python", axis=0) + col_func = lambda x: x["A"] + result = frame.apply(col_func, engine="numba", axis=0) + expected = frame.apply(col_func, engine="python", axis=0) tm.assert_series_equal(result, expected) From a15293d5f718d411b26e9af37654e5eb4d4be784 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Wed, 18 Oct 2023 21:50:16 -0400 Subject: [PATCH 17/18] add errors for invalid columns --- pandas/core/apply.py | 17 +++++++++++++++++ pandas/tests/apply/test_numba.py | 18 ++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index c8569fff564ba..dec1d0bc3f78f 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -38,7 +38,9 @@ from pandas.core.dtypes.cast import is_nested_object from pandas.core.dtypes.common import ( is_dict_like, + is_extension_array_dtype, is_list_like, + is_numeric_dtype, is_sequence, ) from pandas.core.dtypes.dtypes import ( @@ -824,6 +826,20 @@ def generate_numba_apply_func( def apply_with_numba(self): pass + def validate_values_for_numba(self): + # Validate column dtyps all OK + for colname, dtype in self.obj.dtypes.items(): + if not is_numeric_dtype(dtype): + raise ValueError( + f"Column {colname} must have a numeric dtype." + f"Found '{dtype}' instead" + ) + if is_extension_array_dtype(dtype): + raise ValueError( + f"Column {colname} is backed by an extension array," + f"which is not supported by the numba engine." 
+ ) + @abc.abstractmethod def wrap_results_for_axis( self, results: ResType, res_index: Index @@ -1078,6 +1094,7 @@ def apply_series_numba(self): raise NotImplementedError( "The index/columns must be unique when raw=False and engine='numba'" ) + self.validate_values_for_numba() results = self.apply_with_numba() return results, self.result_index diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py index ba317b2a9fc1b..5537bca583eba 100644 --- a/pandas/tests/apply/test_numba.py +++ b/pandas/tests/apply/test_numba.py @@ -75,3 +75,21 @@ def test_numba_nonunique_unsupported(apply_axis): match="The index/columns must be unique when raw=False and engine='numba'", ): df.apply(f, engine="numba", axis=apply_axis) + + +def test_numba_unsupported_dtypes(apply_axis): + f = lambda x: x + df = DataFrame({"a": [1, 2], "b": ["a", "b"], "c": [4, 5]}) + df["c"] = df["c"].astype("double[pyarrow]") + + with pytest.raises( + ValueError, match="Column b must have a numeric dtype. Found 'object' instead" + ): + df.apply(f, engine="numba", axis=apply_axis) + + with pytest.raises( + ValueError, + match="Column c is backed by an extension array," + "which is not supported by the numba engine.", + ): + df["c"].to_frame().apply(f, engine="numba", axis=apply_axis) From 8fe5d8964a59df5b2b69298ece8a009ffba96455 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Thu, 19 Oct 2023 12:11:48 -0400 Subject: [PATCH 18/18] adjust message --- pandas/core/apply.py | 4 ++-- pandas/tests/apply/test_numba.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index dec1d0bc3f78f..3b79882d3c762 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -831,12 +831,12 @@ def validate_values_for_numba(self): for colname, dtype in self.obj.dtypes.items(): if not is_numeric_dtype(dtype): raise ValueError( - f"Column {colname} must have a numeric dtype." + f"Column {colname} must have a numeric dtype. " f"Found '{dtype}' instead" ) if is_extension_array_dtype(dtype): raise ValueError( - f"Column {colname} is backed by an extension array," + f"Column {colname} is backed by an extension array, " f"which is not supported by the numba engine." ) diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py index 5537bca583eba..7e1e44d2119f9 100644 --- a/pandas/tests/apply/test_numba.py +++ b/pandas/tests/apply/test_numba.py @@ -89,7 +89,7 @@ def test_numba_unsupported_dtypes(apply_axis): with pytest.raises( ValueError, - match="Column c is backed by an extension array," + match="Column c is backed by an extension array, " "which is not supported by the numba engine.", ): df["c"].to_frame().apply(f, engine="numba", axis=apply_axis)
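

A minimal sketch of the set_numba_data contract introduced in pandas/core/_numba/extensions.py, to make the context-manager comment in that file concrete. This is illustrative only, assumes these patches are applied, and the index values below are made-up examples.

from pandas import Index
from pandas.core._numba.extensions import set_numba_data

idx = Index(["a", "b", "c"])  # object dtype holding only strings

# Inside the block the Index temporarily carries _numba_data, the labels cast
# to a numpy "U" array, which typeof_index/unbox_index read instead of
# Index._data. Non-string object labels raise the "string or numeric column
# names" ValueError instead.
with set_numba_data(idx) as numba_idx:
    assert numba_idx._numba_data.dtype.kind == "U"

# The attribute is removed again on exit, leaving the Index untouched.
assert not hasattr(idx, "_numba_data")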
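
And a user-facing sketch of the engine="numba" path these patches add to DataFrame.apply, mirroring the cases exercised in pandas/tests/apply/test_numba.py. It assumes numba is installed and the patches above are applied; the frame contents, labels, and lambdas are made-up examples rather than anything from the series itself.

import pandas as pd

df = pd.DataFrame(
    {"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]},
    index=pd.Index(["x", "y", "z"]),
)

# Row-wise apply: each row reaches the jitted UDF as a Series indexed by the
# column labels, with the row label as its name.
row_sums = df.apply(lambda row: row.sum(), axis=1, engine="numba")

# Column-wise apply along axis=0 works the same way.
col_means = df.apply(lambda col: col.mean(), axis=0, engine="numba")

# engine_kwargs are forwarded to numba.jit; parallel=True is rejected when
# raw=False because the result dict is not thread-safe.
row_max = df.apply(
    lambda row: row.max(), axis=1, engine="numba", engine_kwargs={"nogil": True}
)

# Labels must be unique and column dtypes numeric (and not extension-backed),
# otherwise the checks added in apply_series_numba/validate_values_for_numba raise.
dup = pd.DataFrame({"a": [1, 2]}, index=pd.Index(["d", "d"]))
try:
    dup.apply(lambda row: row, axis=1, engine="numba")
except NotImplementedError:
    pass  # "The index/columns must be unique when raw=False and engine='numba'"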