From 1fa802c32044a0551e1b28e7ae3f902fc0f8605f Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 11 Sep 2023 21:04:08 -0400 Subject: [PATCH 01/18] ENH: Add numba engine to df.apply --- pandas/core/_numba/extensions.py | 448 +++++++++++++++++++++++++ pandas/core/apply.py | 117 ++++++- pandas/tests/apply/test_frame_apply.py | 6 +- 3 files changed, 567 insertions(+), 4 deletions(-) create mode 100644 pandas/core/_numba/extensions.py diff --git a/pandas/core/_numba/extensions.py b/pandas/core/_numba/extensions.py new file mode 100644 index 0000000000000..1627b4fc3987f --- /dev/null +++ b/pandas/core/_numba/extensions.py @@ -0,0 +1,448 @@ +""" +Utility classes/functions to let numba recognize +pandas Index/Series/DataFrame + +Mostly vendored from https://github.com/numba/numba/blob/main/numba/tests/pdlike_usecase.py +""" + +from __future__ import annotations + +import operator + +import numba +from numba.core import ( + cgutils, + types, +) +from numba.core.datamodel import models +from numba.core.extending import ( + NativeValue, + box, + lower_builtin, + make_attribute_wrapper, + overload, + overload_attribute, + overload_method, + register_model, + type_callable, + typeof_impl, + unbox, +) +from numba.core.imputils import impl_ret_borrowed +import numpy as np + +from pandas.core.indexes.base import Index +from pandas.core.indexing import _iLocIndexer +from pandas.core.series import Series + + +# TODO: Range index support +# (not passing an index to series constructor doesn't work) +class IndexType(types.Buffer): + """ + The type class for Index objects. + """ + + def __init__(self, dtype, layout, pyclass) -> None: + self.pyclass = pyclass + super().__init__(dtype, 1, layout) + + @property + def key(self): + return self.pyclass, self.dtype, self.layout + + @property + def as_array(self): + return types.Array(self.dtype, 1, self.layout) + + def copy(self, dtype=None, ndim: int = 1, layout=None): + assert ndim == 1 + if dtype is None: + dtype = self.dtype + layout = layout or self.layout + return type(self)(dtype, layout, self.pyclass) + + +class SeriesType(types.ArrayCompatible): + """ + The type class for Series objects. + """ + + def __init__(self, dtype, index, namety) -> None: + assert isinstance(index, IndexType) + self.dtype = dtype + self.index = index + self.values = types.Array(self.dtype, 1, "C") + self.namety = namety + name = f"series({dtype}, {index}, {namety})" + super().__init__(name) + + @property + def key(self): + return self.dtype, self.index, self.namety + + @property + def as_array(self): + return self.values + + def copy(self, dtype=None, ndim: int = 1, layout: str = "C"): + assert ndim == 1 + assert layout == "C" + if dtype is None: + dtype = self.dtype + return type(self)(dtype, self.index, self.namety) + + +@typeof_impl.register(Index) +def typeof_index(val, c): + """ + This will assume that only strings are in object dtype + index. 
+ (you should check this before this gets lowered down to numba) + """ + arrty = typeof_impl(val._data, c) + assert arrty.ndim == 1 + return IndexType(arrty.dtype, arrty.layout, type(val)) + + +@typeof_impl.register(Series) +def typeof_series(val, c): + index = typeof_impl(val.index, c) + arrty = typeof_impl(val.values, c) + namety = typeof_impl(val.name, c) + assert arrty.ndim == 1 + assert arrty.layout == "C" + return SeriesType(arrty.dtype, index, namety) + + +@type_callable(Series) +def type_series_constructor(context): + def typer(data, index, name=None): + if isinstance(index, IndexType) and isinstance(data, types.Array): + # assert data.layout == "C" + assert data.ndim == 1 + if name is None: + name = types.intp + return SeriesType(data.dtype, index, name) + + return typer + + +@type_callable(Index) +def type_index_constructor(context): + def typer(data, hashmap=None): + if isinstance(data, types.Array): + assert data.layout == "C" + assert data.ndim == 1 + assert hashmap is None or isinstance(hashmap, types.DictType) + return IndexType(data.dtype, layout=data.layout, pyclass=Index) + + return typer + + +# Backend extensions for Index and Series and Frame +@register_model(IndexType) +class IndexModel(models.StructModel): + def __init__(self, dmm, fe_type) -> None: + members = [ + ("data", fe_type.as_array), + # This is an attempt to emulate our hashtable code with a numba + # typed dict + # It maps from values in the index to their integer positions in the array + ("hashmap", types.DictType(fe_type.dtype, types.intp)), + ] + models.StructModel.__init__(self, dmm, fe_type, members) + + +@register_model(SeriesType) +class SeriesModel(models.StructModel): + def __init__(self, dmm, fe_type) -> None: + members = [ + ("index", fe_type.index), + ("values", fe_type.as_array), + ("name", fe_type.namety), + ] + models.StructModel.__init__(self, dmm, fe_type, members) + + +make_attribute_wrapper(IndexType, "data", "_data") +make_attribute_wrapper(IndexType, "hashmap", "hashmap") + +make_attribute_wrapper(SeriesType, "index", "index") +make_attribute_wrapper(SeriesType, "values", "values") +make_attribute_wrapper(SeriesType, "name", "name") + + +@lower_builtin(Series, types.Array, IndexType) +def pdseries_constructor(context, builder, sig, args): + data, index = args + series = cgutils.create_struct_proxy(sig.return_type)(context, builder) + series.index = index + series.values = data + series.name = context.get_constant(types.intp, 0) + return impl_ret_borrowed(context, builder, sig.return_type, series._getvalue()) + + +@lower_builtin(Series, types.Array, IndexType, types.intp) +@lower_builtin(Series, types.Array, IndexType, types.float64) +@lower_builtin(Series, types.Array, IndexType, types.unicode_type) +def pdseries_constructor(context, builder, sig, args): + data, index, name = args + series = cgutils.create_struct_proxy(sig.return_type)(context, builder) + series.index = index + series.values = data + series.name = name + return impl_ret_borrowed(context, builder, sig.return_type, series._getvalue()) + + +@lower_builtin(Index, types.Array, types.DictType) +def index_constructor_2arg(context, builder, sig, args): + (data, hashmap) = args + index = cgutils.create_struct_proxy(sig.return_type)(context, builder) + + index.data = data + index.hashmap = hashmap + return impl_ret_borrowed(context, builder, sig.return_type, index._getvalue()) + + +@lower_builtin(Index, types.Array) +def index_constructor_1arg(context, builder, sig, args): + from numba.typed import Dict + + key_type = 
sig.return_type.dtype + value_type = types.intp + + def index_impl(data): + return Index(data, Dict.empty(key_type, value_type)) + + return context.compile_internal(builder, index_impl, sig, args) + + +@unbox(IndexType) +def unbox_index(typ, obj, c): + """ + Convert a Index object to a native structure. + + If it is object dtype, we'll attempt to cast it to one of + numpy's string dtypes. + (you are responsible for validating that the Index contains only + strings if its object type before lowering it to numba) + """ + data_obj = c.pyapi.object_getattr_string(obj, "_data") + index = cgutils.create_struct_proxy(typ)(c.context, c.builder) + index.data = c.unbox(typ.as_array, data_obj).value + typed_dict_obj = c.pyapi.unserialize(c.pyapi.serialize_object(numba.typed.Dict)) + # Create an empty typed dict in numba + # equiv of numba.typed.Dict.empty(typ.dtype, types.intp) + arr_type_obj = c.pyapi.unserialize(c.pyapi.serialize_object(typ.dtype)) + intp_type_obj = c.pyapi.unserialize(c.pyapi.serialize_object(types.intp)) + hashmap_obj = c.pyapi.call_method( + typed_dict_obj, "empty", (arr_type_obj, intp_type_obj) + ) + index.hashmap = c.unbox(types.DictType(typ.dtype, types.intp), hashmap_obj).value + + # Decrefs + c.pyapi.decref(data_obj) + c.pyapi.decref(arr_type_obj) + c.pyapi.decref(intp_type_obj) + c.pyapi.decref(typed_dict_obj) + + return NativeValue(index._getvalue()) + + +@unbox(SeriesType) +def unbox_series(typ, obj, c): + """ + Convert a Series object to a native structure. + """ + index_obj = c.pyapi.object_getattr_string(obj, "index") + values_obj = c.pyapi.object_getattr_string(obj, "values") + name_obj = c.pyapi.object_getattr_string(obj, "name") + + series = cgutils.create_struct_proxy(typ)(c.context, c.builder) + series.index = c.unbox(typ.index, index_obj).value + series.values = c.unbox(typ.values, values_obj).value + series.name = c.unbox(typ.namety, name_obj).value + + # Decrefs + c.pyapi.decref(index_obj) + c.pyapi.decref(values_obj) + c.pyapi.decref(name_obj) + + return NativeValue(series._getvalue()) + + +@box(IndexType) +def box_index(typ, val, c): + """ + Convert a native index structure to a Index object. + + If our native index is of a numpy string dtype, we'll cast it to + object. + """ + # First build a Numpy array object, then wrap it in a Index + index = cgutils.create_struct_proxy(typ)(c.context, c.builder, value=val) + + # TODO: preserve the original class for the index + # Also need preserve the name of the Index + + # class_obj = c.pyapi.unserialize(c.pyapi.serialize_object(typ.pyclass)) + class_obj = c.pyapi.unserialize(c.pyapi.serialize_object(Index)) + array_obj = c.box(typ.as_array, index.data) + # this is basically Index._simple_new(array_obj, name_obj) in python + index_obj = c.pyapi.call_method(class_obj, "_simple_new", (array_obj,)) + + # Decrefs + c.pyapi.decref(class_obj) + c.pyapi.decref(array_obj) + return index_obj + + +@box(SeriesType) +def box_series(typ, val, c): + """ + Convert a native series structure to a Series object. + """ + series = cgutils.create_struct_proxy(typ)(c.context, c.builder, value=val) + class_obj = c.pyapi.unserialize(c.pyapi.serialize_object(Series)) + index_obj = c.box(typ.index, series.index) + array_obj = c.box(typ.as_array, series.values) + name_obj = c.box(typ.namety, series.name) + true_obj = c.pyapi.unserialize(c.pyapi.serialize_object(True)) + # TODO: Is borrowing none here safe? 
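+    # (borrow_none() returns a borrowed reference to Py_None; it fills the
+    # dtype and copy arguments in the call below)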
+ # This is equivalent of pd.Series(data=array_obj, index=index_obj, dtype=None, name=name_obj, copy=None, fastpath=True) + series_obj = c.pyapi.call_function_objargs( + class_obj, + ( + array_obj, + index_obj, + c.pyapi.borrow_none(), + name_obj, + c.pyapi.borrow_none(), + true_obj, + ), + ) + + # Decrefs + c.pyapi.decref(class_obj) + c.pyapi.decref(index_obj) + c.pyapi.decref(array_obj) + c.pyapi.decref(name_obj) + c.pyapi.decref(true_obj) + + return series_obj + + +# Add common series reductions + + +def generate_series_reduction(reduction, reduction_method): + @overload_method(SeriesType, reduction) + def series_reduction(series): + def series_reduction_impl(series): + return reduction_method(series.values) + + return series_reduction_impl + + return series_reduction + + +series_reductions = [ + ("sum", np.sum), + ("mean", np.mean), + ("std", np.std), + ("var", np.var), +] +for reduction, reduction_method in series_reductions: + generate_series_reduction(reduction, reduction_method) + + +# get_loc on Index +@overload_method(IndexType, "get_loc") +def index_get_loc(index, item): + def index_get_loc_impl(index, item): + # Initialize the hash table if not initalized + if len(index.hashmap) == 0: + for i, val in enumerate(index._data): + index.hashmap[val] = i + return index.hashmap[item] + + return index_get_loc_impl + + +# Indexing for Series + + +@overload(operator.getitem) +def series_indexing(series, item): + if isinstance(series, SeriesType): + + def series_getitem(series, item): + loc = series.index.get_loc(item) + return series.iloc[loc] + + return series_getitem + + +class IlocType(types.Type): + def __init__(self, obj_type) -> None: + self.obj_type = obj_type + name = f"iLocIndexer({obj_type})" + super().__init__(name=name) + + @property + def key(self): + return self.obj_type + + +@typeof_impl.register(_iLocIndexer) +def typeof_iloc(val, c): + objtype = typeof_impl(val.obj, c) + return IlocType(objtype) + + +@type_callable(_iLocIndexer) +def type_iloc_constructor(context): + def typer(obj): + if isinstance(obj, SeriesType): + return IlocType(obj) + + return typer + + +@lower_builtin(_iLocIndexer, SeriesType) +def iloc_constructor(context, builder, sig, args): + (obj,) = args + iloc_indexer = cgutils.create_struct_proxy(sig.return_type)(context, builder) + iloc_indexer.obj = obj + return impl_ret_borrowed( + context, builder, sig.return_type, iloc_indexer._getvalue() + ) + + +@register_model(IlocType) +class ILocModel(models.StructModel): + def __init__(self, dmm, fe_type) -> None: + members = [("obj", fe_type.obj_type)] + models.StructModel.__init__(self, dmm, fe_type, members) + + +make_attribute_wrapper(IlocType, "obj", "obj") + + +@overload_attribute(SeriesType, "iloc") +def series_iloc(series): + def get(series): + return _iLocIndexer(series) + + return get + + +@overload(operator.getitem) +def iloc_getitem(iloc_indexer, i): + if isinstance(iloc_indexer, IlocType): + + def getitem_impl(iloc_indexer, i): + return iloc_indexer.obj.values[i] + + return getitem_impl diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 4d6dd8f4fd577..a7264940be16f 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -2,6 +2,7 @@ import abc from collections import defaultdict +import functools from functools import partial import inspect from typing import ( @@ -29,6 +30,7 @@ NDFrameT, npt, ) +from pandas.compat._optional import import_optional_dependency from pandas.errors import SpecificationError from pandas.util._decorators import cache_readonly from 
pandas.util._exceptions import find_stack_level @@ -777,6 +779,15 @@ def result_columns(self) -> Index: def series_generator(self) -> Generator[Series, None, None]: pass + @property + @abc.abstractmethod + def generate_numba_apply_func(self) -> Callable[[npt.NDarray], dict[int, Any]]: + pass + + @abc.abstractmethod + def apply_with_numba(self): + pass + @abc.abstractmethod def wrap_results_for_axis( self, results: ResType, res_index: Index @@ -956,7 +967,13 @@ def apply_broadcast(self, target: DataFrame) -> DataFrame: return result def apply_standard(self): - results, res_index = self.apply_series_generator() + if self.engine == "python": + results, res_index = self.apply_series_generator() + else: + results, res_index = self.apply_series_numba() + + # print(results) + # print(res_index) # wrap results return self.wrap_results(results, res_index) @@ -980,6 +997,10 @@ def apply_series_generator(self) -> tuple[ResType, Index]: return results, res_index + def apply_series_numba(self): + results = self.apply_with_numba() + return results, self.result_index + def wrap_results(self, results: ResType, res_index: Index) -> DataFrame | Series: from pandas import Series @@ -1019,6 +1040,45 @@ class FrameRowApply(FrameApply): def series_generator(self) -> Generator[Series, None, None]: return (self.obj._ixs(i, axis=1) for i in range(len(self.columns))) + @staticmethod + @functools.cache + def generate_numba_apply_func( + func, nogil=True, nopython=True, parallel=False + ) -> Callable[[npt.NDarray], dict[int, Any]]: + from pandas import Series + + numba = import_optional_dependency("numba") + + jitted_udf = numba.extending.register_jitable(func) + + @numba.jit(nogil=nogil, nopython=nopython, parallel=parallel) + def numba_func(values, col_names, df_index): + results = {} + for j in range(values.shape[1]): + # Create the series + # TODO: No need for the str call? 
+ # Need to adapt types to accept UnicodeCharSeq in Series constructor + ser = Series(values[:, j], index=df_index, name=str(col_names[j])) + + results[j] = jitted_udf(ser) + + return results + + return numba_func + + def apply_with_numba(self) -> dict[int, Any]: + nb_func = self.generate_numba_apply_func(self.func, **self.engine_kwargs) + col_names_values = self.columns._data + if col_names_values.dtype == object: + if not lib.is_string_array(col_names_values): + raise ValueError( + "The numba engine only supports using string or numeric column names" + ) + col_names_values = col_names_values.astype("U") + df_index = self.obj.index + + return nb_func(self.values, col_names_values, df_index) + @property def result_index(self) -> Index: return self.columns @@ -1102,6 +1162,61 @@ def series_generator(self) -> Generator[Series, None, None]: object.__setattr__(ser, "_name", name) yield ser + @staticmethod + @functools.cache + def generate_numba_apply_func( + func, nogil=True, nopython=True, parallel=False + ) -> Callable[[npt.NDArray, npt.NDArray, npt.NDArray], dict[int, Any]]: + # Unused import just to register the extensions + + from pandas import ( + Index, + Series, + ) + + numba = import_optional_dependency("numba") + + jitted_udf = numba.extending.register_jitable(func) + + @numba.jit(nogil=nogil, nopython=nopython, parallel=parallel) + def numba_func(values, col_names, index_values): + results = {} + col_names_index = Index(col_names) + for i in range(values.shape[0]): + # Create the series + # TODO: values corrupted without the copy + ser = Series( + values[i].copy(), index=col_names_index, name=index_values[i] + ) + + results[i] = jitted_udf(ser) + + return results + + return numba_func + + def apply_with_numba(self) -> dict[int, Any]: + nb_func = self.generate_numba_apply_func(self.func, **self.engine_kwargs) + + # Unpack the index and repack it inside the jitted numba function + # This is since if we have object dtype and strings we want to convert + # to a numpy string dtype (and our regular index doesn't support numpy string dtypes) + col_names_values = self.columns._data + if col_names_values.dtype == object: + if not lib.is_string_array(col_names_values): + raise ValueError( + "The numba engine only supports using string or numeric column names" + ) + col_names_values = col_names_values.astype("U") + index_values = self.obj.index.values + + # Convert from numba dict to regular dict + # Our isinstance checks in the df constructor don't pass for numbas typed dict + result_nb_dict = nb_func(self.values, col_names_values, index_values) + result_keys, result_values = result_nb_dict.keys(), result_nb_dict.values() + return dict(zip(result_keys, result_values)) + # return dict(nb_func(self.values, col_names_values, index_values)) + @property def result_index(self) -> Index: return self.index diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index 3a3f73a68374b..d541886cecb09 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -94,14 +94,14 @@ def test_apply_empty(func): assert result.empty -def test_apply_float_frame(float_frame): +def test_apply_float_frame(float_frame, engine): no_rows = float_frame[:0] - result = no_rows.apply(lambda x: x.mean()) + result = no_rows.apply(lambda x: x.mean(), engine=engine) expected = Series(np.nan, index=float_frame.columns) tm.assert_series_equal(result, expected) no_cols = float_frame.loc[:, []] - result = no_cols.apply(lambda x: x.mean(), axis=1) + result = 
no_cols.apply(lambda x: x.mean(), axis=1, engine=engine) expected = Series(np.nan, index=float_frame.index) tm.assert_series_equal(result, expected) From 0ac544d750daae64377940ff6d95c5a1909cdcc0 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Wed, 13 Sep 2023 22:45:47 -0400 Subject: [PATCH 02/18] complete? --- pandas/core/_numba/extensions.py | 99 ++++++++++++++++++++++++++------ pandas/core/apply.py | 50 ++++++++-------- 2 files changed, 108 insertions(+), 41 deletions(-) diff --git a/pandas/core/_numba/extensions.py b/pandas/core/_numba/extensions.py index 1627b4fc3987f..d80f93e8b75b2 100644 --- a/pandas/core/_numba/extensions.py +++ b/pandas/core/_numba/extensions.py @@ -63,7 +63,7 @@ def copy(self, dtype=None, ndim: int = 1, layout=None): return type(self)(dtype, layout, self.pyclass) -class SeriesType(types.ArrayCompatible): +class SeriesType(types.Type): """ The type class for Series objects. """ @@ -150,6 +150,10 @@ def __init__(self, dmm, fe_type) -> None: # typed dict # It maps from values in the index to their integer positions in the array ("hashmap", types.DictType(fe_type.dtype, types.intp)), + # Pointer to the Index object this was created from, or that it + # boxes to + # https://numba.discourse.group/t/qst-how-to-cache-the-boxing-of-an-object/2128/2?u=lithomas1 + ("parent", types.pyobject) ] models.StructModel.__init__(self, dmm, fe_type, members) @@ -195,8 +199,20 @@ def pdseries_constructor(context, builder, sig, args): return impl_ret_borrowed(context, builder, sig.return_type, series._getvalue()) -@lower_builtin(Index, types.Array, types.DictType) +@lower_builtin(Index, types.Array, types.DictType, types.pyobject) def index_constructor_2arg(context, builder, sig, args): + (data, hashmap, parent) = args + index = cgutils.create_struct_proxy(sig.return_type)(context, builder) + + index.data = data + index.hashmap = hashmap + index.parent = parent + return impl_ret_borrowed(context, builder, sig.return_type, index._getvalue()) + +@lower_builtin(Index, types.Array, types.DictType) +def index_constructor_2arg_parent(context, builder, sig, args): + # Basically same as index_constructor_1arg, but also lets you specify the + # parent object (data, hashmap) = args index = cgutils.create_struct_proxy(sig.return_type)(context, builder) @@ -230,9 +246,11 @@ def unbox_index(typ, obj, c): """ data_obj = c.pyapi.object_getattr_string(obj, "_data") index = cgutils.create_struct_proxy(typ)(c.context, c.builder) + # If we see an object array, assume its been validated as only containing strings + # We still need to do the conversion though index.data = c.unbox(typ.as_array, data_obj).value typed_dict_obj = c.pyapi.unserialize(c.pyapi.serialize_object(numba.typed.Dict)) - # Create an empty typed dict in numba + # Create an empty typed dict in numba for the hasmap for indexing # equiv of numba.typed.Dict.empty(typ.dtype, types.intp) arr_type_obj = c.pyapi.unserialize(c.pyapi.serialize_object(typ.dtype)) intp_type_obj = c.pyapi.unserialize(c.pyapi.serialize_object(types.intp)) @@ -240,6 +258,8 @@ def unbox_index(typ, obj, c): typed_dict_obj, "empty", (arr_type_obj, intp_type_obj) ) index.hashmap = c.unbox(types.DictType(typ.dtype, types.intp), hashmap_obj).value + # Set the parent for speedy boxing. 
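+    # (keeping a reference to the original Index object lets box_index hand
+    # back that same object later instead of rebuilding it)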
+ index.parent = obj # Decrefs c.pyapi.decref(data_obj) @@ -283,19 +303,36 @@ def box_index(typ, val, c): # First build a Numpy array object, then wrap it in a Index index = cgutils.create_struct_proxy(typ)(c.context, c.builder, value=val) - # TODO: preserve the original class for the index - # Also need preserve the name of the Index - - # class_obj = c.pyapi.unserialize(c.pyapi.serialize_object(typ.pyclass)) - class_obj = c.pyapi.unserialize(c.pyapi.serialize_object(Index)) - array_obj = c.box(typ.as_array, index.data) - # this is basically Index._simple_new(array_obj, name_obj) in python - index_obj = c.pyapi.call_method(class_obj, "_simple_new", (array_obj,)) - - # Decrefs - c.pyapi.decref(class_obj) - c.pyapi.decref(array_obj) - return index_obj + res = cgutils.alloca_once_value(c.builder, index.parent) + + # Does parent exist? + # (it means already boxed once, or Index same as original df.index or df.columns) + # xref https://github.com/numba/numba/blob/596e8a55334cc46854e3192766e643767bd7c934/numba/core/boxing.py#L593C17-L593C17 + with c.builder.if_else(cgutils.is_not_null(c.builder, index.parent)) as (has_parent, otherwise): + with has_parent: + c.pyapi.incref(index.parent) + with otherwise: + # TODO: preserve the original class for the index + # Also need preserve the name of the Index + # class_obj = c.pyapi.unserialize(c.pyapi.serialize_object(typ.pyclass)) + class_obj = c.pyapi.unserialize(c.pyapi.serialize_object(Index)) + array_obj = c.box(typ.as_array, index.data) + if isinstance(typ.dtype, types.UnicodeCharSeq): + # We converted to numpy string dtype, convert back + # to object since _simple_new won't do that for uss + object_str_obj = c.pyapi.unserialize(c.pyapi.serialize_object("object")) + array_obj = c.pyapi.call_method(array_obj, "astype", (object_str_obj,)) + c.pyapi.decref(object_str_obj) + # this is basically Index._simple_new(array_obj, name_obj) in python + index_obj = c.pyapi.call_method(class_obj, "_simple_new", (array_obj,)) + index.parent = index_obj + c.pyapi.print_object(index.parent) + c.builder.store(index_obj, res) + + # Decrefs + c.pyapi.decref(class_obj) + c.pyapi.decref(array_obj) + return c.builder.load(res) @box(SeriesType) @@ -333,7 +370,8 @@ def box_series(typ, val, c): return series_obj -# Add common series reductions +# Add common series reductions (e.g. mean, sum), +# and also add common binops (e.g. add, sub, mul, div) def generate_series_reduction(reduction, reduction_method): @@ -347,6 +385,23 @@ def series_reduction_impl(series): return series_reduction +def generate_series_binop(binop): + @overload(binop) + def series_binop(series1, value): + if isinstance(series1, SeriesType): + if isinstance(value, SeriesType): + def series_binop_impl(series1, series2): + # TODO: Check index matching? 
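+                    # (the op is applied to the underlying ndarrays; the
+                    # result reuses the left operand's index and name)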
+ return Series(binop(series1.values, series2.values), series1.index, series1.name) + return series_binop_impl + else: + def series_binop_impl(series1, value): + return Series(binop(series1.values, value), series1.index, series1.name) + return series_binop_impl + + return series_binop + + series_reductions = [ ("sum", np.sum), ("mean", np.mean), @@ -356,6 +411,16 @@ def series_reduction_impl(series): for reduction, reduction_method in series_reductions: generate_series_reduction(reduction, reduction_method) +series_binops = [ + operator.add, + operator.sub, + operator.mul, + operator.truediv +] + +for binop in series_binops: + generate_series_binop(binop) + # get_loc on Index @overload_method(IndexType, "get_loc") diff --git a/pandas/core/apply.py b/pandas/core/apply.py index b1cc6fa79b819..12e879f053477 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -771,7 +771,7 @@ def __init__( if by_row is not False and by_row != "compat": raise ValueError(f"by_row={by_row} not allowed") self.engine = engine - self.engine_kwargs = engine_kwargs + self.engine_kwargs = {} if engine_kwargs is None else engine_kwargs super().__init__( obj, func, raw, result_type, by_row=by_row, args=args, kwargs=kwargs ) @@ -826,11 +826,6 @@ def values(self): def apply(self) -> DataFrame | Series: """compute the results""" - if self.engine == "numba" and not self.raw: - raise ValueError( - "The numba engine in DataFrame.apply can only be used when raw=True" - ) - # dispatch to handle list-like or dict-like if is_list_like(self.func): return self.apply_list_or_dict_like() @@ -1009,9 +1004,6 @@ def apply_standard(self): else: results, res_index = self.apply_series_numba() - # print(results) - # print(res_index) - # wrap results return self.wrap_results(results, res_index) @@ -1083,6 +1075,10 @@ def generate_numba_apply_func( func, nogil=True, nopython=True, parallel=False ) -> Callable[[npt.NDarray], dict[int, Any]]: from pandas import Series + # Dummy import just to make the extensions loaded in + # This isn't an entrypoint since we don't want users + # using Series/DF in numba code outside of apply + from pandas.core._numba.extensions import SeriesType numba = import_optional_dependency("numba") @@ -1096,9 +1092,7 @@ def numba_func(values, col_names, df_index): # TODO: No need for the str call? 
# Need to adapt types to accept UnicodeCharSeq in Series constructor ser = Series(values[:, j], index=df_index, name=str(col_names[j])) - results[j] = jitted_udf(ser) - return results return numba_func @@ -1114,7 +1108,7 @@ def apply_with_numba(self) -> dict[int, Any]: col_names_values = col_names_values.astype("U") df_index = self.obj.index - return nb_func(self.values, col_names_values, df_index) + return dict(nb_func(self.values, col_names_values, df_index)) @property def result_index(self) -> Index: @@ -1204,7 +1198,10 @@ def series_generator(self) -> Generator[Series, None, None]: def generate_numba_apply_func( func, nogil=True, nopython=True, parallel=False ) -> Callable[[npt.NDArray, npt.NDArray, npt.NDArray], dict[int, Any]]: - # Unused import just to register the extensions + # Dummy import just to make the extensions loaded in + # This isn't an entrypoint since we don't want users + # using Series/DF in numba code outside of apply + from pandas.core._numba.extensions import SeriesType from pandas import ( Index, @@ -1216,16 +1213,15 @@ def generate_numba_apply_func( jitted_udf = numba.extending.register_jitable(func) @numba.jit(nogil=nogil, nopython=nopython, parallel=parallel) - def numba_func(values, col_names, index_values): + def numba_func(values, col_names_index, index_values): results = {} - col_names_index = Index(col_names) + #col_names_index = Index(col_names) for i in range(values.shape[0]): # Create the series # TODO: values corrupted without the copy ser = Series( values[i].copy(), index=col_names_index, name=index_values[i] ) - results[i] = jitted_udf(ser) return results @@ -1235,22 +1231,28 @@ def numba_func(values, col_names, index_values): def apply_with_numba(self) -> dict[int, Any]: nb_func = self.generate_numba_apply_func(self.func, **self.engine_kwargs) - # Unpack the index and repack it inside the jitted numba function - # This is since if we have object dtype and strings we want to convert - # to a numpy string dtype (and our regular index doesn't support numpy string dtypes) - col_names_values = self.columns._data - if col_names_values.dtype == object: - if not lib.is_string_array(col_names_values): + # Since numpy/numba doesn't support object array of stringswell + # we'll do a sketchy thing where if index._data is object + # we convert to string and directly set index._data to that, + # setting it back after we call the function + fixed_obj_dtype = False + orig_data = self.columns._data + if self.columns._data.dtype == object: + if not lib.is_string_array(self.columns._data): raise ValueError( "The numba engine only supports using string or numeric column names" ) - col_names_values = col_names_values.astype("U") + # Remember to set this back!!! 
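+            # (numba cannot handle object-dtype ndarrays, so the string labels
+            # are temporarily converted to a numpy unicode 'U' array)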
+ self.columns._data = self.columns._data.astype("U") + fixed_obj_dtype = True index_values = self.obj.index.values # Convert from numba dict to regular dict # Our isinstance checks in the df constructor don't pass for numbas typed dict - result_nb_dict = nb_func(self.values, col_names_values, index_values) + result_nb_dict = nb_func(self.values, self.columns, index_values) result_keys, result_values = result_nb_dict.keys(), result_nb_dict.values() + if fixed_obj_dtype: + self.columns._data = orig_data return dict(zip(result_keys, result_values)) # return dict(nb_func(self.values, col_names_values, index_values)) From 31b9e20d42213875acf003d5943026f626fcfdca Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Tue, 19 Sep 2023 18:49:14 -0400 Subject: [PATCH 03/18] wip: pass tests --- pandas/core/_numba/extensions.py | 29 ++++--- pandas/core/apply.py | 32 ++++++-- pandas/tests/apply/test_frame_apply.py | 107 +++++++++++++++---------- 3 files changed, 107 insertions(+), 61 deletions(-) diff --git a/pandas/core/_numba/extensions.py b/pandas/core/_numba/extensions.py index d80f93e8b75b2..a23d63b6cdec2 100644 --- a/pandas/core/_numba/extensions.py +++ b/pandas/core/_numba/extensions.py @@ -153,7 +153,7 @@ def __init__(self, dmm, fe_type) -> None: # Pointer to the Index object this was created from, or that it # boxes to # https://numba.discourse.group/t/qst-how-to-cache-the-boxing-of-an-object/2128/2?u=lithomas1 - ("parent", types.pyobject) + ("parent", types.pyobject), ] models.StructModel.__init__(self, dmm, fe_type, members) @@ -209,6 +209,7 @@ def index_constructor_2arg(context, builder, sig, args): index.parent = parent return impl_ret_borrowed(context, builder, sig.return_type, index._getvalue()) + @lower_builtin(Index, types.Array, types.DictType) def index_constructor_2arg_parent(context, builder, sig, args): # Basically same as index_constructor_1arg, but also lets you specify the @@ -308,7 +309,10 @@ def box_index(typ, val, c): # Does parent exist? # (it means already boxed once, or Index same as original df.index or df.columns) # xref https://github.com/numba/numba/blob/596e8a55334cc46854e3192766e643767bd7c934/numba/core/boxing.py#L593C17-L593C17 - with c.builder.if_else(cgutils.is_not_null(c.builder, index.parent)) as (has_parent, otherwise): + with c.builder.if_else(cgutils.is_not_null(c.builder, index.parent)) as ( + has_parent, + otherwise, + ): with has_parent: c.pyapi.incref(index.parent) with otherwise: @@ -390,13 +394,23 @@ def generate_series_binop(binop): def series_binop(series1, value): if isinstance(series1, SeriesType): if isinstance(value, SeriesType): + def series_binop_impl(series1, series2): # TODO: Check index matching? 
- return Series(binop(series1.values, series2.values), series1.index, series1.name) + return Series( + binop(series1.values, series2.values), + series1.index, + series1.name, + ) + return series_binop_impl else: + def series_binop_impl(series1, value): - return Series(binop(series1.values, value), series1.index, series1.name) + return Series( + binop(series1.values, value), series1.index, series1.name + ) + return series_binop_impl return series_binop @@ -411,12 +425,7 @@ def series_binop_impl(series1, value): for reduction, reduction_method in series_reductions: generate_series_reduction(reduction, reduction_method) -series_binops = [ - operator.add, - operator.sub, - operator.mul, - operator.truediv -] +series_binops = [operator.add, operator.sub, operator.mul, operator.truediv] for binop in series_binops: generate_series_binop(binop) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 12e879f053477..725d7bf430b5d 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -603,6 +603,12 @@ def apply_list_or_dict_like(self) -> DataFrame | Series: result: Series, DataFrame, or None Result when self.func is a list-like or dict-like, None otherwise. """ + + if self.engine == "numba": + raise NotImplementedError( + "The 'numba' engine doesn't support list-like/dict likes of callables yet." + ) + if self.axis == 1 and isinstance(self.obj, ABCDataFrame): return self.obj.T.apply(self.func, 0, args=self.args, **self.kwargs).T @@ -828,6 +834,10 @@ def apply(self) -> DataFrame | Series: # dispatch to handle list-like or dict-like if is_list_like(self.func): + if self.engine == "numba": + raise NotImplementedError( + "the 'numba' engine doesn't support lists of callables yet" + ) return self.apply_list_or_dict_like() # all empty @@ -836,10 +846,18 @@ def apply(self) -> DataFrame | Series: # string dispatch if isinstance(self.func, str): + if self.engine == "numba": + raise NotImplementedError( + "the 'numba' engine doesn't support using a string as the callable function" + ) return self.apply_str() # ufunc elif isinstance(self.func, np.ufunc): + if self.engine == "numba": + raise NotImplementedError( + "the 'numba' engine doesn't support using a numpy ufunc as the callable function" + ) with np.errstate(all="ignore"): results = self.obj._mgr.apply("apply", func=self.func) # _constructor will retain self.index and self.columns @@ -847,6 +865,10 @@ def apply(self) -> DataFrame | Series: # broadcasting if self.result_type == "broadcast": + if self.engine == "numba": + raise NotImplementedError( + "the 'numba' engine doesn't support result_type='broadcast'" + ) return self.apply_broadcast(self.obj) # one axis empty @@ -1075,10 +1097,10 @@ def generate_numba_apply_func( func, nogil=True, nopython=True, parallel=False ) -> Callable[[npt.NDarray], dict[int, Any]]: from pandas import Series + # Dummy import just to make the extensions loaded in # This isn't an entrypoint since we don't want users # using Series/DF in numba code outside of apply - from pandas.core._numba.extensions import SeriesType numba = import_optional_dependency("numba") @@ -1201,12 +1223,8 @@ def generate_numba_apply_func( # Dummy import just to make the extensions loaded in # This isn't an entrypoint since we don't want users # using Series/DF in numba code outside of apply - from pandas.core._numba.extensions import SeriesType - from pandas import ( - Index, - Series, - ) + from pandas import Series numba = import_optional_dependency("numba") @@ -1215,7 +1233,7 @@ def generate_numba_apply_func( @numba.jit(nogil=nogil, 
nopython=nopython, parallel=parallel) def numba_func(values, col_names_index, index_values): results = {} - #col_names_index = Index(col_names) + # col_names_index = Index(col_names) for i in range(values.shape[0]): # Create the series # TODO: values corrupted without the copy diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index c62cb33c60d94..9d6e585755031 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -25,28 +25,31 @@ def engine(request): return request.param -def test_apply(float_frame): +def test_apply(float_frame, engine, request): + if engine == "numba": + mark = pytest.mark.xfail(reason="numba engine not supporting numpy ufunc yet") + request.node.add_marker(mark) with np.errstate(all="ignore"): # ufunc result = np.sqrt(float_frame["A"]) - expected = float_frame.apply(np.sqrt)["A"] + expected = float_frame.apply(np.sqrt, engine=engine)["A"] tm.assert_series_equal(result, expected) # aggregator - result = float_frame.apply(np.mean)["A"] + result = float_frame.apply(np.mean, engine=engine)["A"] expected = np.mean(float_frame["A"]) assert result == expected d = float_frame.index[0] - result = float_frame.apply(np.mean, axis=1) + result = float_frame.apply(np.mean, axis=1, engine=engine) expected = np.mean(float_frame.xs(d)) assert result[d] == expected assert result.index is float_frame.index @pytest.mark.parametrize("axis", [0, 1]) -def test_apply_args(float_frame, axis): - result = float_frame.apply(lambda x, y: x + y, axis, args=(1,)) +def test_apply_args(float_frame, axis, engine): + result = float_frame.apply(lambda x, y: x + y, axis, args=(1,), engine=engine) expected = float_frame + 1 tm.assert_frame_equal(result, expected) @@ -93,11 +96,11 @@ def test_apply_mixed_datetimelike(): @pytest.mark.parametrize("func", [np.sqrt, np.mean]) -def test_apply_empty(func): +def test_apply_empty(func, engine=engine): # empty empty_frame = DataFrame() - result = empty_frame.apply(func) + result = empty_frame.apply(func, engine=engine) assert result.empty @@ -113,10 +116,10 @@ def test_apply_float_frame(float_frame, engine): tm.assert_series_equal(result, expected) -def test_apply_empty_except_index(): +def test_apply_empty_except_index(engine): # GH 2476 expected = DataFrame(index=["a"]) - result = expected.apply(lambda x: x["a"], axis=1) + result = expected.apply(lambda x: x["a"], axis=1, engine=engine) tm.assert_frame_equal(result, expected) @@ -320,12 +323,6 @@ def test_apply_empty_infer_type(ax, func, raw, axis, engine, request): test_res = func(np.array([], dtype="f8")) is_reduction = not isinstance(test_res, np.ndarray) - if engine == "numba" and raw is False: - mark = pytest.mark.xfail( - reason="numba engine only supports raw=True at the moment" - ) - request.node.add_marker(mark) - result = df.apply(func, axis=axis, engine=engine, raw=raw) if is_reduction: agg_axis = df._get_agg_axis(axis) @@ -980,9 +977,12 @@ def test_result_type_shorter_list(int_frame_const_col): tm.assert_frame_equal(result, expected) -def test_result_type_broadcast(int_frame_const_col): +def test_result_type_broadcast(int_frame_const_col, request): # result_type should be consistent no matter which # path we take in the code + if engine == "numba": + mark = pytest.mark.xfail(reason="numba engine doesn't support list return") + request.node.add_marker(mark) df = int_frame_const_col # broadcast result result = df.apply(lambda x: [1, 2, 3], axis=1, result_type="broadcast") @@ -990,35 +990,54 @@ def 
test_result_type_broadcast(int_frame_const_col): tm.assert_frame_equal(result, expected) -def test_result_type_broadcast_series_func(int_frame_const_col): +def test_result_type_broadcast_series_func(int_frame_const_col, engine, request): # result_type should be consistent no matter which # path we take in the code + if engine == "numba": + mark = pytest.mark.xfail( + reason="numba Series constructor only support ndarrays not list data" + ) + request.node.add_marker(mark) df = int_frame_const_col columns = ["other", "col", "names"] result = df.apply( - lambda x: Series([1, 2, 3], index=columns), axis=1, result_type="broadcast" + lambda x: Series([1, 2, 3], index=columns), + axis=1, + result_type="broadcast", + engine=engine, ) expected = df.copy() tm.assert_frame_equal(result, expected) -def test_result_type_series_result(int_frame_const_col): +def test_result_type_series_result(int_frame_const_col, engine, request): # result_type should be consistent no matter which # path we take in the code + if engine == "numba": + mark = pytest.mark.xfail( + reason="numba Series constructor only support ndarrays not list data" + ) + request.node.add_marker(mark) df = int_frame_const_col # series result - result = df.apply(lambda x: Series([1, 2, 3], index=x.index), axis=1) + result = df.apply(lambda x: Series([1, 2, 3], index=x.index), axis=1, engine=engine) expected = df.copy() tm.assert_frame_equal(result, expected) -def test_result_type_series_result_other_index(int_frame_const_col): +def test_result_type_series_result_other_index(int_frame_const_col, engine, request): # result_type should be consistent no matter which # path we take in the code + + if engine == "numba": + mark = pytest.mark.xfail( + reason="no support in numba Series constructor for list of columns" + ) + request.node.add_marker(mark) df = int_frame_const_col # series result with other index columns = ["other", "col", "names"] - result = df.apply(lambda x: Series([1, 2, 3], index=columns), axis=1) + result = df.apply(lambda x: Series([1, 2, 3], index=columns), axis=1, engine=engine) expected = df.copy() expected.columns = columns tm.assert_frame_equal(result, expected) @@ -1378,25 +1397,34 @@ def f(x, a, b, c=3): @pytest.mark.parametrize("num_cols", [2, 3, 5]) -def test_frequency_is_original(num_cols): +def test_frequency_is_original(num_cols, engine, request): # GH 22150 + if engine == "numba": + mark = pytest.mark.xfail(reason="numba engine only supports numeric indices") + request.node.add_marker(mark) index = pd.DatetimeIndex(["1950-06-30", "1952-10-24", "1953-05-29"]) original = index.copy() - df = DataFrame(1, index=index, columns=range(num_cols)) + df = DataFrame(1, index=index, columns=range(num_cols), engine=engine) df.apply(lambda x: x) assert index.freq == original.freq -def test_apply_datetime_tz_issue(): +def test_apply_datetime_tz_issue(engine, request): # GH 29052 + if engine == "numba": + mark = pytest.mark.xfail( + reason="numba engine doesn't support non-numeric indexes" + ) + request.node.add_marker(mark) + timestamps = [ Timestamp("2019-03-15 12:34:31.909000+0000", tz="UTC"), Timestamp("2019-03-15 12:34:34.359000+0000", tz="UTC"), Timestamp("2019-03-15 12:34:34.660000+0000", tz="UTC"), ] df = DataFrame(data=[0, 1, 2], index=timestamps) - result = df.apply(lambda x: x.name, axis=1) + result = df.apply(lambda x: x.name, axis=1, engine=engine) expected = Series(index=timestamps, data=timestamps) tm.assert_series_equal(result, expected) @@ -1459,10 +1487,10 @@ def test_apply_empty_list_reduce(): 
tm.assert_series_equal(result, expected) -def test_apply_no_suffix_index(): +def test_apply_no_suffix_index(engine): # GH36189 pdf = DataFrame([[4, 9]] * 3, columns=["A", "B"]) - result = pdf.apply(["sum", lambda x: x.sum(), lambda x: x.sum()]) + result = pdf.apply(["sum", lambda x: x.sum(), lambda x: x.sum()], engine=engine) expected = DataFrame( {"A": [12, 12, 12], "B": [27, 27, 27]}, index=["sum", "", ""] ) @@ -1511,10 +1539,12 @@ def sum_div2(s): tm.assert_frame_equal(result, expected) -def test_apply_getitem_axis_1(): +def test_apply_getitem_axis_1(engine): # GH 13427 df = DataFrame({"a": [0, 1, 2], "b": [1, 2, 3]}) - result = df[["a", "a"]].apply(lambda x: x.iloc[0] + x.iloc[1], axis=1) + result = df[["a", "a"]].apply( + lambda x: x.iloc[0] + x.iloc[1], axis=1, engine=engine + ) expected = Series([0, 2, 4]) tm.assert_series_equal(result, expected) @@ -1554,10 +1584,10 @@ def test_apply_type(): tm.assert_series_equal(result, expected) -def test_apply_on_empty_dataframe(): +def test_apply_on_empty_dataframe(engine): # GH 39111 df = DataFrame({"a": [1, 2], "b": [3, 0]}) - result = df.head(0).apply(lambda x: max(x["a"], x["b"]), axis=1) + result = df.head(0).apply(lambda x: max(x["a"], x["b"]), axis=1, engine=engine) expected = Series([], dtype=np.float64) tm.assert_series_equal(result, expected) @@ -1655,14 +1685,3 @@ def test_agg_dist_like_and_nonunique_columns(): result = df.agg({"A": "count"}) expected = df["A"].count() tm.assert_series_equal(result, expected) - - -def test_numba_unsupported(): - df = DataFrame( - {"A": [None, 2, 3], "B": [1.0, np.nan, 3.0], "C": ["foo", None, "bar"]} - ) - with pytest.raises( - ValueError, - match="The numba engine in DataFrame.apply can only be used when raw=True", - ): - df.apply(lambda x: x, engine="numba", raw=False) From 55df7ad3e0b7da6cd056803381b2e937606bc82c Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sun, 24 Sep 2023 12:39:08 -0400 Subject: [PATCH 04/18] fix existing tests --- pandas/core/apply.py | 20 ++++++++++++++++---- pandas/tests/apply/test_frame_apply.py | 16 ++++++++++++---- pandas/tests/apply/test_numba.py | 0 3 files changed, 28 insertions(+), 8 deletions(-) create mode 100644 pandas/tests/apply/test_numba.py diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 8bb5e8efa16c7..b550e576587df 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -123,6 +123,8 @@ def __init__( result_type: str | None, *, by_row: Literal[False, "compat", "_compat"] = "compat", + engine: str = "python", + engine_kwargs: dict[str, bool] | None = None, args, kwargs, ) -> None: @@ -135,6 +137,9 @@ def __init__( self.args = args or () self.kwargs = kwargs or {} + self.engine = engine + self.engine_kwargs = {} if engine_kwargs is None else engine_kwargs + if result_type not in [None, "reduce", "broadcast", "expand"]: raise ValueError( "invalid value for result_type, must be one " @@ -777,10 +782,16 @@ def __init__( ) -> None: if by_row is not False and by_row != "compat": raise ValueError(f"by_row={by_row} not allowed") - self.engine = engine - self.engine_kwargs = {} if engine_kwargs is None else engine_kwargs super().__init__( - obj, func, raw, result_type, by_row=by_row, args=args, kwargs=kwargs + obj, + func, + raw, + result_type, + by_row=by_row, + engine=engine, + engine_kwargs=engine_kwargs, + args=args, + kwargs=kwargs, ) # --------------------------------------------------------------- @@ -1108,6 +1119,7 @@ def generate_numba_apply_func( # Dummy import just to make the extensions 
loaded in # This isn't an entrypoint since we don't want users # using Series/DF in numba code outside of apply + from pandas.core._numba.extensions import SeriesType # noqa: F401 numba = import_optional_dependency("numba") @@ -1231,8 +1243,8 @@ def generate_numba_apply_func( # Dummy import just to make the extensions loaded in # This isn't an entrypoint since we don't want users # using Series/DF in numba code outside of apply - from pandas import Series + from pandas.core._numba.extensions import SeriesType # noqa: F401 numba = import_optional_dependency("numba") diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index 474cc862e3ff8..8767c7f0f45d1 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -49,7 +49,10 @@ def test_apply(float_frame, engine, request): @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("raw", [True, False]) -def test_apply_args(float_frame, axis, raw, engine): +def test_apply_args(float_frame, axis, raw, engine, request): + if engine == "numba": + mark = pytest.mark.xfail(reason="numba engine doesn't support args") + request.node.add_marker(mark) result = float_frame.apply( lambda x, y: x + y, axis, args=(1,), raw=raw, engine=engine ) @@ -1407,8 +1410,8 @@ def test_frequency_is_original(num_cols, engine, request): request.node.add_marker(mark) index = pd.DatetimeIndex(["1950-06-30", "1952-10-24", "1953-05-29"]) original = index.copy() - df = DataFrame(1, index=index, columns=range(num_cols), engine=engine) - df.apply(lambda x: x) + df = DataFrame(1, index=index, columns=range(num_cols)) + df.apply(lambda x: x, engine=engine) assert index.freq == original.freq @@ -1490,8 +1493,13 @@ def test_apply_empty_list_reduce(): tm.assert_series_equal(result, expected) -def test_apply_no_suffix_index(engine): +def test_apply_no_suffix_index(engine, request): # GH36189 + if engine == "numba": + mark = pytest.mark.xfail( + reason="numba engine doesn't support list-likes/dict-like callables" + ) + request.node.add_marker(mark) pdf = DataFrame([[4, 9]] * 3, columns=["A", "B"]) result = pdf.apply(["sum", lambda x: x.sum(), lambda x: x.sum()], engine=engine) expected = DataFrame( diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py new file mode 100644 index 0000000000000..e69de29bb2d1d From 3c89b0f68c841cad8da7b2bd064ff26c61d543c1 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 25 Sep 2023 09:57:13 -0400 Subject: [PATCH 05/18] go for green --- pandas/core/_numba/extensions.py | 33 +++++++++++---- pandas/core/apply.py | 59 +++++++++++++++------------ pyright_reportGeneralTypeIssues.json | 1 + scripts/validate_unwanted_patterns.py | 2 + 4 files changed, 62 insertions(+), 33 deletions(-) diff --git a/pandas/core/_numba/extensions.py b/pandas/core/_numba/extensions.py index a23d63b6cdec2..b6d0534368110 100644 --- a/pandas/core/_numba/extensions.py +++ b/pandas/core/_numba/extensions.py @@ -1,3 +1,6 @@ +# Disable type checking for this module since numba's internals +# are not typed, and we use numba's internals via its extension API +# mypy: ignore-errors """ Utility classes/functions to let numba recognize pandas Index/Series/DataFrame @@ -38,14 +41,18 @@ # TODO: Range index support # (not passing an index to series constructor doesn't work) -class IndexType(types.Buffer): +class IndexType(types.Type): """ The type class for Index objects. 
""" - def __init__(self, dtype, layout, pyclass) -> None: + def __init__(self, dtype, layout, pyclass: any) -> None: self.pyclass = pyclass - super().__init__(dtype, 1, layout) + name = f"index({dtype}, {layout})" + self.dtype = dtype + self.layout = layout + super().__init__(name) + # super().__init__(dtype, 1, layout) @property def key(self): @@ -190,7 +197,7 @@ def pdseries_constructor(context, builder, sig, args): @lower_builtin(Series, types.Array, IndexType, types.intp) @lower_builtin(Series, types.Array, IndexType, types.float64) @lower_builtin(Series, types.Array, IndexType, types.unicode_type) -def pdseries_constructor(context, builder, sig, args): +def pdseries_constructor_with_name(context, builder, sig, args): data, index, name = args series = cgutils.create_struct_proxy(sig.return_type)(context, builder) series.index = index @@ -351,7 +358,9 @@ def box_series(typ, val, c): name_obj = c.box(typ.namety, series.name) true_obj = c.pyapi.unserialize(c.pyapi.serialize_object(True)) # TODO: Is borrowing none here safe? - # This is equivalent of pd.Series(data=array_obj, index=index_obj, dtype=None, name=name_obj, copy=None, fastpath=True) + # This is equivalent of + # pd.Series(data=array_obj, index=index_obj, dtype=None, + # name=name_obj, copy=None, fastpath=True) series_obj = c.pyapi.call_function_objargs( class_obj, ( @@ -435,7 +444,7 @@ def series_binop_impl(series1, value): @overload_method(IndexType, "get_loc") def index_get_loc(index, item): def index_get_loc_impl(index, item): - # Initialize the hash table if not initalized + # Initialize the hash table if not initialized if len(index.hashmap) == 0: for i, val in enumerate(index._data): index.hashmap[val] = i @@ -444,7 +453,7 @@ def index_get_loc_impl(index, item): return index_get_loc_impl -# Indexing for Series +# Indexing for Series/Index @overload(operator.getitem) @@ -458,6 +467,16 @@ def series_getitem(series, item): return series_getitem +@overload(operator.getitem) +def index_indexing(index, idx): + if isinstance(index, IndexType): + + def index_getitem(index, idx): + return index._data[idx] + + return index_getitem + + class IlocType(types.Type): def __init__(self, obj_type) -> None: self.obj_type = obj_type diff --git a/pandas/core/apply.py b/pandas/core/apply.py index b550e576587df..b79399af6af29 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -812,9 +812,11 @@ def result_columns(self) -> Index: def series_generator(self) -> Generator[Series, None, None]: pass - @property + @functools.cache @abc.abstractmethod - def generate_numba_apply_func(self) -> Callable[[npt.NDarray], dict[int, Any]]: + def generate_numba_apply_func( + func, nogil=True, nopython=True, parallel=False + ) -> Callable[[npt.NDArray, Index, Index], dict[int, Any]]: pass @abc.abstractmethod @@ -1113,7 +1115,7 @@ def series_generator(self) -> Generator[Series, None, None]: @functools.cache def generate_numba_apply_func( func, nogil=True, nopython=True, parallel=False - ) -> Callable[[npt.NDarray], dict[int, Any]]: + ) -> Callable[[npt.NDArray, Index, Index], dict[int, Any]]: from pandas import Series # Dummy import just to make the extensions loaded in @@ -1130,8 +1132,6 @@ def numba_func(values, col_names, df_index): results = {} for j in range(values.shape[1]): # Create the series - # TODO: No need for the str call? 
- # Need to adapt types to accept UnicodeCharSeq in Series constructor ser = Series(values[:, j], index=df_index, name=str(col_names[j])) results[j] = jitted_udf(ser) return results @@ -1139,18 +1139,27 @@ def numba_func(values, col_names, df_index): return numba_func def apply_with_numba(self) -> dict[int, Any]: - nb_func = self.generate_numba_apply_func(self.func, **self.engine_kwargs) - col_names_values = self.columns._data - if col_names_values.dtype == object: - if not lib.is_string_array(col_names_values): + nb_func = self.generate_numba_apply_func( + cast(Callable, self.func), **self.engine_kwargs + ) + orig_values = self.columns.to_numpy() + fixed_cols = False + if orig_values.dtype == object: + if not lib.is_string_array(orig_values): raise ValueError( "The numba engine only supports " "using string or numeric column names" ) - col_names_values = col_names_values.astype("U") + col_names_values = orig_values.astype("U") + # Remember to set this back! + self.columns._data = col_names_values + fixed_cols = True df_index = self.obj.index - return dict(nb_func(self.values, col_names_values, df_index)) + res = dict(nb_func(self.values, self.columns, df_index)) + if fixed_cols: + self.columns._data = orig_values + return res @property def result_index(self) -> Index: @@ -1239,7 +1248,7 @@ def series_generator(self) -> Generator[Series, None, None]: @functools.cache def generate_numba_apply_func( func, nogil=True, nopython=True, parallel=False - ) -> Callable[[npt.NDArray, npt.NDArray, npt.NDArray], dict[int, Any]]: + ) -> Callable[[npt.NDArray, Index, Index], dict[int, Any]]: # Dummy import just to make the extensions loaded in # This isn't an entrypoint since we don't want users # using Series/DF in numba code outside of apply @@ -1251,15 +1260,12 @@ def generate_numba_apply_func( jitted_udf = numba.extending.register_jitable(func) @numba.jit(nogil=nogil, nopython=nopython, parallel=parallel) - def numba_func(values, col_names_index, index_values): + def numba_func(values, col_names_index, index): results = {} - # col_names_index = Index(col_names) for i in range(values.shape[0]): # Create the series # TODO: values corrupted without the copy - ser = Series( - values[i].copy(), index=col_names_index, name=index_values[i] - ) + ser = Series(values[i].copy(), index=col_names_index, name=index[i]) results[i] = jitted_udf(ser) return results @@ -1267,33 +1273,34 @@ def numba_func(values, col_names_index, index_values): return numba_func def apply_with_numba(self) -> dict[int, Any]: - nb_func = self.generate_numba_apply_func(self.func, **self.engine_kwargs) + nb_func = self.generate_numba_apply_func( + cast(Callable, self.func), **self.engine_kwargs + ) # Since numpy/numba doesn't support object array of stringswell # we'll do a sketchy thing where if index._data is object # we convert to string and directly set index._data to that, # setting it back after we call the function fixed_obj_dtype = False - orig_data = self.columns._data + orig_data = self.columns.to_numpy() if self.columns._data.dtype == object: - if not lib.is_string_array(self.columns._data): + if not lib.is_string_array(orig_data): raise ValueError( "The numba engine only supports " "using string or numeric column names" ) # Remember to set this back!!! 
- self.columns._data = self.columns._data.astype("U") + self.columns._data = orig_data.astype("U") fixed_obj_dtype = True - index_values = self.obj.index.values # Convert from numba dict to regular dict # Our isinstance checks in the df constructor don't pass for numbas typed dict - result_nb_dict = nb_func(self.values, self.columns, index_values) - result_keys, result_values = result_nb_dict.keys(), result_nb_dict.values() + res = dict(nb_func(self.values, self.columns, self.obj.index)) + if fixed_obj_dtype: self.columns._data = orig_data - return dict(zip(result_keys, result_values)) - # return dict(nb_func(self.values, col_names_values, index_values)) + + return res @property def result_index(self) -> Index: diff --git a/pyright_reportGeneralTypeIssues.json b/pyright_reportGeneralTypeIssues.json index cad43632930ba..c059b9c589ecd 100644 --- a/pyright_reportGeneralTypeIssues.json +++ b/pyright_reportGeneralTypeIssues.json @@ -16,6 +16,7 @@ "pandas/_testing/__init__.py", "pandas/_testing/_io.py", + "pandas/core/_numba/extensions.py", "pandas/core/_numba/kernels/sum_.py", "pandas/core/_numba/kernels/var_.py", "pandas/compat/pickle_compat.py", diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py index d765d7bc7dcb9..6e6251425928d 100755 --- a/scripts/validate_unwanted_patterns.py +++ b/scripts/validate_unwanted_patterns.py @@ -51,6 +51,8 @@ "_chained_assignment_msg", "_chained_assignment_method_msg", "_version_meson", + # The numba extensions need this to mock the iloc object + "_iLocIndexer", # TODO(3.0): GH#55043 - remove upon removal of ArrayManager "_get_option", } From 1418d3e743620a3a40659ace96fad6375b29a613 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 25 Sep 2023 13:46:51 -0400 Subject: [PATCH 06/18] fix checks? --- pandas/core/_numba/extensions.py | 10 +++++----- pandas/core/apply.py | 1 + 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/core/_numba/extensions.py b/pandas/core/_numba/extensions.py index b6d0534368110..da3ca86628b6b 100644 --- a/pandas/core/_numba/extensions.py +++ b/pandas/core/_numba/extensions.py @@ -387,11 +387,11 @@ def box_series(typ, val, c): # and also add common binops (e.g. 
add, sub, mul, div) -def generate_series_reduction(reduction, reduction_method): - @overload_method(SeriesType, reduction) +def generate_series_reduction(ser_reduction, ser_method): + @overload_method(SeriesType, ser_reduction) def series_reduction(series): def series_reduction_impl(series): - return reduction_method(series.values) + return ser_method(series.values) return series_reduction_impl @@ -436,8 +436,8 @@ def series_binop_impl(series1, value): series_binops = [operator.add, operator.sub, operator.mul, operator.truediv] -for binop in series_binops: - generate_series_binop(binop) +for ser_binop in series_binops: + generate_series_binop(ser_binop) # get_loc on Index diff --git a/pandas/core/apply.py b/pandas/core/apply.py index b79399af6af29..e9bed6d56a9a0 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -812,6 +812,7 @@ def result_columns(self) -> Index: def series_generator(self) -> Generator[Series, None, None]: pass + @staticmethod @functools.cache @abc.abstractmethod def generate_numba_apply_func( From c143c677280fc5c9d455019d29f7daaa67453185 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 25 Sep 2023 14:21:09 -0400 Subject: [PATCH 07/18] fix pyright --- pandas/core/apply.py | 5 +++++ pyproject.toml | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index e9bed6d56a9a0..73a76e9121c2e 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -1,3 +1,8 @@ +# pyright: reportUnusedImport=false +# Disabled since there's no way to do an ignore for both pyright +# and ruff, and ruff should be sufficient +# (The reason we need this is because the import of the numba extensions is unused +# but is necessary to register the extensions) from __future__ import annotations import abc diff --git a/pyproject.toml b/pyproject.toml index 4e1c77413efda..929376b42e78e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -735,7 +735,7 @@ pythonVersion = "3.11" typeCheckingMode = "basic" useLibraryCodeForTypes = false include = ["pandas", "typings"] -exclude = ["pandas/tests", "pandas/io/clipboard", "pandas/util/version"] +exclude = ["pandas/tests", "pandas/io/clipboard", "pandas/util/version", "pandas/core/_numba/extensions.py"] # enable subset of "strict" reportDuplicateImport = true reportInconsistentConstructor = true From 0d827c4e2b009e6fc9b49b7ddb38ee3555a60dd7 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Thu, 28 Sep 2023 16:45:24 -0400 Subject: [PATCH 08/18] update docs --- pandas/core/_numba/extensions.py | 2 -- pandas/core/frame.py | 6 ++++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/_numba/extensions.py b/pandas/core/_numba/extensions.py index da3ca86628b6b..9eb232b29b8bd 100644 --- a/pandas/core/_numba/extensions.py +++ b/pandas/core/_numba/extensions.py @@ -52,7 +52,6 @@ def __init__(self, dtype, layout, pyclass: any) -> None: self.dtype = dtype self.layout = layout super().__init__(name) - # super().__init__(dtype, 1, layout) @property def key(self): @@ -126,7 +125,6 @@ def typeof_series(val, c): def type_series_constructor(context): def typer(data, index, name=None): if isinstance(index, IndexType) and isinstance(data, types.Array): - # assert data.layout == "C" assert data.ndim == 1 if name is None: name = types.intp diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3e32a6d93b023..7931bf232c75a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ 
-10001,6 +10001,10 @@ def apply( - nogil (release the GIL inside the JIT compiled function) - parallel (try to apply the function in parallel over the DataFrame) + Note: Due to limitations within numba/how pandas interfaces with numba, + you should only use this if raw=True + + Note: The numba compiler only supports a subset of valid Python/numpy operations. @@ -10010,8 +10014,6 @@ def apply( `_ in numba to learn what you can or cannot use in the passed function. - As of right now, the numba engine can only be used with raw=True. - .. versionadded:: 2.2.0 engine_kwargs : dict From f4e80a6a73ad908c1cfa3847f59667c6c164df19 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Fri, 29 Sep 2023 17:51:46 -0400 Subject: [PATCH 09/18] eliminate a blank line --- pandas/core/frame.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 681caef6c74fe..033f8e6d913c8 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10054,7 +10054,6 @@ def apply( Note: Due to limitations within numba/how pandas interfaces with numba, you should only use this if raw=True - Note: The numba compiler only supports a subset of valid Python/numpy operations. From 21e2186488c533e821dd9c03698de31d8897b676 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Fri, 6 Oct 2023 21:02:18 -0400 Subject: [PATCH 10/18] update from code review + more tests --- pandas/core/_numba/extensions.py | 36 ++++++++--- pandas/core/apply.py | 86 ++++++++++++++++++++------ pandas/tests/apply/conftest.py | 12 ++++ pandas/tests/apply/test_frame_apply.py | 22 +++---- pandas/tests/apply/test_numba.py | 64 +++++++++++++++++++ 5 files changed, 183 insertions(+), 37 deletions(-) diff --git a/pandas/core/_numba/extensions.py b/pandas/core/_numba/extensions.py index 9eb232b29b8bd..9bcb27b964141 100644 --- a/pandas/core/_numba/extensions.py +++ b/pandas/core/_numba/extensions.py @@ -13,10 +13,8 @@ import operator import numba -from numba.core import ( - cgutils, - types, -) +from numba import types +from numba.core import cgutils from numba.core.datamodel import models from numba.core.extending import ( NativeValue, @@ -40,7 +38,7 @@ # TODO: Range index support -# (not passing an index to series constructor doesn't work) +# (this currently lowers OK, but does not round-trip) class IndexType(types.Type): """ The type class for Index objects. @@ -149,6 +147,7 @@ def typer(data, hashmap=None): @register_model(IndexType) class IndexModel(models.StructModel): def __init__(self, dmm, fe_type) -> None: + # We don't want the numpy string scalar type in our hashmap members = [ ("data", fe_type.as_array), # This is an attempt to emulate our hashtable code with a numba @@ -240,6 +239,25 @@ def index_impl(data): return context.compile_internal(builder, index_impl, sig, args) +# Helper to convert the unicodecharseq (numpy string scalar) into a unicode_type +# (regular string) + + +def maybe_cast_str(x): + # Dummy function that numba can overload + pass + + +@overload(maybe_cast_str) +def maybe_cast_str_impl(x): + """Converts numba UnicodeCharSeq (numpy string scalar) -> unicode type (string). 
+ Is a no-op for other types.""" + if isinstance(x, types.UnicodeCharSeq): + return lambda x: str(x) + else: + return lambda x: x + + @unbox(IndexType) def unbox_index(typ, obj, c): """ @@ -426,8 +444,12 @@ def series_binop_impl(series1, value): series_reductions = [ ("sum", np.sum), ("mean", np.mean), - ("std", np.std), - ("var", np.var), + # Disabled due to discrepancies between numba std. dev + # and pandas std. dev (no way to specify dof) + # ("std", np.std), + # ("var", np.var), + ("min", np.min), + ("max", np.max), ] for reduction, reduction_method in series_reductions: generate_series_reduction(reduction, reduction_method) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 73a76e9121c2e..71c60fd60ad8a 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -1075,6 +1075,14 @@ def apply_series_generator(self) -> tuple[ResType, Index]: return results, res_index def apply_series_numba(self): + if self.engine_kwargs.get("parallel", False): + raise NotImplementedError( + "Parallel apply is not supported when raw=False and engine='numba'" + ) + if not self.obj.index.is_unique or not self.columns.is_unique: + raise NotImplementedError( + "The index/columns must be unique when raw=False and engine='numba'" + ) results = self.apply_with_numba() return results, self.result_index @@ -1128,6 +1136,7 @@ def generate_numba_apply_func( # This isn't an entrypoint since we don't want users # using Series/DF in numba code outside of apply from pandas.core._numba.extensions import SeriesType # noqa: F401 + from pandas.core._numba.extensions import maybe_cast_str numba = import_optional_dependency("numba") @@ -1138,7 +1147,9 @@ def numba_func(values, col_names, df_index): results = {} for j in range(values.shape[1]): # Create the series - ser = Series(values[:, j], index=df_index, name=str(col_names[j])) + ser = Series( + values[:, j], index=df_index, name=maybe_cast_str(col_names[j]) + ) results[j] = jitted_udf(ser) return results @@ -1148,23 +1159,40 @@ def apply_with_numba(self) -> dict[int, Any]: nb_func = self.generate_numba_apply_func( cast(Callable, self.func), **self.engine_kwargs ) - orig_values = self.columns.to_numpy() - fixed_cols = False - if orig_values.dtype == object: - if not lib.is_string_array(orig_values): + # Since numpy/numba doesn't support object array of stringswell + # we'll do a sketchy thing where if index._data is object + # we convert to string and directly set index._data to that, + # setting it back after we call the function + fixed_obj_colnames = False + orig_cols = self.columns.to_numpy() + if self.columns._data.dtype == object: + if not lib.is_string_array(orig_cols): raise ValueError( "The numba engine only supports " "using string or numeric column names" ) - col_names_values = orig_values.astype("U") - # Remember to set this back! - self.columns._data = col_names_values - fixed_cols = True + # Remember to set this back!!! + self.columns._data = orig_cols.astype("U") + fixed_obj_colnames = True + + fixed_obj_index = False + orig_index = self.index.to_numpy() + if self.obj.index._data.dtype == object: + if not lib.is_string_array(orig_index): + raise ValueError( + "The numba engine only supports " + "using string or numeric index values" + ) + # Remember to set this back!!! 
+ self.obj.index._data = orig_index.astype("U") + fixed_obj_index = True df_index = self.obj.index res = dict(nb_func(self.values, self.columns, df_index)) - if fixed_cols: - self.columns._data = orig_values + if fixed_obj_colnames: + self.columns._data = orig_cols + if fixed_obj_index: + self.obj.index._data = orig_index return res @property @@ -1260,6 +1288,7 @@ def generate_numba_apply_func( # using Series/DF in numba code outside of apply from pandas import Series from pandas.core._numba.extensions import SeriesType # noqa: F401 + from pandas.core._numba.extensions import maybe_cast_str numba = import_optional_dependency("numba") @@ -1271,7 +1300,11 @@ def numba_func(values, col_names_index, index): for i in range(values.shape[0]): # Create the series # TODO: values corrupted without the copy - ser = Series(values[i].copy(), index=col_names_index, name=index[i]) + ser = Series( + values[i].copy(), + index=col_names_index, + name=maybe_cast_str(index[i]), + ) results[i] = jitted_udf(ser) return results @@ -1287,24 +1320,39 @@ def apply_with_numba(self) -> dict[int, Any]: # we'll do a sketchy thing where if index._data is object # we convert to string and directly set index._data to that, # setting it back after we call the function - fixed_obj_dtype = False - orig_data = self.columns.to_numpy() + fixed_obj_colnames = False + orig_cols = self.columns.to_numpy() if self.columns._data.dtype == object: - if not lib.is_string_array(orig_data): + if not lib.is_string_array(orig_cols): raise ValueError( "The numba engine only supports " "using string or numeric column names" ) # Remember to set this back!!! - self.columns._data = orig_data.astype("U") - fixed_obj_dtype = True + self.columns._data = orig_cols.astype("U") + fixed_obj_colnames = True + + fixed_obj_index = False + orig_index = self.index.to_numpy() + if self.obj.index._data.dtype == object: + if not lib.is_string_array(orig_index): + raise ValueError( + "The numba engine only supports " + "using string or numeric index values" + ) + # Remember to set this back!!! 
+ self.obj.index._data = orig_index.astype("U") + fixed_obj_index = True # Convert from numba dict to regular dict # Our isinstance checks in the df constructor don't pass for numbas typed dict res = dict(nb_func(self.values, self.columns, self.obj.index)) - if fixed_obj_dtype: - self.columns._data = orig_data + if fixed_obj_colnames: + self.columns._data = orig_cols + + if fixed_obj_index: + self.obj.index._data = orig_index return res diff --git a/pandas/tests/apply/conftest.py b/pandas/tests/apply/conftest.py index b68c6235cb0b8..7ed9fc88c3aea 100644 --- a/pandas/tests/apply/conftest.py +++ b/pandas/tests/apply/conftest.py @@ -16,3 +16,15 @@ def int_frame_const_col(): columns=["A", "B", "C"], ) return df + + +@pytest.fixture(params=["python", "numba"]) +def engine(request): + if request.param == "numba": + pytest.importorskip("numba") + return request.param + + +@pytest.fixture(params=[0, 1]) +def apply_axis(request): + return request.param diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index 8767c7f0f45d1..a3a95ccd75064 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -18,13 +18,6 @@ from pandas.tests.frame.common import zip_frames -@pytest.fixture(params=["python", "numba"]) -def engine(request): - if request.param == "numba": - pytest.importorskip("numba") - return request.param - - def test_apply(float_frame, engine, request): if engine == "numba": mark = pytest.mark.xfail(reason="numba engine not supporting numpy ufunc yet") @@ -102,7 +95,7 @@ def test_apply_mixed_datetimelike(): @pytest.mark.parametrize("func", [np.sqrt, np.mean]) -def test_apply_empty(func, engine=engine): +def test_apply_empty(func, engine): # empty empty_frame = DataFrame() @@ -983,7 +976,7 @@ def test_result_type_shorter_list(int_frame_const_col): tm.assert_frame_equal(result, expected) -def test_result_type_broadcast(int_frame_const_col, request): +def test_result_type_broadcast(int_frame_const_col, request, engine): # result_type should be consistent no matter which # path we take in the code if engine == "numba": @@ -991,7 +984,9 @@ def test_result_type_broadcast(int_frame_const_col, request): request.node.add_marker(mark) df = int_frame_const_col # broadcast result - result = df.apply(lambda x: [1, 2, 3], axis=1, result_type="broadcast") + result = df.apply( + lambda x: [1, 2, 3], axis=1, result_type="broadcast", engine=engine + ) expected = df.copy() tm.assert_frame_equal(result, expected) @@ -1550,8 +1545,13 @@ def sum_div2(s): tm.assert_frame_equal(result, expected) -def test_apply_getitem_axis_1(engine): +def test_apply_getitem_axis_1(engine, request): # GH 13427 + if engine == "numba": + mark = pytest.mark.xfail( + reason="numba engine not supporting duplicate index values" + ) + request.node.add_marker(mark) df = DataFrame({"a": [0, 1, 2], "b": [1, 2, 3]}) result = df[["a", "a"]].apply( lambda x: x.iloc[0] + x.iloc[1], axis=1, engine=engine diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py index e69de29bb2d1d..9e05b5316dc15 100644 --- a/pandas/tests/apply/test_numba.py +++ b/pandas/tests/apply/test_numba.py @@ -0,0 +1,64 @@ +import numpy as np +import pytest + +from pandas import DataFrame +import pandas._testing as tm + + +def test_numba_vs_python_noop(float_frame, apply_axis): + func = lambda x: x + result = float_frame.apply(func, engine="numba", axis=apply_axis) + expected = float_frame.apply(func, engine="python", axis=apply_axis) + tm.assert_frame_equal(result, 
expected) + + +def test_numba_vs_python_indexing(float_frame): + row_func = lambda x: x["A"] + result = float_frame.apply(row_func, engine="numba", axis=1) + expected = float_frame.apply(row_func, engine="python", axis=1) + tm.assert_series_equal(result, expected) + + row_func = lambda x: x["ZqgszYBfuL"] # This is a label in the index + result = float_frame.apply(row_func, engine="numba", axis=0) + expected = float_frame.apply(row_func, engine="python", axis=0) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "reduction", + [lambda x: x.mean(), lambda x: x.min(), lambda x: x.max(), lambda x: x.sum()], +) +def test_numba_vs_python_reductions(float_frame, reduction, apply_axis): + result = float_frame.apply(reduction, engine="numba", axis=apply_axis) + expected = float_frame.apply(reduction, engine="python", axis=apply_axis) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("colnames", [[1, 2, 3], [1.0, 2.0, 3.0]]) +def test_numba_numeric_colnames(colnames): + # Check that numeric column names lower properly and can be indxed on + df = DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), columns=colnames) + first_col = colnames[0] + f = lambda x: x[first_col] # Get the first column + result = df.apply(f, engine="numba", axis=1) + expected = df.apply(f, engine="python", axis=1) + tm.assert_series_equal(result, expected) + + +def test_numba_parallel_unsupported(float_frame): + f = lambda x: x + with pytest.raises( + NotImplementedError, + match="Parallel apply is not supported when raw=False and engine='numba'", + ): + float_frame.apply(f, engine="numba", engine_kwargs={"parallel": True}) + + +def test_numba_nonunique_unsupported(): + f = lambda x: x + df = DataFrame({"a": [1, 2], "b": [1, 2]}) + with pytest.raises( + NotImplementedError, + match="The index/columns must be unique when raw=False and engine='numba'", + ): + df.apply(f, engine="numba", engine_kwargs={"parallel": True}) From ba1d0e0b31767928c6e95e7353325bdd8a262d79 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Tue, 10 Oct 2023 10:02:30 -0400 Subject: [PATCH 11/18] fix failing tests --- pandas/tests/apply/test_numba.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py index 9e05b5316dc15..dd186bb8a57bb 100644 --- a/pandas/tests/apply/test_numba.py +++ b/pandas/tests/apply/test_numba.py @@ -1,7 +1,10 @@ import numpy as np import pytest -from pandas import DataFrame +from pandas import ( + DataFrame, + Index, +) import pandas._testing as tm @@ -54,11 +57,11 @@ def test_numba_parallel_unsupported(float_frame): float_frame.apply(f, engine="numba", engine_kwargs={"parallel": True}) -def test_numba_nonunique_unsupported(): +def test_numba_nonunique_unsupported(apply_axis): f = lambda x: x - df = DataFrame({"a": [1, 2], "b": [1, 2]}) + df = DataFrame({"a": [1, 2]}, index=Index(["a", "a"])) with pytest.raises( NotImplementedError, match="The index/columns must be unique when raw=False and engine='numba'", ): - df.apply(f, engine="numba", engine_kwargs={"parallel": True}) + df.apply(f, engine="numba", axis=apply_axis) From 088d27f25e8a892e0aa3ce9336540ce40e4ae7fe Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Thu, 12 Oct 2023 11:16:28 -0400 Subject: [PATCH 12/18] Simplify w/ context manager --- pandas/core/_numba/extensions.py | 39 ++++++++++++--- pandas/core/apply.py | 83 
++++++-------------------------- 2 files changed, 45 insertions(+), 77 deletions(-) diff --git a/pandas/core/_numba/extensions.py b/pandas/core/_numba/extensions.py index 9bcb27b964141..4e1f59c6d4e96 100644 --- a/pandas/core/_numba/extensions.py +++ b/pandas/core/_numba/extensions.py @@ -10,6 +10,7 @@ from __future__ import annotations +from contextlib import contextmanager import operator import numba @@ -32,11 +33,34 @@ from numba.core.imputils import impl_ret_borrowed import numpy as np +from pandas._libs import lib + from pandas.core.indexes.base import Index from pandas.core.indexing import _iLocIndexer from pandas.core.series import Series +# Helper function to hack around fact that Index casts numpy string dtype to object +# +# Idea is to set an attribute on a Index called _numba_data +# that is the original data, or the object data casted to numpy string dtype, +# with a context manager that is unset afterwards +@contextmanager +def set_numba_data(index: Index): + numba_data = index._data + if numba_data.dtype == object: + if not lib.is_string_array(numba_data): + raise ValueError( + "The numba engine only supports using string or numeric column names" + ) + numba_data = numba_data.astype("U") + try: + index._numba_data = numba_data + yield index + finally: + del index._numba_data + + # TODO: Range index support # (this currently lowers OK, but does not round-trip) class IndexType(types.Type): @@ -104,7 +128,8 @@ def typeof_index(val, c): index. (you should check this before this gets lowered down to numba) """ - arrty = typeof_impl(val._data, c) + # arrty = typeof_impl(val._data, c) + arrty = typeof_impl(val._numba_data, c) assert arrty.ndim == 1 return IndexType(arrty.dtype, arrty.layout, type(val)) @@ -263,18 +288,17 @@ def unbox_index(typ, obj, c): """ Convert a Index object to a native structure. - If it is object dtype, we'll attempt to cast it to one of - numpy's string dtypes. 
- (you are responsible for validating that the Index contains only - strings if its object type before lowering it to numba) + Note: Object dtype is not allowed here """ - data_obj = c.pyapi.object_getattr_string(obj, "_data") + # data_obj = c.pyapi.object_getattr_string(obj, "_data") + data_obj = c.pyapi.object_getattr_string(obj, "_numba_data") + # data_obj = c.pyapi.object_getattr_string(obj, "_numba_data") index = cgutils.create_struct_proxy(typ)(c.context, c.builder) # If we see an object array, assume its been validated as only containing strings # We still need to do the conversion though index.data = c.unbox(typ.as_array, data_obj).value typed_dict_obj = c.pyapi.unserialize(c.pyapi.serialize_object(numba.typed.Dict)) - # Create an empty typed dict in numba for the hasmap for indexing + # Create an empty typed dict in numba for the hashmap for indexing # equiv of numba.typed.Dict.empty(typ.dtype, types.intp) arr_type_obj = c.pyapi.unserialize(c.pyapi.serialize_object(typ.dtype)) intp_type_obj = c.pyapi.unserialize(c.pyapi.serialize_object(types.intp)) @@ -353,7 +377,6 @@ def box_index(typ, val, c): # this is basically Index._simple_new(array_obj, name_obj) in python index_obj = c.pyapi.call_method(class_obj, "_simple_new", (array_obj,)) index.parent = index_obj - c.pyapi.print_object(index.parent) c.builder.store(index_obj, res) # Decrefs diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 71c60fd60ad8a..06d3e8834c821 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -1156,43 +1156,17 @@ def numba_func(values, col_names, df_index): return numba_func def apply_with_numba(self) -> dict[int, Any]: + from pandas.core._numba.extensions import set_numba_data + nb_func = self.generate_numba_apply_func( cast(Callable, self.func), **self.engine_kwargs ) - # Since numpy/numba doesn't support object array of stringswell - # we'll do a sketchy thing where if index._data is object - # we convert to string and directly set index._data to that, - # setting it back after we call the function - fixed_obj_colnames = False - orig_cols = self.columns.to_numpy() - if self.columns._data.dtype == object: - if not lib.is_string_array(orig_cols): - raise ValueError( - "The numba engine only supports " - "using string or numeric column names" - ) - # Remember to set this back!!! - self.columns._data = orig_cols.astype("U") - fixed_obj_colnames = True - - fixed_obj_index = False - orig_index = self.index.to_numpy() - if self.obj.index._data.dtype == object: - if not lib.is_string_array(orig_index): - raise ValueError( - "The numba engine only supports " - "using string or numeric index values" - ) - # Remember to set this back!!! 
- self.obj.index._data = orig_index.astype("U") - fixed_obj_index = True - df_index = self.obj.index - - res = dict(nb_func(self.values, self.columns, df_index)) - if fixed_obj_colnames: - self.columns._data = orig_cols - if fixed_obj_index: - self.obj.index._data = orig_index + # Convert from numba dict to regular dict + # Our isinstance checks in the df constructor don't pass for numbas typed dict + with set_numba_data(self.obj.index) as index, set_numba_data( + self.columns + ) as columns: + res = dict(nb_func(self.values, columns, index)) return res @property @@ -1312,47 +1286,18 @@ def numba_func(values, col_names_index, index): return numba_func def apply_with_numba(self) -> dict[int, Any]: + from pandas.core._numba.extensions import set_numba_data + nb_func = self.generate_numba_apply_func( cast(Callable, self.func), **self.engine_kwargs ) - # Since numpy/numba doesn't support object array of stringswell - # we'll do a sketchy thing where if index._data is object - # we convert to string and directly set index._data to that, - # setting it back after we call the function - fixed_obj_colnames = False - orig_cols = self.columns.to_numpy() - if self.columns._data.dtype == object: - if not lib.is_string_array(orig_cols): - raise ValueError( - "The numba engine only supports " - "using string or numeric column names" - ) - # Remember to set this back!!! - self.columns._data = orig_cols.astype("U") - fixed_obj_colnames = True - - fixed_obj_index = False - orig_index = self.index.to_numpy() - if self.obj.index._data.dtype == object: - if not lib.is_string_array(orig_index): - raise ValueError( - "The numba engine only supports " - "using string or numeric index values" - ) - # Remember to set this back!!! - self.obj.index._data = orig_index.astype("U") - fixed_obj_index = True - # Convert from numba dict to regular dict # Our isinstance checks in the df constructor don't pass for numbas typed dict - res = dict(nb_func(self.values, self.columns, self.obj.index)) - - if fixed_obj_colnames: - self.columns._data = orig_cols - - if fixed_obj_index: - self.obj.index._data = orig_index + with set_numba_data(self.obj.index) as index, set_numba_data( + self.columns + ) as columns: + res = dict(nb_func(self.values, columns, index)) return res From 60539a1fa677a4f41068f0fd8fe4f1a277169084 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Thu, 12 Oct 2023 14:28:03 -0400 Subject: [PATCH 13/18] skip if no numba --- pandas/tests/apply/test_numba.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py index dd186bb8a57bb..9b89bfdad5b51 100644 --- a/pandas/tests/apply/test_numba.py +++ b/pandas/tests/apply/test_numba.py @@ -1,12 +1,16 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas import ( DataFrame, Index, ) import pandas._testing as tm +pytestmark = td.skip_if_no("numba") + def test_numba_vs_python_noop(float_frame, apply_axis): func = lambda x: x From 76538d696ae5a4ed19418b134b0ed66037e60a19 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Thu, 12 Oct 2023 14:35:50 -0400 Subject: [PATCH 14/18] simplify more --- pandas/core/_numba/extensions.py | 9 --------- pandas/core/apply.py | 29 ++++++++--------------------- 2 files changed, 8 insertions(+), 30 deletions(-) diff --git a/pandas/core/_numba/extensions.py b/pandas/core/_numba/extensions.py index 4e1f59c6d4e96..ebe2a752a12f7 100644 --- 
a/pandas/core/_numba/extensions.py +++ b/pandas/core/_numba/extensions.py @@ -266,8 +266,6 @@ def index_impl(data): # Helper to convert the unicodecharseq (numpy string scalar) into a unicode_type # (regular string) - - def maybe_cast_str(x): # Dummy function that numba can overload pass @@ -290,9 +288,7 @@ def unbox_index(typ, obj, c): Note: Object dtype is not allowed here """ - # data_obj = c.pyapi.object_getattr_string(obj, "_data") data_obj = c.pyapi.object_getattr_string(obj, "_numba_data") - # data_obj = c.pyapi.object_getattr_string(obj, "_numba_data") index = cgutils.create_struct_proxy(typ)(c.context, c.builder) # If we see an object array, assume its been validated as only containing strings # We still need to do the conversion though @@ -396,7 +392,6 @@ def box_series(typ, val, c): array_obj = c.box(typ.as_array, series.values) name_obj = c.box(typ.namety, series.name) true_obj = c.pyapi.unserialize(c.pyapi.serialize_object(True)) - # TODO: Is borrowing none here safe? # This is equivalent of # pd.Series(data=array_obj, index=index_obj, dtype=None, # name=name_obj, copy=None, fastpath=True) @@ -424,8 +419,6 @@ def box_series(typ, val, c): # Add common series reductions (e.g. mean, sum), # and also add common binops (e.g. add, sub, mul, div) - - def generate_series_reduction(ser_reduction, ser_method): @overload_method(SeriesType, ser_reduction) def series_reduction(series): @@ -497,8 +490,6 @@ def index_get_loc_impl(index, item): # Indexing for Series/Index - - @overload(operator.getitem) def series_indexing(series, item): if isinstance(series, SeriesType): diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 06d3e8834c821..ace37c678b639 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -1,8 +1,3 @@ -# pyright: reportUnusedImport=false -# Disabled since there's no way to do an ignore for both pyright -# and ruff, and ruff should be sufficient -# (The reason we need this is because the import of the numba extensions is unused -# but is necessary to register the extensions) from __future__ import annotations import abc @@ -1130,16 +1125,13 @@ def series_generator(self) -> Generator[Series, None, None]: def generate_numba_apply_func( func, nogil=True, nopython=True, parallel=False ) -> Callable[[npt.NDArray, Index, Index], dict[int, Any]]: + numba = import_optional_dependency("numba") from pandas import Series - # Dummy import just to make the extensions loaded in - # This isn't an entrypoint since we don't want users - # using Series/DF in numba code outside of apply - from pandas.core._numba.extensions import SeriesType # noqa: F401 + # Import helper from extensions to cast string object -> np strings + # Note: This also has the side effect of loading our numba extensions from pandas.core._numba.extensions import maybe_cast_str - numba = import_optional_dependency("numba") - jitted_udf = numba.extending.register_jitable(func) @numba.jit(nogil=nogil, nopython=nopython, parallel=parallel) @@ -1156,11 +1148,11 @@ def numba_func(values, col_names, df_index): return numba_func def apply_with_numba(self) -> dict[int, Any]: - from pandas.core._numba.extensions import set_numba_data - nb_func = self.generate_numba_apply_func( cast(Callable, self.func), **self.engine_kwargs ) + from pandas.core._numba.extensions import set_numba_data + # Convert from numba dict to regular dict # Our isinstance checks in the df constructor don't pass for numbas typed dict with set_numba_data(self.obj.index) as index, set_numba_data( @@ -1257,15 +1249,10 @@ def 
series_generator(self) -> Generator[Series, None, None]: def generate_numba_apply_func( func, nogil=True, nopython=True, parallel=False ) -> Callable[[npt.NDArray, Index, Index], dict[int, Any]]: - # Dummy import just to make the extensions loaded in - # This isn't an entrypoint since we don't want users - # using Series/DF in numba code outside of apply + numba = import_optional_dependency("numba") from pandas import Series - from pandas.core._numba.extensions import SeriesType # noqa: F401 from pandas.core._numba.extensions import maybe_cast_str - numba = import_optional_dependency("numba") - jitted_udf = numba.extending.register_jitable(func) @numba.jit(nogil=nogil, nopython=nopython, parallel=parallel) @@ -1286,12 +1273,12 @@ def numba_func(values, col_names_index, index): return numba_func def apply_with_numba(self) -> dict[int, Any]: - from pandas.core._numba.extensions import set_numba_data - nb_func = self.generate_numba_apply_func( cast(Callable, self.func), **self.engine_kwargs ) + from pandas.core._numba.extensions import set_numba_data + # Convert from numba dict to regular dict # Our isinstance checks in the df constructor don't pass for numbas typed dict with set_numba_data(self.obj.index) as index, set_numba_data( From cca34f937e0de1ca298965679e97468a9c399657 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Thu, 12 Oct 2023 18:40:00 -0400 Subject: [PATCH 15/18] specify dtypes --- pandas/tests/apply/test_numba.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py index 9b89bfdad5b51..5ae57f18d8467 100644 --- a/pandas/tests/apply/test_numba.py +++ b/pandas/tests/apply/test_numba.py @@ -44,7 +44,9 @@ def test_numba_vs_python_reductions(float_frame, reduction, apply_axis): @pytest.mark.parametrize("colnames", [[1, 2, 3], [1.0, 2.0, 3.0]]) def test_numba_numeric_colnames(colnames): # Check that numeric column names lower properly and can be indxed on - df = DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), columns=colnames) + df = DataFrame( + np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.int64), columns=colnames + ) first_col = colnames[0] f = lambda x: x[first_col] # Get the first column result = df.apply(f, engine="numba", axis=1) From f86024fbe094a0942e4474d60309130543670687 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 16 Oct 2023 16:57:11 -0400 Subject: [PATCH 16/18] address code review --- pandas/core/apply.py | 4 ++++ pandas/tests/apply/test_numba.py | 18 +++++++++++------- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index ace37c678b639..c8569fff564ba 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -1134,6 +1134,8 @@ def generate_numba_apply_func( jitted_udf = numba.extending.register_jitable(func) + # Currently the parallel argument doesn't get passed through here + # (it's disabled) since the dicts in numba aren't thread-safe. @numba.jit(nogil=nogil, nopython=nopython, parallel=parallel) def numba_func(values, col_names, df_index): results = {} @@ -1258,6 +1260,8 @@ def generate_numba_apply_func( @numba.jit(nogil=nogil, nopython=nopython, parallel=parallel) def numba_func(values, col_names_index, index): results = {} + # Currently the parallel argument doesn't get passed through here + # (it's disabled) since the dicts in numba aren't thread-safe. 
for i in range(values.shape[0]): # Create the series # TODO: values corrupted without the copy diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py index 5ae57f18d8467..ba317b2a9fc1b 100644 --- a/pandas/tests/apply/test_numba.py +++ b/pandas/tests/apply/test_numba.py @@ -19,15 +19,19 @@ def test_numba_vs_python_noop(float_frame, apply_axis): tm.assert_frame_equal(result, expected) -def test_numba_vs_python_indexing(float_frame): - row_func = lambda x: x["A"] - result = float_frame.apply(row_func, engine="numba", axis=1) - expected = float_frame.apply(row_func, engine="python", axis=1) +def test_numba_vs_python_indexing(): + frame = DataFrame( + {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7.0, 8.0, 9.0]}, + index=Index(["A", "B", "C"]), + ) + row_func = lambda x: x["c"] + result = frame.apply(row_func, engine="numba", axis=1) + expected = frame.apply(row_func, engine="python", axis=1) tm.assert_series_equal(result, expected) - row_func = lambda x: x["ZqgszYBfuL"] # This is a label in the index - result = float_frame.apply(row_func, engine="numba", axis=0) - expected = float_frame.apply(row_func, engine="python", axis=0) + col_func = lambda x: x["A"] + result = frame.apply(col_func, engine="numba", axis=0) + expected = frame.apply(col_func, engine="python", axis=0) tm.assert_series_equal(result, expected) From a15293d5f718d411b26e9af37654e5eb4d4be784 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Wed, 18 Oct 2023 21:50:16 -0400 Subject: [PATCH 17/18] add errors for invalid columns --- pandas/core/apply.py | 17 +++++++++++++++++ pandas/tests/apply/test_numba.py | 18 ++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index c8569fff564ba..dec1d0bc3f78f 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -38,7 +38,9 @@ from pandas.core.dtypes.cast import is_nested_object from pandas.core.dtypes.common import ( is_dict_like, + is_extension_array_dtype, is_list_like, + is_numeric_dtype, is_sequence, ) from pandas.core.dtypes.dtypes import ( @@ -824,6 +826,20 @@ def generate_numba_apply_func( def apply_with_numba(self): pass + def validate_values_for_numba(self): + # Validate column dtyps all OK + for colname, dtype in self.obj.dtypes.items(): + if not is_numeric_dtype(dtype): + raise ValueError( + f"Column {colname} must have a numeric dtype." + f"Found '{dtype}' instead" + ) + if is_extension_array_dtype(dtype): + raise ValueError( + f"Column {colname} is backed by an extension array," + f"which is not supported by the numba engine." 
+ ) + @abc.abstractmethod def wrap_results_for_axis( self, results: ResType, res_index: Index @@ -1078,6 +1094,7 @@ def apply_series_numba(self): raise NotImplementedError( "The index/columns must be unique when raw=False and engine='numba'" ) + self.validate_values_for_numba() results = self.apply_with_numba() return results, self.result_index diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py index ba317b2a9fc1b..5537bca583eba 100644 --- a/pandas/tests/apply/test_numba.py +++ b/pandas/tests/apply/test_numba.py @@ -75,3 +75,21 @@ def test_numba_nonunique_unsupported(apply_axis): match="The index/columns must be unique when raw=False and engine='numba'", ): df.apply(f, engine="numba", axis=apply_axis) + + +def test_numba_unsupported_dtypes(apply_axis): + f = lambda x: x + df = DataFrame({"a": [1, 2], "b": ["a", "b"], "c": [4, 5]}) + df["c"] = df["c"].astype("double[pyarrow]") + + with pytest.raises( + ValueError, match="Column b must have a numeric dtype. Found 'object' instead" + ): + df.apply(f, engine="numba", axis=apply_axis) + + with pytest.raises( + ValueError, + match="Column c is backed by an extension array," + "which is not supported by the numba engine.", + ): + df["c"].to_frame().apply(f, engine="numba", axis=apply_axis) From 8fe5d8964a59df5b2b69298ece8a009ffba96455 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Thu, 19 Oct 2023 12:11:48 -0400 Subject: [PATCH 18/18] adjust message --- pandas/core/apply.py | 4 ++-- pandas/tests/apply/test_numba.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index dec1d0bc3f78f..3b79882d3c762 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -831,12 +831,12 @@ def validate_values_for_numba(self): for colname, dtype in self.obj.dtypes.items(): if not is_numeric_dtype(dtype): raise ValueError( - f"Column {colname} must have a numeric dtype." + f"Column {colname} must have a numeric dtype. " f"Found '{dtype}' instead" ) if is_extension_array_dtype(dtype): raise ValueError( - f"Column {colname} is backed by an extension array," + f"Column {colname} is backed by an extension array, " f"which is not supported by the numba engine." ) diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py index 5537bca583eba..7e1e44d2119f9 100644 --- a/pandas/tests/apply/test_numba.py +++ b/pandas/tests/apply/test_numba.py @@ -89,7 +89,7 @@ def test_numba_unsupported_dtypes(apply_axis): with pytest.raises( ValueError, - match="Column c is backed by an extension array," + match="Column c is backed by an extension array, " "which is not supported by the numba engine.", ): df["c"].to_frame().apply(f, engine="numba", axis=apply_axis)
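

A minimal sketch of the set_numba_data contract introduced in pandas/core/_numba/extensions.py, to make the context-manager comment in that file concrete. This is illustrative only, assumes these patches are applied, and the index values below are made-up examples.

from pandas import Index
from pandas.core._numba.extensions import set_numba_data

idx = Index(["a", "b", "c"])  # object dtype holding only strings

# Inside the block the Index temporarily carries _numba_data, the labels cast
# to a numpy "U" array, which typeof_index/unbox_index read instead of
# Index._data. Non-string object labels raise the "string or numeric column
# names" ValueError instead.
with set_numba_data(idx) as numba_idx:
    assert numba_idx._numba_data.dtype.kind == "U"

# The attribute is removed again on exit, leaving the Index untouched.
assert not hasattr(idx, "_numba_data")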
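
And a user-facing sketch of the engine="numba" path these patches add to DataFrame.apply, mirroring the cases exercised in pandas/tests/apply/test_numba.py. It assumes numba is installed and the patches above are applied; the frame contents, labels, and lambdas are made-up examples rather than anything from the series itself.

import pandas as pd

df = pd.DataFrame(
    {"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]},
    index=pd.Index(["x", "y", "z"]),
)

# Row-wise apply: each row reaches the jitted UDF as a Series indexed by the
# column labels, with the row label as its name.
row_sums = df.apply(lambda row: row.sum(), axis=1, engine="numba")

# Column-wise apply along axis=0 works the same way.
col_means = df.apply(lambda col: col.mean(), axis=0, engine="numba")

# engine_kwargs are forwarded to numba.jit; parallel=True is rejected when
# raw=False because the result dict is not thread-safe.
row_max = df.apply(
    lambda row: row.max(), axis=1, engine="numba", engine_kwargs={"nogil": True}
)

# Labels must be unique and column dtypes numeric (and not extension-backed),
# otherwise the checks added in apply_series_numba/validate_values_for_numba raise.
dup = pd.DataFrame({"a": [1, 2]}, index=pd.Index(["d", "d"]))
try:
    dup.apply(lambda row: row, axis=1, engine="numba")
except NotImplementedError:
    pass  # "The index/columns must be unique when raw=False and engine='numba'"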