diff --git a/mkl_fft/_pydfti.pyx b/mkl_fft/_pydfti.pyx index 58947e3..f2826b2 100644 --- a/mkl_fft/_pydfti.pyx +++ b/mkl_fft/_pydfti.pyx @@ -34,18 +34,50 @@ except ImportError: from numpy.core._multiarray_tests import internal_overlap from libc.string cimport memcpy +cimport cpython.pycapsule +from cpython.exc cimport (PyErr_Occurred, PyErr_Clear) +from cpython.mem cimport (PyMem_Malloc, PyMem_Free) + +from threading import local as threading_local + +# thread-local storage +_tls = threading_local() + +cdef const char *capsule_name = "dfti_cache" + +cdef void _capsule_destructor(object caps): + cdef DftiCache *_cache = NULL + cdef int status = 0 + if (caps is None): + print("Nothing to destroy") + return + _cache = cpython.pycapsule.PyCapsule_GetPointer(caps, capsule_name) + status = _free_dfti_cache(_cache) + PyMem_Free(_cache) + if (status != 0): + raise ValueError("Internal Error: Freeing DFTI Cache returned with error = {}".format(status)) + + +def _tls_dfti_cache_capsule(): + cdef DftiCache *_cache_struct + + init = getattr(_tls, 'initialized', None) + if (init is None): + _cache_struct = PyMem_Malloc(sizeof(DftiCache)); + # important to initialized + _cache_struct.initialized = 0 + _cache_struct.hand = NULL + _tls.initialized = True + _tls.capsule = cpython.pycapsule.PyCapsule_New(_cache_struct, capsule_name, &_capsule_destructor) + capsule = getattr(_tls, 'capsule', None) + if (not cpython.pycapsule.PyCapsule_IsValid(capsule, capsule_name)): + raise ValueError("Internal Error: invalid capsule stored in TLS") + return capsule -from threading import Lock -_lock = Lock() cdef extern from "Python.h": ctypedef int size_t - void* PyMem_Malloc(size_t n) - void PyMem_Free(void* buf) - - int PyErr_Occurred() - void PyErr_Clear() long PyInt_AsLong(object ob) int PyObject_HasAttrString(object, char*) @@ -58,32 +90,36 @@ cdef extern from *: object PyArray_BASE(cnp.ndarray) cdef extern from "src/mklfft.h": - int cdouble_mkl_fft1d_in(cnp.ndarray, int, int) - int 
cfloat_mkl_fft1d_in(cnp.ndarray, int, int) - int float_cfloat_mkl_fft1d_out(cnp.ndarray, int, int, cnp.ndarray, int) - int cfloat_cfloat_mkl_fft1d_out(cnp.ndarray, int, int, cnp.ndarray) - int double_cdouble_mkl_fft1d_out(cnp.ndarray, int, int, cnp.ndarray, int) - int cdouble_cdouble_mkl_fft1d_out(cnp.ndarray, int, int, cnp.ndarray) - - int cdouble_mkl_ifft1d_in(cnp.ndarray, int, int) - int cfloat_mkl_ifft1d_in(cnp.ndarray, int, int) - int float_cfloat_mkl_ifft1d_out(cnp.ndarray, int, int, cnp.ndarray, int) - int cfloat_cfloat_mkl_ifft1d_out(cnp.ndarray, int, int, cnp.ndarray) - int double_cdouble_mkl_ifft1d_out(cnp.ndarray, int, int, cnp.ndarray, int) - int cdouble_cdouble_mkl_ifft1d_out(cnp.ndarray, int, int, cnp.ndarray) - - int double_mkl_rfft_in(cnp.ndarray, int, int) - int double_mkl_irfft_in(cnp.ndarray, int, int) - int float_mkl_rfft_in(cnp.ndarray, int, int) - int float_mkl_irfft_in(cnp.ndarray, int, int) - - int double_double_mkl_rfft_out(cnp.ndarray, int, int, cnp.ndarray) - int double_double_mkl_irfft_out(cnp.ndarray, int, int, cnp.ndarray) - int float_float_mkl_rfft_out(cnp.ndarray, int, int, cnp.ndarray) - int float_float_mkl_irfft_out(cnp.ndarray, int, int, cnp.ndarray) - - int cdouble_double_mkl_irfft_out(cnp.ndarray, int, int, cnp.ndarray) - int cfloat_float_mkl_irfft_out(cnp.ndarray, int, int, cnp.ndarray) + cdef struct DftiCache: + void * hand + int initialized + int _free_dfti_cache(DftiCache *) + int cdouble_mkl_fft1d_in(cnp.ndarray, int, int, DftiCache*) + int cfloat_mkl_fft1d_in(cnp.ndarray, int, int, DftiCache*) + int float_cfloat_mkl_fft1d_out(cnp.ndarray, int, int, cnp.ndarray, int, DftiCache*) + int cfloat_cfloat_mkl_fft1d_out(cnp.ndarray, int, int, cnp.ndarray, DftiCache*) + int double_cdouble_mkl_fft1d_out(cnp.ndarray, int, int, cnp.ndarray, int, DftiCache*) + int cdouble_cdouble_mkl_fft1d_out(cnp.ndarray, int, int, cnp.ndarray, DftiCache*) + + int cdouble_mkl_ifft1d_in(cnp.ndarray, int, int, DftiCache*) + int 
cfloat_mkl_ifft1d_in(cnp.ndarray, int, int, DftiCache*) + int float_cfloat_mkl_ifft1d_out(cnp.ndarray, int, int, cnp.ndarray, int, DftiCache*) + int cfloat_cfloat_mkl_ifft1d_out(cnp.ndarray, int, int, cnp.ndarray, DftiCache*) + int double_cdouble_mkl_ifft1d_out(cnp.ndarray, int, int, cnp.ndarray, int, DftiCache*) + int cdouble_cdouble_mkl_ifft1d_out(cnp.ndarray, int, int, cnp.ndarray, DftiCache*) + + int double_mkl_rfft_in(cnp.ndarray, int, int, DftiCache*) + int double_mkl_irfft_in(cnp.ndarray, int, int, DftiCache*) + int float_mkl_rfft_in(cnp.ndarray, int, int, DftiCache*) + int float_mkl_irfft_in(cnp.ndarray, int, int, DftiCache*) + + int double_double_mkl_rfft_out(cnp.ndarray, int, int, cnp.ndarray, DftiCache*) + int double_double_mkl_irfft_out(cnp.ndarray, int, int, cnp.ndarray, DftiCache*) + int float_float_mkl_rfft_out(cnp.ndarray, int, int, cnp.ndarray, DftiCache*) + int float_float_mkl_irfft_out(cnp.ndarray, int, int, cnp.ndarray, DftiCache*) + + int cdouble_double_mkl_irfft_out(cnp.ndarray, int, int, cnp.ndarray, DftiCache*) + int cfloat_float_mkl_irfft_out(cnp.ndarray, int, int, cnp.ndarray, DftiCache*) int cdouble_cdouble_mkl_fftnd_in(cnp.ndarray) int cdouble_cdouble_mkl_ifftnd_in(cnp.ndarray) @@ -268,6 +304,7 @@ def _fft1d_impl(x, n=None, axis=-1, overwrite_arg=False, direction=+1): cdef int ALL_HARMONICS = 1 cdef char * c_error_msg = NULL cdef bytes py_error_msg + cdef DftiCache *_cache x_arr = __process_arguments(x, n, axis, overwrite_arg, direction, &axis_, &n_, &in_place, &xnd, &dir_, 0) @@ -295,19 +332,20 @@ def _fft1d_impl(x, n=None, axis=-1, overwrite_arg=False, direction=+1): in_place = 1 if in_place: - with _lock: - if x_type is cnp.NPY_CDOUBLE: - if dir_ < 0: - status = cdouble_mkl_ifft1d_in(x_arr, n_, axis_) - else: - status = cdouble_mkl_fft1d_in(x_arr, n_, axis_) - elif x_type is cnp.NPY_CFLOAT: - if dir_ < 0: - status = cfloat_mkl_ifft1d_in(x_arr, n_, axis_) - else: - status = cfloat_mkl_fft1d_in(x_arr, n_, axis_) + _cache_capsule = 
_tls_dfti_cache_capsule() + _cache = cpython.pycapsule.PyCapsule_GetPointer(_cache_capsule, capsule_name) + if x_type is cnp.NPY_CDOUBLE: + if dir_ < 0: + status = cdouble_mkl_ifft1d_in(x_arr, n_, axis_, _cache) + else: + status = cdouble_mkl_fft1d_in(x_arr, n_, axis_, _cache) + elif x_type is cnp.NPY_CFLOAT: + if dir_ < 0: + status = cfloat_mkl_ifft1d_in(x_arr, n_, axis_, _cache) else: - status = 1 + status = cfloat_mkl_fft1d_in(x_arr, n_, axis_, _cache) + else: + status = 1 if status: c_error_msg = mkl_dfti_error(status) @@ -327,37 +365,38 @@ def _fft1d_impl(x, n=None, axis=-1, overwrite_arg=False, direction=+1): f_arr = __allocate_result(x_arr, n_, axis_, f_type); # call out-of-place FFT - with _lock: - if f_type is cnp.NPY_CDOUBLE: - if x_type is cnp.NPY_DOUBLE: - if dir_ < 0: - status = double_cdouble_mkl_ifft1d_out( - x_arr, n_, axis_, f_arr, ALL_HARMONICS) - else: - status = double_cdouble_mkl_fft1d_out( - x_arr, n_, axis_, f_arr, ALL_HARMONICS) - elif x_type is cnp.NPY_CDOUBLE: - if dir_ < 0: - status = cdouble_cdouble_mkl_ifft1d_out( - x_arr, n_, axis_, f_arr) - else: - status = cdouble_cdouble_mkl_fft1d_out( - x_arr, n_, axis_, f_arr) - else: - if x_type is cnp.NPY_FLOAT: - if dir_ < 0: - status = float_cfloat_mkl_ifft1d_out( - x_arr, n_, axis_, f_arr, ALL_HARMONICS) - else: - status = float_cfloat_mkl_fft1d_out( - x_arr, n_, axis_, f_arr, ALL_HARMONICS) - elif x_type is cnp.NPY_CFLOAT: - if dir_ < 0: - status = cfloat_cfloat_mkl_ifft1d_out( - x_arr, n_, axis_, f_arr) - else: - status = cfloat_cfloat_mkl_fft1d_out( - x_arr, n_, axis_, f_arr) + _cache_capsule = _tls_dfti_cache_capsule() + _cache = cpython.pycapsule.PyCapsule_GetPointer(_cache_capsule, capsule_name) + if f_type is cnp.NPY_CDOUBLE: + if x_type is cnp.NPY_DOUBLE: + if dir_ < 0: + status = double_cdouble_mkl_ifft1d_out( + x_arr, n_, axis_, f_arr, ALL_HARMONICS, _cache) + else: + status = double_cdouble_mkl_fft1d_out( + x_arr, n_, axis_, f_arr, ALL_HARMONICS, _cache) + elif x_type is 
cnp.NPY_CDOUBLE: + if dir_ < 0: + status = cdouble_cdouble_mkl_ifft1d_out( + x_arr, n_, axis_, f_arr, _cache) + else: + status = cdouble_cdouble_mkl_fft1d_out( + x_arr, n_, axis_, f_arr, _cache) + else: + if x_type is cnp.NPY_FLOAT: + if dir_ < 0: + status = float_cfloat_mkl_ifft1d_out( + x_arr, n_, axis_, f_arr, ALL_HARMONICS, _cache) + else: + status = float_cfloat_mkl_fft1d_out( + x_arr, n_, axis_, f_arr, ALL_HARMONICS, _cache) + elif x_type is cnp.NPY_CFLOAT: + if dir_ < 0: + status = cfloat_cfloat_mkl_ifft1d_out( + x_arr, n_, axis_, f_arr, _cache) + else: + status = cfloat_cfloat_mkl_fft1d_out( + x_arr, n_, axis_, f_arr, _cache) if (status): c_error_msg = mkl_dfti_error(status) @@ -388,6 +427,7 @@ def _rrfft1d_impl(x, n=None, axis=-1, overwrite_arg=False, direction=+1): cdef int x_type, status cdef char * c_error_msg = NULL cdef bytes py_error_msg + cdef DftiCache *_cache x_arr = __process_arguments(x, n, axis, overwrite_arg, direction, &axis_, &n_, &in_place, &xnd, &dir_, 1) @@ -413,19 +453,20 @@ def _rrfft1d_impl(x, n=None, axis=-1, overwrite_arg=False, direction=+1): in_place = 1 if in_place: - with _lock: - if x_type is cnp.NPY_DOUBLE: - if dir_ < 0: - status = double_mkl_irfft_in(x_arr, n_, axis_) - else: - status = double_mkl_rfft_in(x_arr, n_, axis_) - elif x_type is cnp.NPY_FLOAT: - if dir_ < 0: - status = float_mkl_irfft_in(x_arr, n_, axis_) - else: - status = float_mkl_rfft_in(x_arr, n_, axis_) + _cache_capsule = _tls_dfti_cache_capsule() + _cache = cpython.pycapsule.PyCapsule_GetPointer(_cache_capsule, capsule_name) + if x_type is cnp.NPY_DOUBLE: + if dir_ < 0: + status = double_mkl_irfft_in(x_arr, n_, axis_, _cache) + else: + status = double_mkl_rfft_in(x_arr, n_, axis_, _cache) + elif x_type is cnp.NPY_FLOAT: + if dir_ < 0: + status = float_mkl_irfft_in(x_arr, n_, axis_, _cache) else: - status = 1 + status = float_mkl_rfft_in(x_arr, n_, axis_, _cache) + else: + status = 1 if status: c_error_msg = mkl_dfti_error(status) @@ -443,17 +484,18 @@ def 
_rrfft1d_impl(x, n=None, axis=-1, overwrite_arg=False, direction=+1): f_arr = __allocate_result(x_arr, n_, axis_, x_type); # call out-of-place FFT - with _lock: - if x_type is cnp.NPY_DOUBLE: - if dir_ < 0: - status = double_double_mkl_irfft_out(x_arr, n_, axis_, f_arr) - else: - status = double_double_mkl_rfft_out(x_arr, n_, axis_, f_arr) + _cache_capsule = _tls_dfti_cache_capsule() + _cache = cpython.pycapsule.PyCapsule_GetPointer(_cache_capsule, capsule_name) + if x_type is cnp.NPY_DOUBLE: + if dir_ < 0: + status = double_double_mkl_irfft_out(x_arr, n_, axis_, f_arr, _cache) else: - if dir_ < 0: - status = float_float_mkl_irfft_out(x_arr, n_, axis_, f_arr) - else: - status = float_float_mkl_rfft_out(x_arr, n_, axis_, f_arr) + status = double_double_mkl_rfft_out(x_arr, n_, axis_, f_arr, _cache) + else: + if dir_ < 0: + status = float_float_mkl_irfft_out(x_arr, n_, axis_, f_arr, _cache) + else: + status = float_float_mkl_rfft_out(x_arr, n_, axis_, f_arr, _cache) if (status): c_error_msg = mkl_dfti_error(status) @@ -479,6 +521,7 @@ def _rc_fft1d_impl(x, n=None, axis=-1, overwrite_arg=False): cdef int direction = 1 # dummy, only used for the sake of arg-processing cdef char * c_error_msg = NULL cdef bytes py_error_msg + cdef DftiCache *_cache x_arr = __process_arguments(x, n, axis, overwrite_arg, direction, &axis_, &n_, &in_place, &xnd, &dir_, 1) @@ -509,11 +552,13 @@ def _rc_fft1d_impl(x, n=None, axis=-1, overwrite_arg=False): # call out-of-place FFT if x_type is cnp.NPY_FLOAT: - with _lock: - status = float_cfloat_mkl_fft1d_out(x_arr, n_, axis_, f_arr, HALF_HARMONICS) + _cache_capsule = _tls_dfti_cache_capsule() + _cache = cpython.pycapsule.PyCapsule_GetPointer(_cache_capsule, capsule_name) + status = float_cfloat_mkl_fft1d_out(x_arr, n_, axis_, f_arr, HALF_HARMONICS, _cache) else: - with _lock: - status = double_cdouble_mkl_fft1d_out(x_arr, n_, axis_, f_arr, HALF_HARMONICS) + _cache_capsule = _tls_dfti_cache_capsule() + _cache = 
cpython.pycapsule.PyCapsule_GetPointer(_cache_capsule, capsule_name) + status = double_cdouble_mkl_fft1d_out(x_arr, n_, axis_, f_arr, HALF_HARMONICS, _cache) if (status): c_error_msg = mkl_dfti_error(status) @@ -553,6 +598,7 @@ def _rc_ifft1d_impl(x, n=None, axis=-1, overwrite_arg=False): cdef int direction = 1 # dummy, only used for the sake of arg-processing cdef char * c_error_msg = NULL cdef bytes py_error_msg + cdef DftiCache *_cache int_n = _is_integral(n) # nn gives the number elements along axis of the input that we use @@ -591,11 +637,13 @@ def _rc_ifft1d_impl(x, n=None, axis=-1, overwrite_arg=False): # call out-of-place FFT if x_type is cnp.NPY_CFLOAT: - with _lock: - status = cfloat_float_mkl_irfft_out(x_arr, n_, axis_, f_arr) + _cache_capsule = _tls_dfti_cache_capsule() + _cache = cpython.pycapsule.PyCapsule_GetPointer(_cache_capsule, capsule_name) + status = cfloat_float_mkl_irfft_out(x_arr, n_, axis_, f_arr, _cache) else: - with _lock: - status = cdouble_double_mkl_irfft_out(x_arr, n_, axis_, f_arr) + _cache_capsule = _tls_dfti_cache_capsule() + _cache = cpython.pycapsule.PyCapsule_GetPointer(_cache_capsule, capsule_name) + status = cdouble_double_mkl_irfft_out(x_arr, n_, axis_, f_arr, _cache) if (status): c_error_msg = mkl_dfti_error(status) diff --git a/mkl_fft/src/mklfft.c.src b/mkl_fft/src/mklfft.c.src index 7f9e5b3..4e97a72 100644 --- a/mkl_fft/src/mklfft.c.src +++ b/mkl_fft/src/mklfft.c.src @@ -94,25 +94,16 @@ static NPY_INLINE void get_basic_array_data( Routines for working with the cached FFT descriptor * ============================================================================= */ -typedef struct DftiCache { - DFTI_DESCRIPTOR_HANDLE hand; - int initialized; -} DftiCache; - - -/* make sure to initialize these */ -static struct DftiCache dftiCache = {NULL, NO}; - - -static void freeCache(void) -{ - if(dftiCache.initialized && dftiCache.hand) { - MKL_LONG status = DftiFreeDescriptor(&dftiCache.hand); +int _free_dfti_cache(DftiCache 
*dfti_cache) { + if(dfti_cache->initialized && dfti_cache->hand) { + MKL_LONG status = DftiFreeDescriptor(&(dfti_cache->hand)); _debug_print("Descriptor freed: %ld\n", status); - assert(status == 0); + return (int) status; } + + return 0; } /**begin repeat @@ -121,31 +112,31 @@ static void freeCache(void) * #sc_t=float*2,double*2# */ static MKL_LONG -__create_descriptor_1d_@prec@_@domain@(MKL_LONG len, @sc_t@ fsc, @sc_t@ bsc) +__create_descriptor_1d_@prec@_@domain@(MKL_LONG len, @sc_t@ fsc, @sc_t@ bsc, DftiCache *dfti_cache) { MKL_LONG status = 0; - if (dftiCache.initialized && dftiCache.hand) { + if (dfti_cache->initialized && dfti_cache->hand) { enum DFTI_CONFIG_VALUE cached_prec, cached_dom; MKL_LONG cached_rank, cached_len; @sc_t@ cached_sc; - status = DftiGetValue(dftiCache.hand, DFTI_DIMENSION, &cached_rank); + status = DftiGetValue(dfti_cache->hand, DFTI_DIMENSION, &cached_rank); if (0 != status || cached_rank != 1) goto reallocate; - status = DftiGetValue(dftiCache.hand, DFTI_PRECISION, &cached_prec); + status = DftiGetValue(dfti_cache->hand, DFTI_PRECISION, &cached_prec); if (0 != status || cached_prec != @prec@) goto reallocate; - status = DftiGetValue(dftiCache.hand, DFTI_FORWARD_DOMAIN, &cached_dom); + status = DftiGetValue(dfti_cache->hand, DFTI_FORWARD_DOMAIN, &cached_dom); if (0 != status || cached_dom != @domain@) goto reallocate; - status = DftiGetValue(dftiCache.hand, DFTI_LENGTHS, &cached_len); + status = DftiGetValue(dfti_cache->hand, DFTI_LENGTHS, &cached_len); if (0 != status || cached_len != len) goto reallocate; - status = DftiGetValue(dftiCache.hand, DFTI_FORWARD_SCALE, &cached_sc); + status = DftiGetValue(dfti_cache->hand, DFTI_FORWARD_SCALE, &cached_sc); if (0 != status || cached_sc != fsc) goto set_scales; - status = DftiGetValue(dftiCache.hand, DFTI_BACKWARD_SCALE, &cached_sc); + status = DftiGetValue(dfti_cache->hand, DFTI_BACKWARD_SCALE, &cached_sc); if (0 != status || cached_sc != bsc) goto set_scales; return status; @@ -154,28 
+145,25 @@ __create_descriptor_1d_@prec@_@domain@(MKL_LONG len, @sc_t@ fsc, @sc_t@ bsc) allocate_new: status = DftiCreateDescriptor( - &dftiCache.hand, + &(dfti_cache->hand), @prec@, @domain@, 1, len ); - if (!dftiCache.initialized) { - atexit(freeCache); - } - dftiCache.initialized = (status == 0) ? 1 : 0; + dfti_cache->initialized = (status == 0) ? 1 : 0; set_scales: - status = DftiSetValue(dftiCache.hand, DFTI_FORWARD_SCALE, fsc); + status = DftiSetValue(dfti_cache->hand, DFTI_FORWARD_SCALE, fsc); if (0 != status) return status; - status = DftiSetValue(dftiCache.hand, DFTI_BACKWARD_SCALE, bsc); + status = DftiSetValue(dfti_cache->hand, DFTI_BACKWARD_SCALE, bsc); return status; reallocate: - if(dftiCache.hand) { - status = DftiFreeDescriptor(&dftiCache.hand); + if(dfti_cache->hand) { + status = DftiFreeDescriptor(&(dfti_cache->hand)); assert(status == 0); } @@ -193,18 +181,18 @@ __longp_equal_1d(MKL_LONG* a, MKL_LONG* b) } static MKL_LONG -__set_descriptor_1d_value_longp(enum DFTI_CONFIG_PARAM par, MKL_LONG *val) +__set_descriptor_1d_value_longp(enum DFTI_CONFIG_PARAM par, MKL_LONG *val, DftiCache *dfti_cache) { MKL_LONG status = 0; MKL_LONG cached_val[2] = {0,0}; - assert(dftiCache.initialized && dftiCache.hand); + assert(dfti_cache->initialized && dfti_cache->hand); - status = DftiGetValue(dftiCache.hand, par, cached_val); + status = DftiGetValue(dfti_cache->hand, par, cached_val); if (0 == status && __longp_equal_1d(cached_val, val)) return status; - status = DftiSetValue(dftiCache.hand, par, val); + status = DftiSetValue(dfti_cache->hand, par, val); return status; } @@ -214,36 +202,36 @@ __set_descriptor_1d_value_longp(enum DFTI_CONFIG_PARAM par, MKL_LONG *val) * #type=MKL_LONG,enum DFTI_CONFIG_VALUE# */ static MKL_LONG -__set_descriptor_1d_value_@TYPE_NAME@(enum DFTI_CONFIG_PARAM par, @type@ val) +__set_descriptor_1d_value_@TYPE_NAME@(enum DFTI_CONFIG_PARAM par, @type@ val, DftiCache *dfti_cache) { MKL_LONG status = 0; @type@ cached_val; - 
assert(dftiCache.initialized && dftiCache.hand); + assert(dfti_cache->initialized && dfti_cache->hand); - status = DftiGetValue(dftiCache.hand, par, &cached_val); + status = DftiGetValue(dfti_cache->hand, par, &cached_val); if (0 == status && cached_val == val) return status; - status = DftiSetValue(dftiCache.hand, par, val); + status = DftiSetValue(dfti_cache->hand, par, val); return status; } /**end repeat**/ static MKL_LONG -__commit_descriptor_1d(void) +__commit_descriptor_1d(DftiCache *dfti_cache) { MKL_LONG status = 0; enum DFTI_CONFIG_VALUE cached_committed; - assert(dftiCache.initialized && dftiCache.hand); - status = DftiGetValue(dftiCache.hand, DFTI_COMMIT_STATUS, &cached_committed); + assert(dfti_cache->initialized && dfti_cache->hand); + status = DftiGetValue(dfti_cache->hand, DFTI_COMMIT_STATUS, &cached_committed); if(0 == status && cached_committed == DFTI_COMMITTED) return status; - status = DftiCommitDescriptor(dftiCache.hand); + status = DftiCommitDescriptor(dfti_cache->hand); return status; } @@ -253,13 +241,13 @@ __commit_descriptor_1d(void) * #DftiCompute_MODE=(DftiComputeForward)*4,(DftiComputeBackward)*4# */ static NPY_INLINE MKL_LONG -__cached_inplace_@DftiCompute_MODE@_@MKL_TYPE@(@MKL_TYPE@ *x) +__cached_inplace_@DftiCompute_MODE@_@MKL_TYPE@(@MKL_TYPE@ *x, DftiCache *dfti_cache) { MKL_LONG status = 0; - assert(dftiCache.initialized && dftiCache.hand); + assert(dfti_cache->initialized && dfti_cache->hand); Py_BEGIN_ALLOW_THREADS - status = @DftiCompute_MODE@(dftiCache.hand, x); + status = @DftiCompute_MODE@(dfti_cache->hand, x); Py_END_ALLOW_THREADS return status; @@ -273,13 +261,13 @@ __cached_inplace_@DftiCompute_MODE@_@MKL_TYPE@(@MKL_TYPE@ *x) */ static NPY_INLINE MKL_LONG __cached_notinplace_@DftiCompute_MODE@_@MKL_IN_TYPE@_@MKL_OUT_TYPE@( - @MKL_IN_TYPE@ *x_in, @MKL_OUT_TYPE@ *x_out) + @MKL_IN_TYPE@ *x_in, @MKL_OUT_TYPE@ *x_out, DftiCache *dfti_cache) { MKL_LONG status = 0; - assert(dftiCache.initialized && dftiCache.hand); + 
assert(dfti_cache->initialized && dfti_cache->hand); Py_BEGIN_ALLOW_THREADS - status = @DftiCompute_MODE@(dftiCache.hand, x_in, x_out); + status = @DftiCompute_MODE@(dfti_cache->hand, x_in, x_out); Py_END_ALLOW_THREADS return status; @@ -468,7 +456,7 @@ compute_strides_and_distances_inout( * #mode=(fft1d)*2,(ifft1d)*2# * #DftiCompute_MODE=(DftiComputeForward)*2,(DftiComputeBackward)*2# */ -int @name@_mkl_@mode@_in(PyArrayObject* x_inout, npy_intp n, int axis) +int @name@_mkl_@mode@_in(PyArrayObject* x_inout, npy_intp n, int axis, DftiCache* dfti_cache) { MKL_LONG status = 0, input_distance = 0, input_number_of_transforms = 1; @@ -506,44 +494,44 @@ int @name@_mkl_@mode@_in(PyArrayObject* x_inout, npy_intp n, int axis) &input_number_of_transforms, &input_distance); status = __create_descriptor_1d_@DFTI_PRECISION@_DFTI_COMPLEX( - _to_mkl_long(n), 1.0, 1.0/n); + _to_mkl_long(n), 1.0, 1.0/n, dfti_cache); if (status != 0) goto failed; /* these must be always set, since previous cached element may have had different values */ - status = __set_descriptor_1d_value_enum(DFTI_PLACEMENT, DFTI_INPLACE); + status = __set_descriptor_1d_value_enum(DFTI_PLACEMENT, DFTI_INPLACE, dfti_cache); if (status != 0) goto failed; status = __set_descriptor_1d_value_enum( - DFTI_COMPLEX_STORAGE, DFTI_COMPLEX_COMPLEX); + DFTI_COMPLEX_STORAGE, DFTI_COMPLEX_COMPLEX, dfti_cache); if (status != 0) goto failed; status = __set_descriptor_1d_value_longp( - DFTI_INPUT_STRIDES, input_strides); + DFTI_INPUT_STRIDES, input_strides, dfti_cache); if (status != 0) goto failed; if (input_number_of_transforms > 1) { status = __set_descriptor_1d_value_long(DFTI_NUMBER_OF_TRANSFORMS, - input_number_of_transforms); + input_number_of_transforms, dfti_cache); if (status != 0) goto failed; status = __set_descriptor_1d_value_long( - DFTI_INPUT_DISTANCE, input_distance); + DFTI_INPUT_DISTANCE, input_distance, dfti_cache); if (status != 0) goto failed; } else { /* it is important to set the number of transforms for 
cached descriptor */ status = __set_descriptor_1d_value_long( - DFTI_NUMBER_OF_TRANSFORMS, input_number_of_transforms); + DFTI_NUMBER_OF_TRANSFORMS, input_number_of_transforms, dfti_cache); if (status != 0) goto failed; } - status = __commit_descriptor_1d(); + status = __commit_descriptor_1d(dfti_cache); if (status != 0) goto failed; if (single_DftiCompute){ - status = __cached_inplace_@DftiCompute_MODE@_@MKL_TYPE@(x_data); + status = __cached_inplace_@DftiCompute_MODE@_@MKL_TYPE@(x_data, dfti_cache); if (status != 0) goto failed; } else { multi_iter_masked_t mit; @@ -567,7 +555,7 @@ int @name@_mkl_@mode@_in(PyArrayObject* x_inout, npy_intp n, int axis) tmp += x_strides[i] * MultiIter_IndexElem(mit, i); status = __cached_inplace_@DftiCompute_MODE@_@MKL_TYPE@( - (@MKL_TYPE@*) tmp); + (@MKL_TYPE@*) tmp, dfti_cache); if (status != 0) break; if (multi_iter_masked_next(&mit)) @@ -609,7 +597,7 @@ int @name@_mkl_@mode@_in(PyArrayObject* x_inout, npy_intp n, int axis) * #vml_conj_func=(vmcConj,vmzConj)*2# */ int @REALIN@_@COMPLEXOUT@_mkl_@mode@_out( - PyArrayObject *x_in, npy_intp n, int axis, PyArrayObject *x_out, int all_harmonics) + PyArrayObject *x_in, npy_intp n, int axis, PyArrayObject *x_out, int all_harmonics, DftiCache* dfti_cache) { MKL_LONG status = 0, input_distance = 0, output_distance = 0, input_number_of_transforms = 1; @@ -694,46 +682,46 @@ int @REALIN@_@COMPLEXOUT@_mkl_@mode@_out( backward_scale = 1.0/n; } status = __create_descriptor_1d_@DFTI_PRECISION@_DFTI_REAL( - _to_mkl_long(n), forward_scale, backward_scale); + _to_mkl_long(n), forward_scale, backward_scale, dfti_cache); if (status != 0) goto failed; - status = __set_descriptor_1d_value_longp(DFTI_INPUT_STRIDES, input_strides); + status = __set_descriptor_1d_value_longp(DFTI_INPUT_STRIDES, input_strides, dfti_cache); if (status != 0) goto failed; status = __set_descriptor_1d_value_enum( - DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX); + DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX, 
dfti_cache); if (status != 0) goto failed; - status = __set_descriptor_1d_value_enum(DFTI_PLACEMENT, DFTI_NOT_INPLACE); + status = __set_descriptor_1d_value_enum(DFTI_PLACEMENT, DFTI_NOT_INPLACE, dfti_cache); if (status != 0) goto failed; - status = __set_descriptor_1d_value_longp(DFTI_OUTPUT_STRIDES, output_strides); + status = __set_descriptor_1d_value_longp(DFTI_OUTPUT_STRIDES, output_strides, dfti_cache); if (status != 0) goto failed; if (input_number_of_transforms > 1) { status = __set_descriptor_1d_value_long( - DFTI_NUMBER_OF_TRANSFORMS, input_number_of_transforms); + DFTI_NUMBER_OF_TRANSFORMS, input_number_of_transforms, dfti_cache); if (status != 0) goto failed; - status = __set_descriptor_1d_value_long(DFTI_INPUT_DISTANCE, input_distance); + status = __set_descriptor_1d_value_long(DFTI_INPUT_DISTANCE, input_distance, dfti_cache); if (status != 0) goto failed; status = __set_descriptor_1d_value_long( - DFTI_OUTPUT_DISTANCE, output_distance); + DFTI_OUTPUT_DISTANCE, output_distance, dfti_cache); if (status != 0) goto failed; } else { status = __set_descriptor_1d_value_long( - DFTI_NUMBER_OF_TRANSFORMS, input_number_of_transforms); + DFTI_NUMBER_OF_TRANSFORMS, input_number_of_transforms, dfti_cache); if (status != 0) goto failed; } - status = __commit_descriptor_1d(); + status = __commit_descriptor_1d(dfti_cache); if (status != 0) goto failed; if (single_DftiCompute){ status = __cached_notinplace_DftiComputeForward_@MKL_IN_TYPE@_@MKL_OUT_TYPE@( - xin_data, xout_data); + xin_data, xout_data, dfti_cache); if (status != 0) goto failed; } else { multi_iter_masked_t mit; @@ -761,7 +749,7 @@ int @REALIN@_@COMPLEXOUT@_mkl_@mode@_out( } status = __cached_notinplace_DftiComputeForward_@MKL_IN_TYPE@_@MKL_OUT_TYPE@( - (@MKL_IN_TYPE@*) tmp1, (@MKL_OUT_TYPE@*) tmp2 ); + (@MKL_IN_TYPE@*) tmp1, (@MKL_OUT_TYPE@*) tmp2, dfti_cache); if (status != 0) break; if (multi_iter_masked_next(&mit)) @@ -875,7 +863,7 @@ int @REALIN@_@COMPLEXOUT@_mkl_@mode@_out( * 
#DftiCompute_MODE=(DftiComputeForward)*2,(DftiComputeBackward)*2# */ int @COMPLEXIN@_@COMPLEXOUT@_mkl_@mode@_out( - PyArrayObject *x_in, npy_intp n, int axis, PyArrayObject *x_out) + PyArrayObject *x_in, npy_intp n, int axis, PyArrayObject *x_out, DftiCache *dfti_cache) { MKL_LONG status = 0, input_distance = 0, output_distance = 0, input_number_of_transforms = 1; @@ -944,52 +932,52 @@ int @COMPLEXIN@_@COMPLEXOUT@_mkl_@mode@_out( ); status = __create_descriptor_1d_@DFTI_PRECISION@_DFTI_COMPLEX( - _to_mkl_long(n), 1.0, 1.0/n); + _to_mkl_long(n), 1.0, 1.0/n, dfti_cache); if (status != 0) goto failed; status = __set_descriptor_1d_value_enum( - DFTI_COMPLEX_STORAGE, DFTI_COMPLEX_COMPLEX); + DFTI_COMPLEX_STORAGE, DFTI_COMPLEX_COMPLEX, dfti_cache); if (status != 0) goto failed; status = __set_descriptor_1d_value_longp( - DFTI_INPUT_STRIDES, input_strides); + DFTI_INPUT_STRIDES, input_strides, dfti_cache); if (status != 0) goto failed; status = __set_descriptor_1d_value_enum( - DFTI_PLACEMENT, DFTI_NOT_INPLACE); + DFTI_PLACEMENT, DFTI_NOT_INPLACE, dfti_cache); if (status != 0) goto failed; status = __set_descriptor_1d_value_longp( - DFTI_OUTPUT_STRIDES, output_strides); + DFTI_OUTPUT_STRIDES, output_strides, dfti_cache); if (status != 0) goto failed; if (input_number_of_transforms > 1) { status = __set_descriptor_1d_value_long( - DFTI_NUMBER_OF_TRANSFORMS, input_number_of_transforms); + DFTI_NUMBER_OF_TRANSFORMS, input_number_of_transforms, dfti_cache); if (status != 0) goto failed; status = __set_descriptor_1d_value_long( - DFTI_INPUT_DISTANCE, input_distance); + DFTI_INPUT_DISTANCE, input_distance, dfti_cache); if (status != 0) goto failed; status = __set_descriptor_1d_value_long( - DFTI_OUTPUT_DISTANCE, output_distance); + DFTI_OUTPUT_DISTANCE, output_distance, dfti_cache); if (status != 0) goto failed; } else { assert(input_number_of_transforms == 1); status = __set_descriptor_1d_value_long( - DFTI_NUMBER_OF_TRANSFORMS, input_number_of_transforms); + 
DFTI_NUMBER_OF_TRANSFORMS, input_number_of_transforms, dfti_cache); if (status != 0) goto failed; } - status = __commit_descriptor_1d(); + status = __commit_descriptor_1d(dfti_cache); if (status != 0) goto failed; if (single_DftiCompute){ status = __cached_notinplace_@DftiCompute_MODE@_@MKL_TYPE@_@MKL_TYPE@( - xin_data, xout_data); + xin_data, xout_data, dfti_cache); if (status != 0) goto failed; } else { multi_iter_masked_t mit; @@ -1017,7 +1005,7 @@ int @COMPLEXIN@_@COMPLEXOUT@_mkl_@mode@_out( } status = __cached_notinplace_@DftiCompute_MODE@_@MKL_TYPE@_@MKL_TYPE@( - (@MKL_TYPE@*) tmp1, (@MKL_TYPE@*) tmp2); + (@MKL_TYPE@*) tmp1, (@MKL_TYPE@*) tmp2, dfti_cache); if (status != 0) break; if (multi_iter_masked_next(&mit)) @@ -1056,7 +1044,7 @@ int @COMPLEXIN@_@COMPLEXOUT@_mkl_@mode@_out( * #mode=(rfft)*2,(irfft)*2# * #DftiCompute_MODE=(DftiComputeForward)*2,(DftiComputeBackward)*2# */ -int @name@_mkl_@mode@_in(PyArrayObject* x_inout, npy_intp n, int axis) +int @name@_mkl_@mode@_in(PyArrayObject* x_inout, npy_intp n, int axis, DftiCache *dfti_cache) { MKL_LONG status = 0, input_distance = 0, input_number_of_transforms = 1; @@ -1093,44 +1081,44 @@ int @name@_mkl_@mode@_in(PyArrayObject* x_inout, npy_intp n, int axis) x_rank, x_shape, x_strides, x_itemsize, x_size, axis, &input_number_of_transforms, &input_distance); - status = __create_descriptor_1d_@DFTI_PRECISION@_DFTI_REAL(_to_mkl_long(n), 1.0, 1.0/n); + status = __create_descriptor_1d_@DFTI_PRECISION@_DFTI_REAL(_to_mkl_long(n), 1.0, 1.0/n, dfti_cache); if (status != 0) goto failed; /* these must be always set, since previous cached element may have had different values */ - status = __set_descriptor_1d_value_enum(DFTI_PLACEMENT, DFTI_INPLACE); + status = __set_descriptor_1d_value_enum(DFTI_PLACEMENT, DFTI_INPLACE, dfti_cache); if (status != 0) goto failed; status = __set_descriptor_1d_value_enum( - DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_REAL); + DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_REAL, dfti_cache); if 
(status != 0) goto failed; - status = __set_descriptor_1d_value_enum(DFTI_PACKED_FORMAT, DFTI_PACK_FORMAT); + status = __set_descriptor_1d_value_enum(DFTI_PACKED_FORMAT, DFTI_PACK_FORMAT, dfti_cache); if (status != 0) goto failed; - status = __set_descriptor_1d_value_longp(DFTI_INPUT_STRIDES, input_strides); + status = __set_descriptor_1d_value_longp(DFTI_INPUT_STRIDES, input_strides, dfti_cache); if (status != 0) goto failed; if (input_number_of_transforms > 1) { status = __set_descriptor_1d_value_long( - DFTI_NUMBER_OF_TRANSFORMS, input_number_of_transforms); + DFTI_NUMBER_OF_TRANSFORMS, input_number_of_transforms, dfti_cache); if (status != 0) goto failed; - status = __set_descriptor_1d_value_long(DFTI_INPUT_DISTANCE, input_distance); + status = __set_descriptor_1d_value_long(DFTI_INPUT_DISTANCE, input_distance, dfti_cache); if (status != 0) goto failed; } else { /* it is important to set the number of transforms for cached descriptor */ status = __set_descriptor_1d_value_long(DFTI_NUMBER_OF_TRANSFORMS, - input_number_of_transforms); + input_number_of_transforms, dfti_cache); if (status != 0) goto failed; } - status = __commit_descriptor_1d(); + status = __commit_descriptor_1d(dfti_cache); if (status != 0) goto failed; if (single_DftiCompute){ - status = __cached_inplace_@DftiCompute_MODE@_@MKL_TYPE@(x_data); + status = __cached_inplace_@DftiCompute_MODE@_@MKL_TYPE@(x_data, dfti_cache); if (status != 0) goto failed; } else { multi_iter_masked_t mit; @@ -1153,7 +1141,7 @@ int @name@_mkl_@mode@_in(PyArrayObject* x_inout, npy_intp n, int axis) for(tmp = (char *) x_data, i = 0; i < x_rank; i++) tmp += x_strides[i] * MultiIter_IndexElem(mit, i); - status = __cached_inplace_@DftiCompute_MODE@_@MKL_TYPE@((@MKL_TYPE@*) tmp); + status = __cached_inplace_@DftiCompute_MODE@_@MKL_TYPE@((@MKL_TYPE@*) tmp, dfti_cache); if (status != 0) break; if (multi_iter_masked_next(&mit)) @@ -1184,7 +1172,7 @@ int @name@_mkl_@mode@_in(PyArrayObject* x_inout, npy_intp n, int axis) /* n 
here is the length of the output along the axis */ int @namein@_@nameout@_mkl_irfft_out( - PyArrayObject* x_in, npy_intp n, int axis, PyArrayObject* x_out) + PyArrayObject* x_in, npy_intp n, int axis, PyArrayObject* x_out, DftiCache *dfti_cache) { MKL_LONG status = 0, input_distance = 0, output_distance = 0, input_number_of_transforms = 1; @@ -1250,52 +1238,52 @@ int ); status = __create_descriptor_1d_@DFTI_PRECISION@_DFTI_REAL( - _to_mkl_long(n), 1.0, 1.0/n); + _to_mkl_long(n), 1.0, 1.0/n, dfti_cache); if (status != 0) goto failed; status = __set_descriptor_1d_value_enum( - DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX); + DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX, dfti_cache); if (status != 0) goto failed; status = __set_descriptor_1d_value_longp( - DFTI_INPUT_STRIDES, input_strides); + DFTI_INPUT_STRIDES, input_strides, dfti_cache); if (status != 0) goto failed; status = __set_descriptor_1d_value_enum( - DFTI_PLACEMENT, DFTI_NOT_INPLACE); + DFTI_PLACEMENT, DFTI_NOT_INPLACE, dfti_cache); if (status != 0) goto failed; status = __set_descriptor_1d_value_longp( - DFTI_OUTPUT_STRIDES, output_strides); + DFTI_OUTPUT_STRIDES, output_strides, dfti_cache); if (status != 0) goto failed; if (input_number_of_transforms > 1) { status = __set_descriptor_1d_value_long( - DFTI_NUMBER_OF_TRANSFORMS, input_number_of_transforms); + DFTI_NUMBER_OF_TRANSFORMS, input_number_of_transforms, dfti_cache); if (status != 0) goto failed; status = __set_descriptor_1d_value_long( - DFTI_INPUT_DISTANCE, input_distance); + DFTI_INPUT_DISTANCE, input_distance, dfti_cache); if (status != 0) goto failed; status = __set_descriptor_1d_value_long( - DFTI_OUTPUT_DISTANCE, output_distance); + DFTI_OUTPUT_DISTANCE, output_distance, dfti_cache); if (status != 0) goto failed; } else { assert(input_number_of_transforms == 1); status = __set_descriptor_1d_value_long( - DFTI_NUMBER_OF_TRANSFORMS, input_number_of_transforms); + DFTI_NUMBER_OF_TRANSFORMS, input_number_of_transforms, dfti_cache); if 
(status != 0) goto failed; } - status = __commit_descriptor_1d(); + status = __commit_descriptor_1d(dfti_cache); if (status != 0) goto failed; if (single_DftiCompute){ status = __cached_notinplace_DftiComputeBackward_@MKL_IN_TYPE@_@MKL_OUT_TYPE@( - xin_data, xout_data); + xin_data, xout_data, dfti_cache); if (status != 0) goto failed; } else { multi_iter_masked_t mit; @@ -1323,7 +1311,7 @@ int } status = __cached_notinplace_DftiComputeBackward_@MKL_IN_TYPE@_@MKL_OUT_TYPE@( - (@MKL_IN_TYPE@*) tmp1, (@MKL_OUT_TYPE@*) tmp2); + (@MKL_IN_TYPE@*) tmp1, (@MKL_OUT_TYPE@*) tmp2, dfti_cache); if (status != 0) break; if (multi_iter_masked_next(&mit)) @@ -1358,7 +1346,7 @@ int * #DftiCompute_MODE=(DftiComputeForward)*2,(DftiComputeBackward)*2# */ int @name@_@name@_mkl_@mode@_out( - PyArrayObject* x_in, npy_intp n, int axis, PyArrayObject *x_out) + PyArrayObject* x_in, npy_intp n, int axis, PyArrayObject *x_out, DftiCache *dfti_cache) { MKL_LONG status = 0, input_distance = 0, output_distance = 0, input_number_of_transforms = 1; @@ -1428,55 +1416,55 @@ int @name@_@name@_mkl_@mode@_out( ); status = __create_descriptor_1d_@DFTI_PRECISION@_DFTI_REAL( - _to_mkl_long(n), 1.0, 1.0/n); + _to_mkl_long(n), 1.0, 1.0/n, dfti_cache); if (status != 0) goto failed; status = __set_descriptor_1d_value_enum( - DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_REAL); + DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_REAL, dfti_cache); if (status != 0) goto failed; - status = __set_descriptor_1d_value_enum(DFTI_PACKED_FORMAT, DFTI_PACK_FORMAT); + status = __set_descriptor_1d_value_enum(DFTI_PACKED_FORMAT, DFTI_PACK_FORMAT, dfti_cache); if (status != 0) goto failed; status = __set_descriptor_1d_value_longp( - DFTI_INPUT_STRIDES, input_strides); + DFTI_INPUT_STRIDES, input_strides, dfti_cache); if (status != 0) goto failed; status = __set_descriptor_1d_value_enum( - DFTI_PLACEMENT, DFTI_NOT_INPLACE); + DFTI_PLACEMENT, DFTI_NOT_INPLACE, dfti_cache); if (status != 0) goto failed; status = 
__set_descriptor_1d_value_longp( - DFTI_OUTPUT_STRIDES, output_strides); + DFTI_OUTPUT_STRIDES, output_strides, dfti_cache); if (status != 0) goto failed; if (input_number_of_transforms > 1) { status = __set_descriptor_1d_value_long( - DFTI_NUMBER_OF_TRANSFORMS, input_number_of_transforms); + DFTI_NUMBER_OF_TRANSFORMS, input_number_of_transforms, dfti_cache); if (status != 0) goto failed; status = __set_descriptor_1d_value_long( - DFTI_INPUT_DISTANCE, input_distance); + DFTI_INPUT_DISTANCE, input_distance, dfti_cache); if (status != 0) goto failed; status = __set_descriptor_1d_value_long( - DFTI_OUTPUT_DISTANCE, output_distance); + DFTI_OUTPUT_DISTANCE, output_distance, dfti_cache); if (status != 0) goto failed; } else { assert(input_number_of_transforms == 1); status = __set_descriptor_1d_value_long( - DFTI_NUMBER_OF_TRANSFORMS, input_number_of_transforms); + DFTI_NUMBER_OF_TRANSFORMS, input_number_of_transforms, dfti_cache); if (status != 0) goto failed; } - status = __commit_descriptor_1d(); + status = __commit_descriptor_1d(dfti_cache); if (status != 0) goto failed; if (single_DftiCompute){ status = __cached_notinplace_@DftiCompute_MODE@_@MKL_TYPE@_@MKL_TYPE@( - xin_data, xout_data); + xin_data, xout_data, dfti_cache); if (status != 0) goto failed; } else { multi_iter_masked_t mit; @@ -1504,7 +1492,7 @@ int @name@_@name@_mkl_@mode@_out( } status = __cached_notinplace_@DftiCompute_MODE@_@MKL_TYPE@_@MKL_TYPE@( - (@MKL_TYPE@*) tmp1, (@MKL_TYPE@*) tmp2); + (@MKL_TYPE@*) tmp1, (@MKL_TYPE@*) tmp2, dfti_cache); if (status != 0) break; if (multi_iter_masked_next(&mit)) diff --git a/mkl_fft/src/mklfft.h b/mkl_fft/src/mklfft.h index 45f68aa..e53f66d 100644 --- a/mkl_fft/src/mklfft.h +++ b/mkl_fft/src/mklfft.h @@ -26,41 +26,48 @@ */ #include "mkl.h" +typedef struct DftiCache { + DFTI_DESCRIPTOR_HANDLE hand; + int initialized; +} DftiCache; + +extern int _free_dfti_cache(DftiCache *); + /* Complex input, in-place */ -extern int cdouble_mkl_fft1d_in(PyArrayObject*, 
npy_intp, int); -extern int cfloat_mkl_fft1d_in(PyArrayObject*, npy_intp, int); -extern int cdouble_mkl_ifft1d_in(PyArrayObject*, npy_intp, int); -extern int cfloat_mkl_ifft1d_in(PyArrayObject*, npy_intp, int); +extern int cdouble_mkl_fft1d_in(PyArrayObject*, npy_intp, int, DftiCache*); +extern int cfloat_mkl_fft1d_in(PyArrayObject*, npy_intp, int, DftiCache*); +extern int cdouble_mkl_ifft1d_in(PyArrayObject*, npy_intp, int, DftiCache*); +extern int cfloat_mkl_ifft1d_in(PyArrayObject*, npy_intp, int, DftiCache*); /* Complex input/output, out-of-place */ -extern int cfloat_cfloat_mkl_fft1d_out(PyArrayObject*, npy_intp, int, PyArrayObject*); -extern int cdouble_cdouble_mkl_fft1d_out(PyArrayObject*, npy_intp, int, PyArrayObject*); -extern int cfloat_cfloat_mkl_ifft1d_out(PyArrayObject*, npy_intp, int, PyArrayObject*); -extern int cdouble_cdouble_mkl_ifft1d_out(PyArrayObject*, npy_intp, int, PyArrayObject*); +extern int cfloat_cfloat_mkl_fft1d_out(PyArrayObject*, npy_intp, int, PyArrayObject*, DftiCache*); +extern int cdouble_cdouble_mkl_fft1d_out(PyArrayObject*, npy_intp, int, PyArrayObject*, DftiCache*); +extern int cfloat_cfloat_mkl_ifft1d_out(PyArrayObject*, npy_intp, int, PyArrayObject*, DftiCache*); +extern int cdouble_cdouble_mkl_ifft1d_out(PyArrayObject*, npy_intp, int, PyArrayObject*, DftiCache*); /* Real input, complex output, out-of-place */ -extern int float_cfloat_mkl_fft1d_out(PyArrayObject*, npy_intp, int, PyArrayObject*, int); -extern int double_cdouble_mkl_fft1d_out(PyArrayObject*, npy_intp, int, PyArrayObject*, int); -extern int float_cfloat_mkl_ifft1d_out(PyArrayObject*, npy_intp, int, PyArrayObject*, int); -extern int double_cdouble_mkl_ifft1d_out(PyArrayObject*, npy_intp, int, PyArrayObject*, int); +extern int float_cfloat_mkl_fft1d_out(PyArrayObject*, npy_intp, int, PyArrayObject*, int, DftiCache*); +extern int double_cdouble_mkl_fft1d_out(PyArrayObject*, npy_intp, int, PyArrayObject*, int, DftiCache*); +extern int 
float_cfloat_mkl_ifft1d_out(PyArrayObject*, npy_intp, int, PyArrayObject*, int, DftiCache*); +extern int double_cdouble_mkl_ifft1d_out(PyArrayObject*, npy_intp, int, PyArrayObject*, int, DftiCache*); /* Real input, real output, in-place */ -extern int float_mkl_rfft_in(PyArrayObject*, npy_intp, int); -extern int float_mkl_irfft_in(PyArrayObject*, npy_intp, int); +extern int float_mkl_rfft_in(PyArrayObject*, npy_intp, int, DftiCache*); +extern int float_mkl_irfft_in(PyArrayObject*, npy_intp, int, DftiCache*); -extern int double_mkl_rfft_in(PyArrayObject*, npy_intp, int); -extern int double_mkl_irfft_in(PyArrayObject*, npy_intp, int); +extern int double_mkl_rfft_in(PyArrayObject*, npy_intp, int, DftiCache*); +extern int double_mkl_irfft_in(PyArrayObject*, npy_intp, int, DftiCache*); /* Real input, real output, out-of-place */ -extern int float_float_mkl_rfft_out(PyArrayObject*, npy_intp, int, PyArrayObject*); -extern int float_float_mkl_irfft_out(PyArrayObject*, npy_intp, int, PyArrayObject*); +extern int float_float_mkl_rfft_out(PyArrayObject*, npy_intp, int, PyArrayObject*, DftiCache*); +extern int float_float_mkl_irfft_out(PyArrayObject*, npy_intp, int, PyArrayObject*, DftiCache*); -extern int double_double_mkl_rfft_out(PyArrayObject*, npy_intp, int, PyArrayObject*); -extern int double_double_mkl_irfft_out(PyArrayObject*, npy_intp, int, PyArrayObject*); +extern int double_double_mkl_rfft_out(PyArrayObject*, npy_intp, int, PyArrayObject*, DftiCache*); +extern int double_double_mkl_irfft_out(PyArrayObject*, npy_intp, int, PyArrayObject*, DftiCache*); /* Complex input. 
real output, out-of-place */ -extern int cdouble_double_mkl_irfft_out(PyArrayObject*, npy_intp, int, PyArrayObject*); -extern int cfloat_float_mkl_irfft_out(PyArrayObject*, npy_intp, int, PyArrayObject*); +extern int cdouble_double_mkl_irfft_out(PyArrayObject*, npy_intp, int, PyArrayObject*, DftiCache*); +extern int cfloat_float_mkl_irfft_out(PyArrayObject*, npy_intp, int, PyArrayObject*, DftiCache*); /* Complex, ND, in-place */ extern int cdouble_cdouble_mkl_fftnd_in(PyArrayObject*);