diff --git a/mkl_fft/__init__.py b/mkl_fft/__init__.py
index ccf08e5..5cfada7 100644
--- a/mkl_fft/__init__.py
+++ b/mkl_fft/__init__.py
@@ -25,9 +25,7 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 from . import _init_helper
-
-# pylint: disable=no-name-in-module
-from ._pydfti import (
+from ._mkl_fft import (
     fft,
     fft2,
     fftn,
@@ -37,12 +35,11 @@
     irfft,
     irfft2,
     irfftn,
-    irfftpack,
     rfft,
     rfft2,
     rfftn,
-    rfftpack,
 )
+from ._pydfti import irfftpack, rfftpack  # pylint: disable=no-name-in-module
 from ._version import __version__
 
 import mkl_fft.interfaces  # isort: skip
diff --git a/mkl_fft/_fft_utils.py b/mkl_fft/_fft_utils.py
index e4e9189..8a6189e 100644
--- a/mkl_fft/_fft_utils.py
+++ b/mkl_fft/_fft_utils.py
@@ -26,7 +26,14 @@
 
 import numpy as np
 
-__all__ = ["_check_norm", "_compute_fwd_scale", "_swap_direction"]
+# pylint: disable=no-name-in-module
+from ._pydfti import (
+    _c2c_fft1d_impl,
+    _c2r_fft1d_impl,
+    _direct_fftnd,
+    _r2c_fft1d_impl,
+    _validate_out_array,
+)
 
 
 def _check_norm(norm):
@@ -37,6 +44,26 @@ def _check_norm(norm):
         )
 
 
+def _check_shapes_for_direct(xs, shape, axes):
+    if len(axes) > 7:  # Intel MKL supports up to 7D
+        return False
+    if not (len(xs) == len(shape)):
+        # full-dimensional transform
+        return False
+    if not (len(set(axes)) == len(axes)):
+        # repeated axes
+        return False
+    for xsi, ai in zip(xs, axes):
+        try:
+            sh_ai = shape[ai]
+        except IndexError:
+            raise ValueError("Invalid axis (%d) specified" % ai)
+
+        if not (xsi == sh_ai):
+            return False
+    return True
+
+
 def _compute_fwd_scale(norm, n, shape):
     _check_norm(norm)
     if norm in (None, "backward"):
@@ -51,6 +78,272 @@ def _compute_fwd_scale(norm, n, shape):
         return np.sqrt(fsc)
 
 
+def _cook_nd_args(a, s=None, axes=None, invreal=False):
+    if s is None:
+        shapeless = True
+        if axes is None:
+            s = list(a.shape)
+        else:
+            try:
+                s = [a.shape[i] for i in axes]
+            except IndexError:
+                # fake s designed to trip the ValueError further down
+                s = range(len(axes) + 1)
+                pass
+    else:
+        shapeless = False
+    s = list(s)
+    if axes is None:
+        axes = list(range(-len(s), 0))
+    if len(s) != len(axes):
+        raise ValueError("Shape and axes have different lengths.")
+    if invreal and shapeless:
+        s[-1] = (a.shape[axes[-1]] - 1) * 2
+    return s, axes
+
+
+# copied from scipy.fft module
+# https://github.com/scipy/scipy/blob/main/scipy/fft/_pocketfft/helper.py
+def _datacopied(arr, original):
+    """
+    Strict check for `arr` not sharing any data with `original`,
+    under the assumption that arr = np.asarray(original).
+    """
+    if arr is original:
+        return False
+    if not isinstance(original, np.ndarray) and hasattr(original, "__array__"):
+        return False
+    return arr.base is None
+
+
+def _flat_to_multi(ind, shape):
+    nd = len(shape)
+    m_ind = [-1] * nd
+    j = ind
+    for i in range(nd):
+        si = shape[nd - 1 - i]
+        q = j // si
+        r = j - si * q
+        m_ind[nd - 1 - i] = r
+        j = q
+    return m_ind
+
+
+# copied from scipy.fftpack.helper
+def _init_nd_shape_and_axes(x, shape, axes):
+    """Handle shape and axes arguments for n-dimensional transforms.
+    Returns the shape and axes in a standard form, taking into account negative
+    values and checking for various potential errors.
+    Parameters
+    ----------
+    x : array_like
+        The input array.
+    shape : int or array_like of ints or None
+        The shape of the result.  If both `shape` and `axes` (see below) are
+        None, `shape` is ``x.shape``; if `shape` is None but `axes` is
+        not None, then `shape` is ``scipy.take(x.shape, axes, axis=0)``.
+        If `shape` is -1, the size of the corresponding dimension of `x` is
+        used.
+    axes : int or array_like of ints or None
+        Axes along which the calculation is computed.
+        The default is over all axes.
+        Negative indices are automatically converted to their positive
+        counterpart.
+    Returns
+    -------
+    shape : array
+        The shape of the result. It is a 1D integer array.
+    axes : array
+        The shape of the result. It is a 1D integer array.
+    """
+    x = np.asarray(x)
+    noshape = shape is None
+    noaxes = axes is None
+
+    if noaxes:
+        axes = np.arange(x.ndim, dtype=np.intc)
+    else:
+        axes = np.atleast_1d(axes)
+
+    if axes.size == 0:
+        axes = axes.astype(np.intc)
+
+    if not axes.ndim == 1:
+        raise ValueError("when given, axes values must be a scalar or vector")
+    if not np.issubdtype(axes.dtype, np.integer):
+        raise ValueError("when given, axes values must be integers")
+
+    axes = np.where(axes < 0, axes + x.ndim, axes)
+
+    if axes.size != 0 and (axes.max() >= x.ndim or axes.min() < 0):
+        raise ValueError("axes exceeds dimensionality of input")
+    if axes.size != 0 and np.unique(axes).shape != axes.shape:
+        raise ValueError("all axes must be unique")
+
+    if not noshape:
+        shape = np.atleast_1d(shape)
+    elif np.isscalar(x):
+        shape = np.array([], dtype=np.intc)
+    elif noaxes:
+        shape = np.array(x.shape, dtype=np.intc)
+    else:
+        shape = np.take(x.shape, axes)
+
+    if shape.size == 0:
+        shape = shape.astype(np.intc)
+
+    if shape.ndim != 1:
+        raise ValueError("when given, shape values must be a scalar or vector")
+    if not np.issubdtype(shape.dtype, np.integer):
+        raise ValueError("when given, shape values must be integers")
+    if axes.shape != shape.shape:
+        raise ValueError(
+            "when given, axes and shape arguments"
+            " have to be of the same length"
+        )
+
+    shape = np.where(shape == -1, np.array(x.shape)[axes], shape)
+
+    if shape.size != 0 and (shape < 1).any():
+        raise ValueError(
+            "invalid number of data points ({0}) specified".format(shape)
+        )
+
+    return shape, axes
+
+
+def _iter_complementary(x, axes, func, kwargs, result):
+    if axes is None:
+        # s and axes are None, direct N-D FFT
+        return func(x, **kwargs, out=result)
+    x_shape = x.shape
+    nd = x.ndim
+    r = list(range(nd))
+    sl = [slice(None, None, None)] * nd
+    if not np.iterable(axes):
+        axes = (axes,)
+    for ai in axes:
+        r[ai] = None
+    size = 1
+    sub_shape = []
+    dual_ind = []
+    for ri in r:
+        if ri is not None:
+            size *= x_shape[ri]
+            sub_shape.append(x_shape[ri])
+            dual_ind.append(ri)
+
+    for ind in range(size):
+        m_ind = _flat_to_multi(ind, sub_shape)
+        for k1, k2 in zip(dual_ind, m_ind):
+            sl[k1] = k2
+        if np.issubdtype(x.dtype, np.complexfloating):
+            func(x[tuple(sl)], **kwargs, out=result[tuple(sl)])
+        else:
+            # For c2c FFT, if the input is real, half of the output is the
+            # complex conjugate of the other half. Instead of upcasting the
+            # input to complex and performing c2c FFT, we perform rfft and then
+            # construct the other half using the first half.
+            # However, when using the `out` keyword here I encountered a result
+            # in tests/third_party/scipy/test_basic.py::test_fft_with_order
+            # that was correct but the elements were not necessarily similar to
+            # NumPy. For example, an element in the first half of mkl_fft output
+            # array appeared in the second half of the NumPy output array,
+            # while the equivalent element in the NumPy array was the conjugate
+            # of the mkl_fft output array.
+            np.copyto(result[tuple(sl)], func(x[tuple(sl)], **kwargs))
+
+    return result
+
+
+def _iter_fftnd(
+    a,
+    s=None,
+    axes=None,
+    out=None,
+    direction=+1,
+    overwrite_x=False,
+    scale_function=lambda n, ind: 1.0,
+):
+    a = np.asarray(a)
+    s, axes = _init_nd_shape_and_axes(a, s, axes)
+    ovwr = overwrite_x
+    for ii in reversed(range(len(axes))):
+        a = _c2c_fft1d_impl(
+            a,
+            n=s[ii],
+            axis=axes[ii],
+            overwrite_x=ovwr,
+            direction=direction,
+            fsc=scale_function(s[ii], ii),
+            out=out,
+        )
+        ovwr = True
+    return a
+
+
+def _output_dtype(dt):
+    if dt == np.float64:
+        return np.complex128
+    if dt == np.float32:
+        return np.complex64
+    return dt
+
+
+def _pad_array(arr, s, axes):
+    """Pads array arr with zeros to attain shape s associated with axes"""
+    arr_shape = arr.shape
+    no_padding = True
+    pad_widths = [(0, 0)] * len(arr_shape)
+    for si, ai in zip(s, axes):
+        try:
+            shp_i = arr_shape[ai]
+        except IndexError:
+            raise ValueError("Invalid axis (%d) specified" % ai)
+        if si > shp_i:
+            no_padding = False
+            pad_widths[ai] = (0, si - shp_i)
+    if no_padding:
+        return arr
+    return np.pad(arr, tuple(pad_widths), "constant")
+
+
+def _remove_axis(s, axes, axis_to_remove):
+    lens = len(s)
+    axes_normalized = tuple(lens + ai if ai < 0 else ai for ai in axes)
+    a2r = lens + axis_to_remove if axis_to_remove < 0 else axis_to_remove
+
+    ss = s[:a2r] + s[a2r + 1 :]
+    pivot = axes_normalized[a2r]
+    aa = tuple(
+        ai if ai < pivot else ai - 1 for ai in axes_normalized[:a2r]
+    ) + tuple(ai if ai < pivot else ai - 1 for ai in axes_normalized[a2r + 1 :])
+    return ss, aa
+
+
+def _trim_array(arr, s, axes):
+    """
+    Forms a view into subarray of arr if any element of shape parameter s is
+    smaller than the corresponding element of the shape of the input array arr,
+    otherwise returns the input array.
+    """
+
+    arr_shape = arr.shape
+    no_trim = True
+    ind = [slice(None, None, None)] * len(arr_shape)
+    for si, ai in zip(s, axes):
+        try:
+            shp_i = arr_shape[ai]
+        except IndexError:
+            raise ValueError("Invalid axis (%d) specified" % ai)
+        if si < shp_i:
+            no_trim = False
+            ind[ai] = slice(None, si, None)
+    if no_trim:
+        return arr
+    return arr[tuple(ind)]
+
+
 def _swap_direction(norm):
     _check_norm(norm)
     _swap_direction_map = {
@@ -61,3 +354,161 @@ def _swap_direction(norm):
     }
 
     return _swap_direction_map[norm]
+
+
+def _c2c_fftnd_impl(
+    x,
+    s=None,
+    axes=None,
+    overwrite_x=False,
+    direction=+1,
+    fsc=1.0,
+    out=None,
+):
+    if direction not in [-1, +1]:
+        raise ValueError("Direction of FFT should +1 or -1")
+
+    valid_dtypes = [np.complex64, np.complex128, np.float32, np.float64]
+    # _direct_fftnd requires complex type, and full-dimensional transform
+    if isinstance(x, np.ndarray) and x.size != 0 and x.ndim > 1:
+        _direct = s is None and axes is None
+        if _direct:
+            _direct = x.ndim <= 7  # Intel MKL only supports FFT up to 7D
+        if not _direct:
+            xs, xa = _cook_nd_args(x, s, axes)
+            if _check_shapes_for_direct(xs, x.shape, xa):
+                _direct = True
+        _direct = _direct and x.dtype in valid_dtypes
+    else:
+        _direct = False
+
+    if _direct:
+        return _direct_fftnd(
+            x,
+            overwrite_x=overwrite_x,
+            direction=direction,
+            fsc=fsc,
+            out=out,
+        )
+    else:
+        if s is None and x.dtype in valid_dtypes:
+            x = np.asarray(x)
+            if out is None:
+                res = np.empty_like(x, dtype=_output_dtype(x.dtype))
+            else:
+                _validate_out_array(out, x, _output_dtype(x.dtype))
+                res = out
+
+            return _iter_complementary(
+                x,
+                axes,
+                _direct_fftnd,
+                {
+                    "overwrite_x": overwrite_x,
+                    "direction": direction,
+                    "fsc": fsc,
+                },
+                res,
+            )
+        else:
+            # perform N-D FFT as a series of 1D FFTs
+            return _iter_fftnd(
+                x,
+                s=s,
+                axes=axes,
+                out=out,
+                direction=direction,
+                overwrite_x=overwrite_x,
+                scale_function=lambda n, i: fsc if i == 0 else 1.0,
+            )
+
+
+def _r2c_fftnd_impl(x, s=None, axes=None, fsc=1.0, out=None):
+    a = np.asarray(x)
+    no_trim = (s is None) and (axes is None)
+    s, axes = _cook_nd_args(a, s, axes)
+    la = axes[-1]
+    # trim array, so that rfft avoids doing unnecessary computations
+    if not no_trim:
+        a = _trim_array(a, s, axes)
+    # r2c along last axis
+    a = _r2c_fft1d_impl(a, n=s[-1], axis=la, fsc=fsc, out=out)
+    if len(s) > 1:
+        if not no_trim:
+            ss = list(s)
+            ss[-1] = a.shape[la]
+            a = _pad_array(a, tuple(ss), axes)
+        len_axes = len(axes)
+        if len(set(axes)) == len_axes and len_axes == a.ndim and len_axes > 2:
+            # a series of ND c2c FFTs along last axis
+            ss, aa = _remove_axis(s, axes, -1)
+            ind = [
+                slice(None, None, 1),
+            ] * len(s)
+            for ii in range(a.shape[la]):
+                ind[la] = ii
+                tind = tuple(ind)
+                a_inp = a[tind]
+                res = out[tind] if out is not None else None
+                a_res = _c2c_fftnd_impl(
+                    a_inp, s=ss, axes=aa, overwrite_x=True, direction=1, out=res
+                )
+                if a_res is not a_inp:
+                    a[tind] = a_res  # copy in place
+        else:
+            # a series of 1D c2c FFTs along all axes except last
+            for ii in range(len(axes) - 2, -1, -1):
+                a = _c2c_fft1d_impl(a, s[ii], axes[ii], overwrite_x=True)
+    return a
+
+
+def _c2r_fftnd_impl(x, s=None, axes=None, fsc=1.0, out=None):
+    a = np.asarray(x)
+    no_trim = (s is None) and (axes is None)
+    s, axes = _cook_nd_args(a, s, axes, invreal=True)
+    la = axes[-1]
+    if not no_trim:
+        a = _trim_array(a, s, axes)
+    if len(s) > 1:
+        if not no_trim:
+            a = _pad_array(a, s, axes)
+        ovr_x = True if _datacopied(a, x) else False
+        len_axes = len(axes)
+        if len(set(axes)) == len_axes and len_axes == a.ndim and len_axes > 2:
+            # a series of ND c2c FFTs along last axis
+            # due to need to write into a, we must copy
+            if not ovr_x:
+                a = a.copy()
+                ovr_x = True
+            if not np.issubdtype(a.dtype, np.complexfloating):
+                # complex output will be copied to input, copy is needed
+                if a.dtype == np.float32:
+                    a = a.astype(np.complex64)
+                else:
+                    a = a.astype(np.complex128)
+                ovr_x = True
+            ss, aa = _remove_axis(s, axes, -1)
+            ind = [
+                slice(None, None, 1),
+            ] * len(s)
+            for ii in range(a.shape[la]):
+                ind[la] = ii
+                tind = tuple(ind)
+                a_inp = a[tind]
+                # out has real dtype and cannot be used in intermediate steps
+                a_res = _c2c_fftnd_impl(
+                    a_inp, s=ss, axes=aa, overwrite_x=True, direction=-1
+                )
+                if a_res is not a_inp:
+                    a[tind] = a_res  # copy in place
+        else:
+            # a series of 1D c2c FFTs along all axes except last
+            for ii in range(len(axes) - 1):
+                # out has real dtype and cannot be used in intermediate steps
+                a = _c2c_fft1d_impl(
+                    a, s[ii], axes[ii], overwrite_x=ovr_x, direction=-1
+                )
+                ovr_x = True
+    # c2r along last axis
+    a = _c2r_fft1d_impl(a, n=s[-1], axis=la, fsc=fsc, out=out)
+    return a
diff --git a/mkl_fft/_mkl_fft.py b/mkl_fft/_mkl_fft.py
new file mode 100644
index 0000000..92d0d53
--- /dev/null
+++ b/mkl_fft/_mkl_fft.py
@@ -0,0 +1,129 @@
+#!/usr/bin/env python
+# Copyright (c) 2025, Intel Corporation
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+#     * Redistributions of source code must retain the above copyright notice,
+#       this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in the
+#       documentation and/or other materials provided with the distribution.
+#     * Neither the name of Intel Corporation nor the names of its contributors
+#       may be used to endorse or promote products derived from this software
+#       without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from ._fft_utils import _c2c_fftnd_impl, _c2r_fftnd_impl, _r2c_fftnd_impl
+
+# pylint: disable=no-name-in-module
+from ._pydfti import _c2c_fft1d_impl, _c2r_fft1d_impl, _r2c_fft1d_impl
+
+__all__ = [
+    "fft",
+    "ifft",
+    "fft2",
+    "ifft2",
+    "fftn",
+    "ifftn",
+    "rfft",
+    "irfft",
+    "rfft2",
+    "irfft2",
+    "rfftn",
+    "irfftn",
+]
+
+
+def fft(x, n=None, axis=-1, out=None, overwrite_x=False, fwd_scale=1.0):
+    return _c2c_fft1d_impl(
+        x,
+        n=n,
+        axis=axis,
+        out=out,
+        overwrite_x=overwrite_x,
+        direction=+1,
+        fsc=fwd_scale,
+    )
+
+
+def ifft(x, n=None, axis=-1, out=None, overwrite_x=False, fwd_scale=1.0):
+    return _c2c_fft1d_impl(
+        x,
+        n=n,
+        axis=axis,
+        out=out,
+        overwrite_x=overwrite_x,
+        direction=-1,
+        fsc=fwd_scale,
+    )
+
+
+def fft2(x, s=None, axes=(-2, -1), out=None, overwrite_x=False, fwd_scale=1.0):
+    return fftn(
+        x, s=s, axes=axes, out=out, overwrite_x=overwrite_x, fwd_scale=fwd_scale
+    )
+
+
+def ifft2(x, s=None, axes=(-2, -1), out=None, overwrite_x=False, fwd_scale=1.0):
+    return ifftn(
+        x, s=s, axes=axes, out=out, overwrite_x=overwrite_x, fwd_scale=fwd_scale
+    )
+
+
+def fftn(x, s=None, axes=None, out=None, overwrite_x=False, fwd_scale=1.0):
+    return _c2c_fftnd_impl(
+        x,
+        s=s,
+        axes=axes,
+        out=out,
+        overwrite_x=overwrite_x,
+        direction=+1,
+        fsc=fwd_scale,
+    )
+
+
+def ifftn(x, s=None, axes=None, out=None, overwrite_x=False, fwd_scale=1.0):
+    return _c2c_fftnd_impl(
+        x,
+        s=s,
+        axes=axes,
+        out=out,
+        overwrite_x=overwrite_x,
+        direction=-1,
+        fsc=fwd_scale,
+    )
+
+
+def rfft(x, n=None, axis=-1, out=None, fwd_scale=1.0):
+    return _r2c_fft1d_impl(x, n=n, axis=axis, out=out, fsc=fwd_scale)
+
+
+def irfft(x, n=None, axis=-1, out=None, fwd_scale=1.0):
+    return _c2r_fft1d_impl(x, n=n, axis=axis, out=out, fsc=fwd_scale)
+
+
+def rfft2(x, s=None, axes=(-2, -1), out=None, fwd_scale=1.0):
+    return rfftn(x, s=s, axes=axes, out=out, fwd_scale=fwd_scale)
+
+
+def irfft2(x, s=None, axes=(-2, -1), out=None, fwd_scale=1.0):
+    return irfftn(x, s=s, axes=axes, out=out, fwd_scale=fwd_scale)
+
+
+def rfftn(x, s=None, axes=None, out=None, fwd_scale=1.0):
+    return _r2c_fftnd_impl(x, s=s, axes=axes, out=out, fsc=fwd_scale)
+
+
+def irfftn(x, s=None, axes=None, out=None, fwd_scale=1.0):
+    return _c2r_fftnd_impl(x, s=s, axes=axes, out=out, fsc=fwd_scale)
diff --git a/mkl_fft/_pydfti.pyx b/mkl_fft/_pydfti.pyx
index 82cebee..74f1d69 100644
--- a/mkl_fft/_pydfti.pyx
+++ b/mkl_fft/_pydfti.pyx
@@ -180,31 +180,7 @@ cdef int _datacopied(cnp.ndarray arr, object orig):
     return 1 if (arr_obj.base is None) else 0
 
 
-def fft(x, n=None, axis=-1, overwrite_x=False, fwd_scale=1.0, out=None):
-    return _fft1d_impl(
-        x,
-        n=n,
-        axis=axis,
-        overwrite_x=overwrite_x,
-        direction=+1,
-        fsc=fwd_scale,
-        out=out,
-    )
-
-
-def ifft(x, n=None, axis=-1, overwrite_x=False, fwd_scale=1.0, out=None):
-    return _fft1d_impl(
-        x,
-        n=n,
-        axis=axis,
-        overwrite_x=overwrite_x,
-        direction=-1,
-        fsc=fwd_scale,
-        out=out,
-    )
-
-
-cdef cnp.ndarray pad_array(
+cdef cnp.ndarray _pad_array(
     cnp.ndarray x_arr, cnp.npy_intp n, int axis, int realQ
 ):
     "Internal utility to zero-pad input array along given axis"
@@ -244,7 +220,7 @@ cdef cnp.ndarray pad_array(
     return b_arr
 
 
-cdef cnp.ndarray  __process_arguments(
+cdef cnp.ndarray _process_arguments(
     object x,
     object n,
     object axis,
@@ -312,7 +288,7 @@ cdef cnp.ndarray  __process_arguments(
             err = 1
         elif n_[0] > n_max:
             in_place[0] = 1  # we must copy to pad and will work in-place
-            x_arr = pad_array(x_arr, n_[0], axis_[0], realQ)
+            x_arr = _pad_array(x_arr, n_[0], axis_[0], realQ)
 
     if err:
         raise ValueError(
@@ -323,7 +299,7 @@ cdef cnp.ndarray  __process_arguments(
     return x_arr
 
 
-cdef cnp.ndarray __allocate_result(
+cdef cnp.ndarray _allocate_result(
     cnp.ndarray x_arr, long n_, long axis_, int f_type
 ):
     """
@@ -352,6 +328,20 @@ cdef cnp.ndarray __allocate_result(
     return f_arr
 
 
+cdef int _is_integral(object num):
+    cdef long n
+    cdef int _integral
+    if num is None:
+        return 0
+    try:
+        n = PyInt_AsLong(num)
+        _integral = 1 if n > 0 else 0
+    except:
+        _integral = 0
+
+    return _integral
+
+
 def _get_element_strides(array):
     """Convert byte strides to element strides."""
 
@@ -386,7 +376,7 @@ def _validate_out_array(out, x, out_dtype, axis=None, n=None):
 # float/double inputs are not cast to complex, but are effectively
 # treated as complexes with zero imaginary parts.
 # All other types are cast to complex double.
-def _fft1d_impl(
+def _c2c_fft1d_impl(
     x,
     n=None,
     axis=-1,
@@ -408,8 +398,8 @@ def _fft1d_impl(
     cdef bytes py_error_msg
     cdef DftiCache *_cache
 
-    x_arr = __process_arguments(x, n, axis, overwrite_x, direction,
-                                &axis_, &n_, &in_place, &xnd, &dir_, 0)
+    x_arr = _process_arguments(x, n, axis, overwrite_x, direction,
+                               &axis_, &n_, &in_place, &xnd, &dir_, 0)
 
     x_type = cnp.PyArray_TYPE(x_arr)
 
@@ -483,7 +473,7 @@ def _fft1d_impl(
             f_type = cnp.NPY_CDOUBLE
 
         if out is None:
-            f_arr = __allocate_result(x_arr, n_, axis_, f_type)
+            f_arr = _allocate_result(x_arr, n_, axis_, f_type)
         else:
             out_dtype = np.dtype(cnp.PyArray_DescrFromType(f_type))
             _validate_out_array(out, x, out_dtype, axis=axis_, n=n_)
@@ -493,7 +483,7 @@ def _fft1d_impl(
             if _get_element_strides(x) == _get_element_strides(out):
                 f_arr = <cnp.ndarray> out
             else:
-                f_arr = __allocate_result(x_arr, n_, axis_, f_type)
+                f_arr = _allocate_result(x_arr, n_, axis_, f_type)
 
         # call out-of-place FFT
         _cache_capsule = _tls_dfti_cache_capsule()
@@ -575,448 +565,164 @@ def _fft1d_impl(
             return f_arr
 
 
-def rfftpack(x, n=None, axis=-1, overwrite_x=False, fwd_scale=1.0):
-    """Packed real-valued harmonics of FFT of a real sequence x"""
-    return _rr_fft1d_impl2(
-        x, n=n, axis=axis, overwrite_x=overwrite_x, fsc=fwd_scale
-    )
-
-
-def irfftpack(x, n=None, axis=-1, overwrite_x=False, fwd_scale=1.0):
-    """
-    Inverse FFT of a real sequence, takes packed real-valued harmonics of FFT
-    """
-    return _rr_ifft1d_impl2(
-        x, n=n, axis=axis, overwrite_x=overwrite_x, fsc=fwd_scale
-    )
-
-
-cdef object _rc_to_rr(cnp.ndarray rc_arr, int n, int axis, int xnd, int x_type):
-    cdef object res
-    inp = <object>rc_arr
-
-    slice_ = [slice(None, None, None)] * xnd
-    sl_0 = list(slice_)
-    sl_0[axis] = 0
-
-    sl_1 = list(slice_)
-    sl_1[axis] = 1
-    if (inp.flags["C"] and inp.strides[axis] == inp.itemsize):
-        res = inp
-        res = res.view(
-            dtype=np.single if x_type == cnp.NPY_FLOAT else np.double
-        )
-        res[tuple(sl_1)] = res[tuple(sl_0)]
-
-        slice_[axis] = slice(1, n + 1, None)
-
-        return res[tuple(slice_)]
-    else:
-        res_shape = list(inp.shape)
-        res_shape[axis] = n
-        res = np.empty(
-            tuple(res_shape),
-            dtype = np.single if x_type == cnp.NPY_FLOAT else np.double,
-        )
-
-        res[tuple(sl_0)] = inp[tuple(sl_0)].real
-        sl_dst_real = list(slice_)
-        sl_dst_real[axis] = slice(1, None, 2)
-        sl_src_real = list(slice_)
-        sl_src_real[axis] = slice(1, None, None)
-        res[tuple(sl_dst_real)] = inp[tuple(sl_src_real)].real
-        sl_dst_imag = list(slice_)
-        sl_dst_imag[axis] = slice(2, None, 2)
-        sl_src_imag = list(slice_)
-        sl_src_imag[axis] = slice(
-            1, inp.shape[axis] if (n & 1) else inp.shape[axis] - 1, None
-        )
-        res[tuple(sl_dst_imag)] = inp[tuple(sl_src_imag)].imag
-
-        return res[tuple(slice_)]
-
-cdef object _rr_to_rc(cnp.ndarray rr_arr, int n, int axis, int xnd, int x_type):
-
-    inp = <object> rr_arr
-
-    rc_shape = list(inp.shape)
-    rc_shape[axis] = (n // 2 + 1)
-    rc_shape = tuple(rc_shape)
-
-    rc_dtype = np.cdouble if x_type == cnp.NPY_DOUBLE else np.csingle
-    rc = np.empty(rc_shape, dtype=rc_dtype, order="C")
-
-    slice_ = [slice(None, None, None)] * xnd
-    sl_src_real = list(slice_)
-    sl_src_imag = list(slice_)
-    sl_src_real[axis] = slice(1, n, 2)
-    sl_src_imag[axis] = slice(2, n, 2)
-
-    sl_dest_real = list(slice_)
-    sl_dest_real[axis] = slice(1, None, None)
-    sl_dest_imag = list(slice_)
-    sl_dest_imag[axis] = slice(1, (n+1)//2, None)
-
-    sl_0 = list(slice_)
-    sl_0[axis] = 0
-
-    rc_real = rc.real
-    rc_imag = rc.imag
-
-    rc_real[tuple(sl_dest_real)] = inp[tuple(sl_src_real)]
-    rc_imag[tuple(sl_dest_imag)] = inp[tuple(sl_src_imag)]
-    rc_real[tuple(sl_0)] = inp[tuple(sl_0)]
-    rc_imag[tuple(sl_0)] = 0
-    if (n & 1 == 0):
-        sl_last = list(slice_)
-        sl_last[axis] = -1
-        rc_imag[tuple(sl_last)] = 0
-
-    return rc
-
-
-def _repack_rr_to_rc(x, n, axis):
-    """Debugging utility"""
-    cdef cnp.ndarray x_arr
-    cdef int n_ = n, axis_ = axis
-    cdef x_type
-
-    x_arr = <cnp.ndarray> np.asarray(x)
-    x_type = cnp.PyArray_TYPE(x_arr)
-    return _rr_to_rc(x, n_, axis_, cnp.PyArray_NDIM(x_arr), x_type)
-
-
-def _repack_rc_to_rr(x, n, axis):
-    """Debugging utility"""
-    cdef cnp.ndarray x_arr
-    cdef int n_ = n, axis_ = axis
-    cdef int c_type, x_type
-
-    x_arr = <cnp.ndarray> np.asarray(x)
-    c_type = cnp.PyArray_TYPE(x_arr)
-    x_type = cnp.NPY_DOUBLE if c_type == cnp.NPY_CDOUBLE else cnp.NPY_FLOAT
-    return _rc_to_rr(x, n_, axis_, cnp.PyArray_NDIM(x_arr), x_type)
-
-
-def _rr_fft1d_impl2(x, n=None, axis=-1, overwrite_x=False, double fsc=1.0):
+def _r2c_fft1d_impl(
+    x, n=None, axis=-1, overwrite_x=False, double fsc=1.0, out=None
+):
     """
-    Uses MKL to perform real packed 1D FFT on the input array x
-    along the given axis.
-
-    This done by using rfft and post-processing the result.
-    Thus overwrite_x is effectively discarded.
+    Uses MKL to perform 1D FFT on the real input array x along the given axis,
+    producing complex output, but giving only half of the harmonics.
 
-    Functionally equivalent to scipy.fftpack.rfft
+    cf. numpy.fft.rfft
     """
     cdef cnp.ndarray x_arr "x_arrayObject"
     cdef cnp.ndarray f_arr "f_arrayObject"
     cdef int xnd, in_place, dir_
     cdef long n_, axis_
+    cdef int x_type, f_type, status, requirement
     cdef int HALF_HARMONICS = 0  # give only positive index harmonics
-    cdef int x_type, status, f_type
+    cdef int direction = 1  # dummy, only used for the sake of arg-processing
     cdef char * c_error_msg = NULL
     cdef bytes py_error_msg
     cdef DftiCache *_cache
 
-    x_arr = __process_arguments(x, n, axis, overwrite_x, <object>(+1),
-                                &axis_, &n_, &in_place, &xnd, &dir_, 1)
+    x_arr = _process_arguments(x, n, axis, overwrite_x, direction,
+                               &axis_, &n_, &in_place, &xnd, &dir_, 1)
 
     x_type = cnp.PyArray_TYPE(x_arr)
 
-    if x_type is cnp.NPY_FLOAT or x_type is cnp.NPY_DOUBLE:
-        in_place = 0
-    elif x_type is cnp.NPY_CFLOAT or x_type is cnp.NPY_CDOUBLE:
-        raise TypeError("1st argument must be a real sequence")
+    if (
+        x_type is cnp.NPY_CFLOAT
+        or x_type is cnp.NPY_CDOUBLE
+        or x_type is cnp.NPY_CLONGDOUBLE
+    ):
+        raise TypeError("1st argument must be a real sequence.")
+    elif x_type is cnp.NPY_FLOAT or x_type is cnp.NPY_DOUBLE:
+        pass
     else:
+        # we must cast the input to doubles and allocate the output,
         try:
+            requirement = cnp.NPY_BEHAVED | cnp.NPY_ENSURECOPY
+            if x_type is cnp.NPY_LONGDOUBLE:
+                requirement = requirement | cnp.NPY_FORCECAST
             x_arr = <cnp.ndarray> cnp.PyArray_FROM_OTF(
-                x_arr, cnp.NPY_DOUBLE, cnp.NPY_BEHAVED | cnp.NPY_ENSURECOPY)
+                x_arr, cnp.NPY_DOUBLE, requirement)
+            x_type = cnp.PyArray_TYPE(x_arr)
         except:
-            raise TypeError("1st argument must be a real sequence")
-        x_type = cnp.PyArray_TYPE(x_arr)
-        in_place = 0
+            raise TypeError("1st argument must be a real sequence 2")
 
+    # in_place is ignored here.
+    # it can be done only if 2*(n_ // 2 + 1)  <= x_arr.shape[axis_] which is not
+    # the common usage
     f_type = cnp.NPY_CFLOAT if x_type is cnp.NPY_FLOAT else cnp.NPY_CDOUBLE
-    f_arr = __allocate_result(x_arr, n_ // 2 + 1, axis_, f_type)
+    f_shape = n_ // 2 + 1
+    if out is None:
+        f_arr = _allocate_result(x_arr, f_shape, axis_, f_type)
+    else:
+        out_dtype = np.dtype(cnp.PyArray_DescrFromType(f_type))
+        _validate_out_array(out, x, out_dtype, axis=axis_, n=f_shape)
+        # out array that is used in OneMKL r2c FFT must have comparable strides
+        # with input array. If not, we need to allocate a new array.
+        # For r2c, out and input arrays have different size and strides cannot
+        # be compared directly.
+        # TODO: currently instead of this condition, we check both input
+        # and output to be c_contig or f_contig, relax this condition
+        c_contig = x.flags.c_contiguous and out.flags.c_contiguous
+        f_contig = x.flags.f_contiguous and out.flags.f_contiguous
+        if c_contig or f_contig:
+            f_arr = <cnp.ndarray> out
+        else:
+            f_arr = _allocate_result(x_arr, f_shape, axis_, f_type)
 
-    _cache_capsule = _tls_dfti_cache_capsule()
-    _cache = <DftiCache *>cpython.pycapsule.PyCapsule_GetPointer(
-        _cache_capsule, capsule_name
-    )
-    if x_type is cnp.NPY_DOUBLE:
-        status = double_cdouble_mkl_fft1d_out(
+    # call out-of-place FFT
+    if x_type is cnp.NPY_FLOAT:
+        _cache_capsule = _tls_dfti_cache_capsule()
+        _cache = <DftiCache *>cpython.pycapsule.PyCapsule_GetPointer(
+            _cache_capsule, capsule_name
+        )
+        status = float_cfloat_mkl_fft1d_out(
             x_arr, n_, <int> axis_, f_arr, HALF_HARMONICS, fsc, _cache
         )
     else:
-        status = float_cfloat_mkl_fft1d_out(
+        _cache_capsule = _tls_dfti_cache_capsule()
+        _cache = <DftiCache *>cpython.pycapsule.PyCapsule_GetPointer(
+            _cache_capsule, capsule_name
+        )
+        status = double_cdouble_mkl_fft1d_out(
             x_arr, n_, <int> axis_, f_arr, HALF_HARMONICS, fsc, _cache
         )
 
     if (status):
         c_error_msg = mkl_dfti_error(status)
         py_error_msg = c_error_msg
-        raise ValueError("Internal error occurred: {}".format(py_error_msg))
+        raise ValueError(
+            "Internal error occurred: {}".format(str(py_error_msg))
+        )
 
-    # post-process and return
-    return _rc_to_rr(f_arr, n_, axis_, xnd, x_type)
+    if out is not None and f_arr is not out:
+        out[...] = f_arr
+        return out
+    else:
+        return f_arr
 
 
-def _rr_ifft1d_impl2(x, n=None, axis=-1, overwrite_x=False, double fsc=1.0):
+# this routine is functionally equivalent to numpy.fft.irfft
+def _c2r_fft1d_impl(
+    x, n=None, axis=-1, overwrite_x=False, double fsc=1.0, out=None
+):
     """
-    Uses MKL to perform real packed 1D FFT on the input array x along
-    the given axis.
-
-    This done by using rfft and post-processing the result.
-    Thus overwrite_x is effectively discarded.
+    Uses MKL to perform 1D FFT on the real input array x along the given axis,
+    producing complex output, but giving only half of the harmonics.
 
-    Functionally equivalent to scipy.fftpack.irfft
+    cf. numpy.fft.irfft
     """
     cdef cnp.ndarray x_arr "x_arrayObject"
     cdef cnp.ndarray f_arr "f_arrayObject"
-    cdef int xnd, in_place, dir_
+    cdef int xnd, in_place, dir_, int_n
     cdef long n_, axis_
-    cdef int x_type, rc_type, status
+    cdef int x_type, f_type, status
+    cdef int direction = 1  # dummy, only used for the sake of arg-processing
     cdef char * c_error_msg = NULL
     cdef bytes py_error_msg
     cdef DftiCache *_cache
 
-    x_arr = __process_arguments(x, n, axis, overwrite_x, <object>(-1),
-                                &axis_, &n_, &in_place, &xnd, &dir_, 1)
+    int_n = _is_integral(n)
+    # nn gives the number elements along axis of the input that we use
+    nn = (n // 2 + 1) if int_n and n > 0 else n
+    x_arr = _process_arguments(x, nn, axis, overwrite_x, direction,
+                               &axis_, &n_, &in_place, &xnd, &dir_, 0)
+    n_ = 2*(n_ - 1)
+    if int_n and (n % 2 == 1):
+        n_ += 1
 
     x_type = cnp.PyArray_TYPE(x_arr)
 
-    if x_type is cnp.NPY_FLOAT or x_type is cnp.NPY_DOUBLE:
-        pass
+    if x_type is cnp.NPY_CFLOAT or x_type is cnp.NPY_CDOUBLE:
+        # we can operate in place if requested.
+        if in_place:
+            if not cnp.PyArray_ISONESEGMENT(x_arr):
+                in_place = 0 if internal_overlap(x_arr) else 1
     else:
         # we must cast the input and allocate the output,
         # so we cast to complex double and operate in place
         try:
             x_arr = <cnp.ndarray> cnp.PyArray_FROM_OTF(
-                x_arr, cnp.NPY_DOUBLE, cnp.NPY_BEHAVED | cnp.NPY_ENSURECOPY)
+                x_arr, cnp.NPY_CDOUBLE, cnp.NPY_BEHAVED)
         except:
             raise ValueError(
-                "First argument should be a real "
-                "or a complex sequence of single or double precision"
+                "First argument should be a real or "
+                "a complex sequence of single or double precision"
             )
         x_type = cnp.PyArray_TYPE(x_arr)
         in_place = 1
 
-    # need to convert this into complex array
-    rc_obj = _rr_to_rc(x_arr, n_, axis_, xnd, x_type)
-    rc_arr = <cnp.ndarray> rc_obj
-
-    rc_type = cnp.NPY_CFLOAT if x_type is cnp.NPY_FLOAT else cnp.NPY_CDOUBLE
-    in_place = False
+    in_place = 0
     if in_place:
-        f_arr = x_arr
-    else:
-        f_arr = __allocate_result(x_arr, n_, axis_, x_type)
-
-    # call out-of-place FFT
-    if rc_type is cnp.NPY_CFLOAT:
-        _cache_capsule = _tls_dfti_cache_capsule()
-        _cache = <DftiCache *>cpython.pycapsule.PyCapsule_GetPointer(
-            _cache_capsule, capsule_name
-        )
-        status = cfloat_float_mkl_irfft_out(
-            rc_arr, n_, <int> axis_, f_arr, fsc, _cache
-        )
-    elif rc_type is cnp.NPY_CDOUBLE:
-        _cache_capsule = _tls_dfti_cache_capsule()
-        _cache = <DftiCache *>cpython.pycapsule.PyCapsule_GetPointer(
-            _cache_capsule, capsule_name
-        )
-        status = cdouble_double_mkl_irfft_out(
-            rc_arr, n_, <int> axis_, f_arr, fsc, _cache
-        )
-    else:
-        raise ValueError(
-            "Internal mkl_fft error occurred: Unrecognized rc_type"
-        )
-
-    if (status):
-        c_error_msg = mkl_dfti_error(status)
-        py_error_msg = c_error_msg
-        raise ValueError(
-            "Internal error occurred: {}".format(str(py_error_msg))
-        )
-
-    return f_arr
-
-
-# this routine is functionally equivalent to numpy.fft.rfft
-def _rc_fft1d_impl(
-    x, n=None, axis=-1, overwrite_x=False, double fsc=1.0, out=None
-):
-    """
-    Uses MKL to perform 1D FFT on the real input array x along the given axis,
-    producing complex output, but giving only half of the harmonics.
-
-    cf. numpy.fft.rfft
-    """
-    cdef cnp.ndarray x_arr "x_arrayObject"
-    cdef cnp.ndarray f_arr "f_arrayObject"
-    cdef int xnd, in_place, dir_
-    cdef long n_, axis_
-    cdef int x_type, f_type, status, requirement
-    cdef int HALF_HARMONICS = 0  # give only positive index harmonics
-    cdef int direction = 1  # dummy, only used for the sake of arg-processing
-    cdef char * c_error_msg = NULL
-    cdef bytes py_error_msg
-    cdef DftiCache *_cache
-
-    x_arr = __process_arguments(x, n, axis, overwrite_x, direction,
-                                &axis_, &n_, &in_place, &xnd, &dir_, 1)
-
-    x_type = cnp.PyArray_TYPE(x_arr)
-
-    if (
-        x_type is cnp.NPY_CFLOAT
-        or x_type is cnp.NPY_CDOUBLE
-        or x_type is cnp.NPY_CLONGDOUBLE
-    ):
-        raise TypeError("1st argument must be a real sequence.")
-    elif x_type is cnp.NPY_FLOAT or x_type is cnp.NPY_DOUBLE:
-        pass
-    else:
-        # we must cast the input to doubles and allocate the output,
-        try:
-            requirement = cnp.NPY_BEHAVED | cnp.NPY_ENSURECOPY
-            if x_type is cnp.NPY_LONGDOUBLE:
-                requirement = requirement | cnp.NPY_FORCECAST
-            x_arr = <cnp.ndarray> cnp.PyArray_FROM_OTF(
-                x_arr, cnp.NPY_DOUBLE, requirement)
-            x_type = cnp.PyArray_TYPE(x_arr)
-        except:
-            raise TypeError("1st argument must be a real sequence 2")
-
-    # in_place is ignored here.
-    # it can be done only if 2*(n_ // 2 + 1)  <= x_arr.shape[axis_] which is not
-    # the common usage
-    f_type = cnp.NPY_CFLOAT if x_type is cnp.NPY_FLOAT else cnp.NPY_CDOUBLE
-    f_shape = n_ // 2 + 1
-    if out is None:
-        f_arr = __allocate_result(x_arr, f_shape, axis_, f_type)
-    else:
-        out_dtype = np.dtype(cnp.PyArray_DescrFromType(f_type))
-        _validate_out_array(out, x, out_dtype, axis=axis_, n=f_shape)
-        # out array that is used in OneMKL r2c FFT must have comparable strides
-        # with input array. If not, we need to allocate a new array.
-        # For r2c, out and input arrays have different size and strides cannot
-        # be compared directly.
-        # TODO: currently instead of this condition, we check both input
-        # and output to be c_contig or f_contig, relax this condition
-        c_contig = x.flags.c_contiguous and out.flags.c_contiguous
-        f_contig = x.flags.f_contiguous and out.flags.f_contiguous
-        if c_contig or f_contig:
-            f_arr = <cnp.ndarray> out
-        else:
-            f_arr = __allocate_result(x_arr, f_shape, axis_, f_type)
-
-    # call out-of-place FFT
-    if x_type is cnp.NPY_FLOAT:
-        _cache_capsule = _tls_dfti_cache_capsule()
-        _cache = <DftiCache *>cpython.pycapsule.PyCapsule_GetPointer(
-            _cache_capsule, capsule_name
-        )
-        status = float_cfloat_mkl_fft1d_out(
-            x_arr, n_, <int> axis_, f_arr, HALF_HARMONICS, fsc, _cache
-        )
-    else:
-        _cache_capsule = _tls_dfti_cache_capsule()
-        _cache = <DftiCache *>cpython.pycapsule.PyCapsule_GetPointer(
-            _cache_capsule, capsule_name
-        )
-        status = double_cdouble_mkl_fft1d_out(
-            x_arr, n_, <int> axis_, f_arr, HALF_HARMONICS, fsc, _cache
-        )
-
-    if (status):
-        c_error_msg = mkl_dfti_error(status)
-        py_error_msg = c_error_msg
-        raise ValueError(
-            "Internal error occurred: {}".format(str(py_error_msg))
-        )
-
-    if out is not None and f_arr is not out:
-        out[...] = f_arr
-        return out
-    else:
-        return f_arr
-
-
-cdef int _is_integral(object num):
-    cdef long n
-    cdef int _integral
-    if num is None:
-        return 0
-    try:
-        n = PyInt_AsLong(num)
-        _integral = 1 if n > 0 else 0
-    except:
-        _integral = 0
-
-    return _integral
-
-
-# this routine is functionally equivalent to numpy.fft.irfft
-def _rc_ifft1d_impl(
-    x, n=None, axis=-1, overwrite_x=False, double fsc=1.0, out=None
-):
-    """
-    Uses MKL to perform 1D FFT on the real input array x along the given axis,
-    producing complex output, but giving only half of the harmonics.
-
-    cf. numpy.fft.irfft
-    """
-    cdef cnp.ndarray x_arr "x_arrayObject"
-    cdef cnp.ndarray f_arr "f_arrayObject"
-    cdef int xnd, in_place, dir_, int_n
-    cdef long n_, axis_
-    cdef int x_type, f_type, status
-    cdef int direction = 1  # dummy, only used for the sake of arg-processing
-    cdef char * c_error_msg = NULL
-    cdef bytes py_error_msg
-    cdef DftiCache *_cache
-
-    int_n = _is_integral(n)
-    # nn gives the number elements along axis of the input that we use
-    nn = (n // 2 + 1) if int_n and n > 0 else n
-    x_arr = __process_arguments(x, nn, axis, overwrite_x, direction,
-                                &axis_, &n_, &in_place, &xnd, &dir_, 0)
-    n_ = 2*(n_ - 1)
-    if int_n and (n % 2 == 1):
-        n_ += 1
-
-    x_type = cnp.PyArray_TYPE(x_arr)
-
-    if x_type is cnp.NPY_CFLOAT or x_type is cnp.NPY_CDOUBLE:
-        # we can operate in place if requested.
-        if in_place:
-            if not cnp.PyArray_ISONESEGMENT(x_arr):
-                in_place = 0 if internal_overlap(x_arr) else 1
-    else:
-        # we must cast the input and allocate the output,
-        # so we cast to complex double and operate in place
-        try:
-            x_arr = <cnp.ndarray> cnp.PyArray_FROM_OTF(
-                x_arr, cnp.NPY_CDOUBLE, cnp.NPY_BEHAVED)
-        except:
-            raise ValueError(
-                "First argument should be a real or "
-                "a complex sequence of single or double precision"
-            )
-        x_type = cnp.PyArray_TYPE(x_arr)
-        in_place = 1
-
-    in_place = 0
-    if in_place:
-        # TODO: Provide in-place functionality
-        pass
+        # TODO: Provide in-place functionality
+        pass
     else:
         f_type = cnp.NPY_FLOAT if x_type is cnp.NPY_CFLOAT else cnp.NPY_DOUBLE
         if out is None:
-            f_arr = __allocate_result(x_arr, n_, axis_, f_type)
+            f_arr = _allocate_result(x_arr, n_, axis_, f_type)
         else:
             out_dtype = np.dtype(cnp.PyArray_DescrFromType(f_type))
             _validate_out_array(out, x, out_dtype, axis=axis_, n=n_)
@@ -1031,7 +737,7 @@ def _rc_ifft1d_impl(
             if c_contig or f_contig:
                 f_arr = <cnp.ndarray> out
             else:
-                f_arr = __allocate_result(x_arr, n_, axis_, f_type)
+                f_arr = _allocate_result(x_arr, n_, axis_, f_type)
 
         # call out-of-place FFT
         if x_type is cnp.NPY_CFLOAT:
@@ -1065,203 +771,6 @@ def _rc_ifft1d_impl(
             return f_arr
 
 
-def rfft(x, n=None, axis=-1, fwd_scale=1.0, out=None):
-    return _rc_fft1d_impl(x, n=n, axis=axis, fsc=fwd_scale, out=out)
-
-
-def irfft(x, n=None, axis=-1, fwd_scale=1.0, out=None):
-    return _rc_ifft1d_impl(x, n=n, axis=axis, fsc=fwd_scale, out=out)
-
-
-# ============================== ND ====================================== #
-
-# copied from scipy.fftpack.helper
-def _init_nd_shape_and_axes(x, shape, axes):
-    """Handle shape and axes arguments for n-dimensional transforms.
-    Returns the shape and axes in a standard form, taking into account negative
-    values and checking for various potential errors.
-    Parameters
-    ----------
-    x : array_like
-        The input array.
-    shape : int or array_like of ints or None
-        The shape of the result.  If both `shape` and `axes` (see below) are
-        None, `shape` is ``x.shape``; if `shape` is None but `axes` is
-        not None, then `shape` is ``scipy.take(x.shape, axes, axis=0)``.
-        If `shape` is -1, the size of the corresponding dimension of `x` is
-        used.
-    axes : int or array_like of ints or None
-        Axes along which the calculation is computed.
-        The default is over all axes.
-        Negative indices are automatically converted to their positive
-        counterpart.
-    Returns
-    -------
-    shape : array
-        The shape of the result. It is a 1D integer array.
-    axes : array
-        The shape of the result. It is a 1D integer array.
-    """
-    x = np.asarray(x)
-    noshape = shape is None
-    noaxes = axes is None
-
-    if noaxes:
-        axes = np.arange(x.ndim, dtype=np.intc)
-    else:
-        axes = np.atleast_1d(axes)
-
-    if axes.size == 0:
-        axes = axes.astype(np.intc)
-
-    if not axes.ndim == 1:
-        raise ValueError("when given, axes values must be a scalar or vector")
-    if not np.issubdtype(axes.dtype, np.integer):
-        raise ValueError("when given, axes values must be integers")
-
-    axes = np.where(axes < 0, axes + x.ndim, axes)
-
-    if axes.size != 0 and (axes.max() >= x.ndim or axes.min() < 0):
-        raise ValueError("axes exceeds dimensionality of input")
-    if axes.size != 0 and np.unique(axes).shape != axes.shape:
-        raise ValueError("all axes must be unique")
-
-    if not noshape:
-        shape = np.atleast_1d(shape)
-    elif np.isscalar(x):
-        shape = np.array([], dtype=np.intc)
-    elif noaxes:
-        shape = np.array(x.shape, dtype=np.intc)
-    else:
-        shape = np.take(x.shape, axes)
-
-    if shape.size == 0:
-        shape = shape.astype(np.intc)
-
-    if shape.ndim != 1:
-        raise ValueError("when given, shape values must be a scalar or vector")
-    if not np.issubdtype(shape.dtype, np.integer):
-        raise ValueError("when given, shape values must be integers")
-    if axes.shape != shape.shape:
-        raise ValueError("when given, axes and shape arguments"
-                         " have to be of the same length")
-
-    shape = np.where(shape == -1, np.array(x.shape)[axes], shape)
-
-    if shape.size != 0 and (shape < 1).any():
-        raise ValueError(
-            "invalid number of data points ({0}) specified".format(shape))
-
-    return shape, axes
-
-
-def _cook_nd_args(a, s=None, axes=None, invreal=False):
-    if s is None:
-        shapeless = True
-        if axes is None:
-            s = list(a.shape)
-        else:
-            try:
-                s = [a.shape[i] for i in axes]
-            except IndexError:
-                # fake s designed to trip the ValueError further down
-                s = range(len(axes) + 1)
-                pass
-    else:
-        shapeless = False
-    s = list(s)
-    if axes is None:
-        axes = list(range(-len(s), 0))
-    if len(s) != len(axes):
-        raise ValueError("Shape and axes have different lengths.")
-    if invreal and shapeless:
-        s[-1] = (a.shape[axes[-1]] - 1) * 2
-    return s, axes
-
-
-def _iter_fftnd(
-    a,
-    s=None,
-    axes=None,
-    function=fft,
-    overwrite_x=False,
-    scale_function=lambda n,
-    ind: 1.0,
-    out=None,
-):
-    a = np.asarray(a)
-    s, axes = _init_nd_shape_and_axes(a, s, axes)
-    ovwr = overwrite_x
-    for ii in reversed(range(len(axes))):
-        a = function(
-            a,
-            n = s[ii],
-            axis = axes[ii],
-            overwrite_x=ovwr,
-            fwd_scale=scale_function(s[ii], ii),
-            out=out,
-        )
-        ovwr = True
-    return a
-
-
-def flat_to_multi(ind, shape):
-    nd = len(shape)
-    m_ind = [-1] * nd
-    j = ind
-    for i in range(nd):
-        si = shape[nd-1-i]
-        q = j // si
-        r = j - si * q
-        m_ind[nd-1-i] = r
-        j = q
-    return m_ind
-
-
-def iter_complementary(x, axes, func, kwargs, result):
-    if axes is None:
-        # s and axes are None, direct N-D FFT
-        return func(x, **kwargs, out=result)
-    x_shape = x.shape
-    nd = x.ndim
-    r = list(range(nd))
-    sl = [slice(None, None, None)] * nd
-    if not np.iterable(axes):
-        axes = (axes,)
-    for ai in axes:
-        r[ai] = None
-    size = 1
-    sub_shape = []
-    dual_ind = []
-    for ri in r:
-        if ri is not None:
-            size *= x_shape[ri]
-            sub_shape.append(x_shape[ri])
-            dual_ind.append(ri)
-
-    for ind in range(size):
-        m_ind = flat_to_multi(ind, sub_shape)
-        for k1, k2 in zip(dual_ind, m_ind):
-            sl[k1] = k2
-        if np.issubdtype(x.dtype, np.complexfloating):
-            func(x[tuple(sl)], **kwargs, out=result[tuple(sl)])
-        else:
-            # For c2c FFT, if the input is real, half of the output is the
-            # complex conjugate of the other half. Instead of upcasting the
-            # input to complex and performing c2c FFT, we perform rfft and then
-            # construct the other half using the first half.
-            # However, when using the `out` keyword here I encountered a result
-            # in tests/third_party/scipy/test_basic.py::test_fft_with_order
-            # that was correct but the elements were not necessarily similar to
-            # NumPy. For example, an element in the first half of mkl_fft output
-            # array appeared in the second half of the NumPy output array,
-            # while the equivalent element in the NumPy array was the conjugate
-            # of the mkl_fft output array.
-            np.copyto(result[tuple(sl)], func(x[tuple(sl)], **kwargs))
-
-    return result
-
-
 def _direct_fftnd(
     x, overwrite_x=False, direction=+1, double fsc=1.0, out=None
 ):
@@ -1335,7 +844,7 @@ def _direct_fftnd(
         else:
             f_type = cnp.NPY_CFLOAT
         if out is None:
-            f_arr = __allocate_result(x_arr, -1, 0, f_type)
+            f_arr = _allocate_result(x_arr, -1, 0, f_type)
         else:
             out_dtype = np.dtype(cnp.PyArray_DescrFromType(f_type))
             _validate_out_array(out, x, out_dtype)
@@ -1345,7 +854,7 @@ def _direct_fftnd(
             if _get_element_strides(x) == _get_element_strides(out):
                 f_arr = <cnp.ndarray> out
             else:
-                f_arr = __allocate_result(x_arr, -1, 0, f_type)
+                f_arr = _allocate_result(x_arr, -1, 0, f_type)
 
         if x_type == cnp.NPY_CDOUBLE:
             if dir_ == 1:
@@ -1377,285 +886,247 @@ def _direct_fftnd(
             return f_arr
 
 
-def _check_shapes_for_direct(xs, shape, axes):
-    if len(axes) > 7:  # Intel MKL supports up to 7D
-        return False
-    if not (len(xs) == len(shape)):
-        # full-dimensional transform
-        return False
-    if not (len(set(axes)) == len(axes)):
-        # repeated axes
-        return False
-    for xsi, ai in zip(xs, axes):
-        try:
-            sh_ai = shape[ai]
-        except IndexError:
-            raise ValueError("Invalid axis (%d) specified" % ai)
+# ========================= deprecated functions ==============================
+cdef object _rc_to_rr(cnp.ndarray rc_arr, int n, int axis, int xnd, int x_type):
+    cdef object res
+    inp = <object>rc_arr
 
-        if not (xsi == sh_ai):
-            return False
-    return True
+    slice_ = [slice(None, None, None)] * xnd
+    sl_0 = list(slice_)
+    sl_0[axis] = 0
 
+    sl_1 = list(slice_)
+    sl_1[axis] = 1
+    if (inp.flags["C"] and inp.strides[axis] == inp.itemsize):
+        res = inp
+        res = res.view(
+            dtype=np.single if x_type == cnp.NPY_FLOAT else np.double
+        )
+        res[tuple(sl_1)] = res[tuple(sl_0)]
 
-def _output_dtype(dt):
-    if dt == np.double:
-        return np.cdouble
-    if dt == np.single:
-        return np.csingle
-    return dt
+        slice_[axis] = slice(1, n + 1, None)
 
+        return res[tuple(slice_)]
+    else:
+        res_shape = list(inp.shape)
+        res_shape[axis] = n
+        res = np.empty(
+            tuple(res_shape),
+            dtype = np.single if x_type == cnp.NPY_FLOAT else np.double,
+        )
 
-def _fftnd_impl(
-    x,
-    s=None,
-    axes=None,
-    overwrite_x=False,
-    direction=+1,
-    double fsc=1.0,
-    out=None,
-):
-    if direction not in [-1, +1]:
-        raise ValueError("Direction of FFT should +1 or -1")
+        res[tuple(sl_0)] = inp[tuple(sl_0)].real
+        sl_dst_real = list(slice_)
+        sl_dst_real[axis] = slice(1, None, 2)
+        sl_src_real = list(slice_)
+        sl_src_real[axis] = slice(1, None, None)
+        res[tuple(sl_dst_real)] = inp[tuple(sl_src_real)].real
+        sl_dst_imag = list(slice_)
+        sl_dst_imag[axis] = slice(2, None, 2)
+        sl_src_imag = list(slice_)
+        sl_src_imag[axis] = slice(
+            1, inp.shape[axis] if (n & 1) else inp.shape[axis] - 1, None
+        )
+        res[tuple(sl_dst_imag)] = inp[tuple(sl_src_imag)].imag
+
+        return res[tuple(slice_)]
+
+
+cdef object _rr_to_rc(cnp.ndarray rr_arr, int n, int axis, int xnd, int x_type):
+
+    inp = <object> rr_arr
+
+    rc_shape = list(inp.shape)
+    rc_shape[axis] = (n // 2 + 1)
+    rc_shape = tuple(rc_shape)
+
+    rc_dtype = np.cdouble if x_type == cnp.NPY_DOUBLE else np.csingle
+    rc = np.empty(rc_shape, dtype=rc_dtype, order="C")
+
+    slice_ = [slice(None, None, None)] * xnd
+    sl_src_real = list(slice_)
+    sl_src_imag = list(slice_)
+    sl_src_real[axis] = slice(1, n, 2)
+    sl_src_imag[axis] = slice(2, n, 2)
+
+    sl_dest_real = list(slice_)
+    sl_dest_real[axis] = slice(1, None, None)
+    sl_dest_imag = list(slice_)
+    sl_dest_imag[axis] = slice(1, (n+1)//2, None)
+
+    sl_0 = list(slice_)
+    sl_0[axis] = 0
+
+    rc_real = rc.real
+    rc_imag = rc.imag
+
+    rc_real[tuple(sl_dest_real)] = inp[tuple(sl_src_real)]
+    rc_imag[tuple(sl_dest_imag)] = inp[tuple(sl_src_imag)]
+    rc_real[tuple(sl_0)] = inp[tuple(sl_0)]
+    rc_imag[tuple(sl_0)] = 0
+    if (n & 1 == 0):
+        sl_last = list(slice_)
+        sl_last[axis] = -1
+        rc_imag[tuple(sl_last)] = 0
+
+    return rc
+
+
+def _rr_fft1d_impl(x, n=None, axis=-1, overwrite_x=False, double fsc=1.0):
+    """
+    Uses MKL to perform real packed 1D FFT on the input array x
+    along the given axis.
+
+    This done by using rfft and post-processing the result.
+    Thus overwrite_x is effectively discarded.
+
+    Functionally equivalent to scipy.fftpack.rfft
+    """
+    cdef cnp.ndarray x_arr "x_arrayObject"
+    cdef cnp.ndarray f_arr "f_arrayObject"
+    cdef int xnd, in_place, dir_
+    cdef long n_, axis_
+    cdef int HALF_HARMONICS = 0  # give only positive index harmonics
+    cdef int x_type, status, f_type
+    cdef char * c_error_msg = NULL
+    cdef bytes py_error_msg
+    cdef DftiCache *_cache
+
+    x_arr = _process_arguments(x, n, axis, overwrite_x, <object>(+1),
+                               &axis_, &n_, &in_place, &xnd, &dir_, 1)
+
+    x_type = cnp.PyArray_TYPE(x_arr)
 
-    valid_dtypes = [np.complex64, np.complex128, np.float32, np.float64]
-    # _direct_fftnd requires complex type, and full-dimensional transform
-    if isinstance(x, np.ndarray) and x.size != 0 and x.ndim > 1:
-        _direct = s is None and axes is None
-        if _direct:
-            _direct = x.ndim <= 7  # Intel MKL only supports FFT up to 7D
-        if not _direct:
-            xs, xa = _cook_nd_args(x, s, axes)
-            if _check_shapes_for_direct(xs, x.shape, xa):
-                _direct = True
-        _direct = _direct and x.dtype in valid_dtypes
+    if x_type is cnp.NPY_FLOAT or x_type is cnp.NPY_DOUBLE:
+        in_place = 0
+    elif x_type is cnp.NPY_CFLOAT or x_type is cnp.NPY_CDOUBLE:
+        raise TypeError("1st argument must be a real sequence")
     else:
-        _direct = False
-
-    if _direct:
-        return _direct_fftnd(
-            x,
-            overwrite_x=overwrite_x,
-            direction=direction,
-            fsc=fsc,
-            out=out,
+        try:
+            x_arr = <cnp.ndarray> cnp.PyArray_FROM_OTF(
+                x_arr, cnp.NPY_DOUBLE, cnp.NPY_BEHAVED | cnp.NPY_ENSURECOPY)
+        except:
+            raise TypeError("1st argument must be a real sequence")
+        x_type = cnp.PyArray_TYPE(x_arr)
+        in_place = 0
+
+    f_type = cnp.NPY_CFLOAT if x_type is cnp.NPY_FLOAT else cnp.NPY_CDOUBLE
+    f_arr = _allocate_result(x_arr, n_ // 2 + 1, axis_, f_type)
+
+    _cache_capsule = _tls_dfti_cache_capsule()
+    _cache = <DftiCache *>cpython.pycapsule.PyCapsule_GetPointer(
+        _cache_capsule, capsule_name
+    )
+    if x_type is cnp.NPY_DOUBLE:
+        status = double_cdouble_mkl_fft1d_out(
+            x_arr, n_, <int> axis_, f_arr, HALF_HARMONICS, fsc, _cache
         )
     else:
-        if s is None and x.dtype in valid_dtypes:
-            x = np.asarray(x)
-            if out is None:
-                res = np.empty_like(x, dtype=_output_dtype(x.dtype))
-            else:
-                _validate_out_array(out, x, _output_dtype(x.dtype))
-                res = out
-
-            return iter_complementary(
-                x, axes,
-                _direct_fftnd,
-                {
-                    "overwrite_x": overwrite_x,
-                    "direction": direction,
-                    "fsc": fsc
-                },
-                res
-            )
-        else:
-            # perform N-D FFT as a series of 1D FFTs
-            sc = <object> fsc
-            return _iter_fftnd(x, s=s, axes=axes,
-                               overwrite_x=overwrite_x,
-                               scale_function=lambda n, i: sc if i == 0 else 1.,
-                               function=fft if direction == 1 else ifft,
-                               out=out)
-
-
-def fft2(x, s=None, axes=(-2, -1), overwrite_x=False, fwd_scale=1.0, out=None):
-    return _fftnd_impl(
-        x,
-        s=s,
-        axes=axes,
-        overwrite_x=overwrite_x,
-        direction=+1,
-        fsc=fwd_scale,
-        out=out,
-    )
+        status = float_cfloat_mkl_fft1d_out(
+            x_arr, n_, <int> axis_, f_arr, HALF_HARMONICS, fsc, _cache
+        )
 
+    if (status):
+        c_error_msg = mkl_dfti_error(status)
+        py_error_msg = c_error_msg
+        raise ValueError("Internal error occurred: {}".format(py_error_msg))
 
-def ifft2(x, s=None, axes=(-2, -1), overwrite_x=False, fwd_scale=1.0, out=None):
-    return _fftnd_impl(
-        x,
-        s=s,
-        axes=axes,
-        overwrite_x=overwrite_x,
-        direction=-1,
-        fsc=fwd_scale,
-        out=out,
-    )
+    # post-process and return
+    return _rc_to_rr(f_arr, n_, axis_, xnd, x_type)
 
 
-def fftn(x, s=None, axes=None, overwrite_x=False, fwd_scale=1.0, out=None):
-    return _fftnd_impl(
-        x,
-        s=s,
-        axes=axes,
-        overwrite_x=overwrite_x,
-        direction=+1,
-        fsc=fwd_scale,
-        out=out,
-    )
+def _rr_ifft1d_impl(x, n=None, axis=-1, overwrite_x=False, double fsc=1.0):
+    """
+    Uses MKL to perform real packed 1D FFT on the input array x along
+    the given axis.
 
+    This done by using rfft and post-processing the result.
+    Thus overwrite_x is effectively discarded.
 
-def ifftn(x, s=None, axes=None, overwrite_x=False, fwd_scale=1.0, out=None):
-    return _fftnd_impl(
-        x,
-        s=s,
-        axes=axes,
-        overwrite_x=overwrite_x,
-        direction=-1,
-        fsc=fwd_scale,
-        out=out,
-    )
+    Functionally equivalent to scipy.fftpack.irfft
+    """
+    cdef cnp.ndarray x_arr "x_arrayObject"
+    cdef cnp.ndarray f_arr "f_arrayObject"
+    cdef int xnd, in_place, dir_
+    cdef long n_, axis_
+    cdef int x_type, rc_type, status
+    cdef char * c_error_msg = NULL
+    cdef bytes py_error_msg
+    cdef DftiCache *_cache
+
+    x_arr = _process_arguments(x, n, axis, overwrite_x, <object>(-1),
+                               &axis_, &n_, &in_place, &xnd, &dir_, 1)
+
+    x_type = cnp.PyArray_TYPE(x_arr)
+
+    if x_type is cnp.NPY_FLOAT or x_type is cnp.NPY_DOUBLE:
+        pass
+    else:
+        # we must cast the input and allocate the output,
+        # so we cast to complex double and operate in place
+        try:
+            x_arr = <cnp.ndarray> cnp.PyArray_FROM_OTF(
+                x_arr, cnp.NPY_DOUBLE, cnp.NPY_BEHAVED | cnp.NPY_ENSURECOPY)
+        except:
+            raise ValueError(
+                "First argument should be a real "
+                "or a complex sequence of single or double precision"
+            )
+        x_type = cnp.PyArray_TYPE(x_arr)
+        in_place = 1
 
+    # need to convert this into complex array
+    rc_obj = _rr_to_rc(x_arr, n_, axis_, xnd, x_type)
+    rc_arr = <cnp.ndarray> rc_obj
 
-def rfft2(x, s=None, axes=(-2, -1), fwd_scale=1.0, out=None):
-    return rfftn(x, s=s, axes=axes, fwd_scale=fwd_scale, out=out)
+    rc_type = cnp.NPY_CFLOAT if x_type is cnp.NPY_FLOAT else cnp.NPY_CDOUBLE
+    in_place = False
+    if in_place:
+        f_arr = x_arr
+    else:
+        f_arr = _allocate_result(x_arr, n_, axis_, x_type)
 
+    # call out-of-place FFT
+    if rc_type is cnp.NPY_CFLOAT:
+        _cache_capsule = _tls_dfti_cache_capsule()
+        _cache = <DftiCache *>cpython.pycapsule.PyCapsule_GetPointer(
+            _cache_capsule, capsule_name
+        )
+        status = cfloat_float_mkl_irfft_out(
+            rc_arr, n_, <int> axis_, f_arr, fsc, _cache
+        )
+    elif rc_type is cnp.NPY_CDOUBLE:
+        _cache_capsule = _tls_dfti_cache_capsule()
+        _cache = <DftiCache *>cpython.pycapsule.PyCapsule_GetPointer(
+            _cache_capsule, capsule_name
+        )
+        status = cdouble_double_mkl_irfft_out(
+            rc_arr, n_, <int> axis_, f_arr, fsc, _cache
+        )
+    else:
+        raise ValueError(
+            "Internal mkl_fft error occurred: Unrecognized rc_type"
+        )
 
-def irfft2(x, s=None, axes=(-2, -1), fwd_scale=1.0, out=None):
-    return irfftn(x, s=s, axes=axes, fwd_scale=fwd_scale, out=out)
+    if (status):
+        c_error_msg = mkl_dfti_error(status)
+        py_error_msg = c_error_msg
+        raise ValueError(
+            "Internal error occurred: {}".format(str(py_error_msg))
+        )
 
+    return f_arr
 
-def _remove_axis(s, axes, axis_to_remove):
-    lens = len(s)
-    axes_normalized = tuple(lens + ai if ai < 0 else ai for ai in axes)
-    a2r = lens + axis_to_remove if axis_to_remove < 0 else axis_to_remove
 
-    ss = s[:a2r] + s[a2r+1:]
-    pivot = axes_normalized[a2r]
-    aa = tuple(ai if ai < pivot else ai - 1 for ai in axes_normalized[:a2r]) + \
-        tuple(ai if ai < pivot else ai - 1 for ai in axes_normalized[a2r+1:])
-    return ss, aa
+def rfftpack(x, n=None, axis=-1, overwrite_x=False, fwd_scale=1.0):
+    """Packed real-valued harmonics of FFT of a real sequence x"""
+    return _rr_fft1d_impl(
+        x, n=n, axis=axis, overwrite_x=overwrite_x, fsc=fwd_scale
+    )
 
 
-cdef cnp.ndarray _trim_array(cnp.ndarray arr, object s, object axes):
-    """Forms a view into subarray of arr if any element of shape parameter s is
-    smaller than the corresponding element of the shape of the input array arr,
-    otherwise returns the input array"""
-    arr_shape = (<object> arr).shape
-    no_trim = True
-    for si, ai in zip(s, axes):
-        try:
-            shp_i = arr_shape[ai]
-        except IndexError:
-            raise ValueError("Invalid axis (%d) specified" % ai)
-        if si < shp_i:
-            if no_trim:
-                ind = [slice(None, None, None),] * len(arr_shape)
-            no_trim = False
-            ind[ai] = slice(None, si, None)
-    if no_trim:
-        return arr
-    return arr[tuple(ind)]
-
-
-def _fix_dimensions(cnp.ndarray arr, object s, object axes):
-    """Pads array arr with zeros to attain shape s associated with axes"""
-    arr_shape = (<object> arr).shape
-    no_trim = True
-    for si, ai in zip(s, axes):
-        try:
-            shp_i = arr_shape[ai]
-        except IndexError:
-            raise ValueError("Invalid axis (%d) specified" % ai)
-        if si > shp_i:
-            if no_trim:
-                pad_widths = [(0, 0),] * len(arr_shape)
-            no_trim = False
-            pad_widths[ai] = (0, si - shp_i)
-    if no_trim:
-        return arr
-    return np.pad(arr, tuple(pad_widths), "constant")
-
-
-def rfftn(x, s=None, axes=None, fwd_scale=1.0, out=None):
-    a = np.asarray(x)
-    no_trim = (s is None) and (axes is None)
-    s, axes = _cook_nd_args(a, s, axes)
-    la = axes[-1]
-    # trim array, so that rfft avoids doing unnecessary computations
-    if not no_trim:
-        a = _trim_array(a, s, axes)
-    # r2c along last axis
-    a = rfft(a, n = s[-1], axis=la, fwd_scale=fwd_scale, out=out)
-    if len(s) > 1:
-        if not no_trim:
-            ss = list(s)
-            ss[-1] = a.shape[la]
-            a = _fix_dimensions(a, tuple(ss), axes)
-        len_axes = len(axes)
-        if len(set(axes)) == len_axes and len_axes == a.ndim and len_axes > 2:
-            # a series of ND c2c FFTs along last axis
-            ss, aa = _remove_axis(s, axes, -1)
-            ind = [slice(None, None, 1),] * len(s)
-            for ii in range(a.shape[la]):
-                ind[la] = ii
-                tind = tuple(ind)
-                a_inp = a[tind]
-                res = out[tind] if out is not None else None
-                a_res = _fftnd_impl(
-                    a_inp, s=ss, axes=aa,
-                    overwrite_x=True, direction=1, out=res)
-                if a_res is not a_inp:
-                    a[tind] = a_res  # copy in place
-        else:
-            # a series of 1D c2c FFTs along all axes except last
-            for ii in range(len(axes) - 2, -1, -1):
-                a = fft(a, s[ii], axes[ii], overwrite_x=True)
-    return a
-
-
-def irfftn(x, s=None, axes=None, fwd_scale=1.0, out=None):
-    a = np.asarray(x)
-    no_trim = (s is None) and (axes is None)
-    s, axes = _cook_nd_args(a, s, axes, invreal=True)
-    la = axes[-1]
-    if not no_trim:
-        a = _trim_array(a, s, axes)
-    if len(s) > 1:
-        if not no_trim:
-            a = _fix_dimensions(a, s, axes)
-        ovr_x = True if _datacopied(<cnp.ndarray> a, x) else False
-        len_axes = len(axes)
-        if len(set(axes)) == len_axes and len_axes == a.ndim and len_axes > 2:
-            # a series of ND c2c FFTs along last axis
-            # due to need to write into a, we must copy
-            if not ovr_x:
-                a = a.copy()
-                ovr_x = True
-            if not np.issubdtype(a.dtype, np.complexfloating):
-                # complex output will be copied to input, copy is needed
-                if a.dtype == np.float32:
-                    a = a.astype(np.complex64)
-                else:
-                    a = a.astype(np.complex128)
-                ovr_x = True
-            ss, aa = _remove_axis(s, axes, -1)
-            ind = [slice(None, None, 1),] * len(s)
-            for ii in range(a.shape[la]):
-                ind[la] = ii
-                tind = tuple(ind)
-                a_inp = a[tind]
-                # out has real dtype and cannot be used in intermediate steps
-                a_res = _fftnd_impl(
-                    a_inp, s=ss, axes=aa,
-                    overwrite_x=True, direction=-1)
-                if a_res is not a_inp:
-                    a[tind] = a_res  # copy in place
-        else:
-            # a series of 1D c2c FFTs along all axes except last
-            for ii in range(len(axes)-1):
-                # out has real dtype and cannot be used in intermediate steps
-                a = ifft(a, s[ii], axes[ii], overwrite_x=ovr_x)
-                ovr_x = True
-    # c2r along last axis
-    a = irfft(a, n = s[-1], axis=la, fwd_scale=fwd_scale, out=out)
-    return a
+def irfftpack(x, n=None, axis=-1, overwrite_x=False, fwd_scale=1.0):
+    """IFFT of a real sequence, takes packed real-valued harmonics of FFT"""
+    return _rr_ifft1d_impl(
+        x, n=n, axis=axis, overwrite_x=overwrite_x, fsc=fwd_scale
+    )
diff --git a/mkl_fft/_float_utils.py b/mkl_fft/interfaces/_float_utils.py
similarity index 92%
rename from mkl_fft/_float_utils.py
rename to mkl_fft/interfaces/_float_utils.py
index cdfb4d6..19e044d 100644
--- a/mkl_fft/_float_utils.py
+++ b/mkl_fft/interfaces/_float_utils.py
@@ -26,16 +26,10 @@
 
 import numpy as np
 
-__all__ = [
-    "_upcast_float16_array",
-    "_downcast_float128_array",
-    "_supported_array_or_not_implemented",
-]
-
 
 def _upcast_float16_array(x):
     """
-    Used in _scipy_fft to upcast float16 to float32,
+    Used in _scipy_fftpack to upcast float16 to float32,
     instead of float64, as mkl_fft would do
     """
     if hasattr(x, "dtype"):
@@ -81,8 +75,8 @@ def _downcast_float128_array(x):
 
 def _supported_array_or_not_implemented(x):
     """
-    Used in _scipy_fft to convert array to float32,
-    float64, complex64, or complex128 type or raise NotImplementedError
+    Used in _scipy_fft to convert array to float32, float64, complex64,
+    or complex128 type or raise NotImplementedError
     """
     _x = np.asarray(x)
     black_list = [np.half]
diff --git a/mkl_fft/_numpy_fft.py b/mkl_fft/interfaces/_numpy_fft.py
similarity index 96%
rename from mkl_fft/_numpy_fft.py
rename to mkl_fft/interfaces/_numpy_fft.py
index 9ea1203..8557213 100644
--- a/mkl_fft/_numpy_fft.py
+++ b/mkl_fft/interfaces/_numpy_fft.py
@@ -29,32 +29,33 @@
 in the backend.
 """
 
+import re
+import warnings
+
+import numpy as np
+
+import mkl_fft
+
+from .._fft_utils import _compute_fwd_scale, _swap_direction
+from ._float_utils import _downcast_float128_array
+
 __all__ = [
     "fft",
     "ifft",
-    "rfft",
-    "irfft",
-    "hfft",
-    "ihfft",
-    "rfftn",
-    "irfftn",
-    "rfft2",
-    "irfft2",
     "fft2",
     "ifft2",
     "fftn",
     "ifftn",
+    "rfft",
+    "irfft",
+    "rfft2",
+    "irfft2",
+    "rfftn",
+    "irfftn",
+    "hfft",
+    "ihfft",
 ]
 
-import re
-import warnings
-
-import numpy as np
-
-from . import _pydfti as mkl_fft  # pylint: disable=no-name-in-module
-from ._fft_utils import _compute_fwd_scale, _swap_direction
-from ._float_utils import _downcast_float128_array
-
 
 # copied with modifications from:
 # https://github.com/numpy/numpy/blob/main/numpy/fft/_pocketfft.py
@@ -100,7 +101,7 @@ def _cook_nd_args(a, s=None, axes=None, invreal=False):
     return s, axes
 
 
-def trycall(func, args, kwrds):
+def _trycall(func, args, kwrds):
     try:
         res = func(*args, **kwrds)
     except ValueError as ve:
@@ -121,7 +122,7 @@ def fft(a, n=None, axis=-1, norm=None, out=None):
     x = _downcast_float128_array(a)
     fsc = _compute_fwd_scale(norm, n, x.shape[axis])
 
-    return trycall(
+    return _trycall(
         mkl_fft.fft, (x,), {"n": n, "axis": axis, "fwd_scale": fsc, "out": out}
     )
 
@@ -136,135 +137,115 @@ def ifft(a, n=None, axis=-1, norm=None, out=None):
     x = _downcast_float128_array(a)
     fsc = _compute_fwd_scale(norm, n, x.shape[axis])
 
-    return trycall(
+    return _trycall(
         mkl_fft.ifft, (x,), {"n": n, "axis": axis, "fwd_scale": fsc, "out": out}
     )
 
 
-def rfft(a, n=None, axis=-1, norm=None, out=None):
+def fft2(a, s=None, axes=(-2, -1), norm=None, out=None):
     """
-    Compute the one-dimensional discrete Fourier Transform for real input.
+    Compute the 2-dimensional discrete Fourier Transform
 
-    For full documentation refer to `numpy.fft.rfft`.
+    For full documentation refer to `numpy.fft.fft2`.
 
     """
-    x = _downcast_float128_array(a)
-    fsc = _compute_fwd_scale(norm, n, x.shape[axis])
-
-    return trycall(
-        mkl_fft.rfft, (x,), {"n": n, "axis": axis, "fwd_scale": fsc, "out": out}
-    )
+    return fftn(a, s=s, axes=axes, norm=norm, out=out)
 
 
-def irfft(a, n=None, axis=-1, norm=None, out=None):
+def ifft2(a, s=None, axes=(-2, -1), norm=None, out=None):
     """
-    Compute the inverse of `rfft`.
+    Compute the 2-dimensional inverse discrete Fourier Transform.
 
-    For full documentation refer to `numpy.fft.irfft`.
+    For full documentation refer to `numpy.fft.ifft2`.
 
     """
-    x = _downcast_float128_array(a)
-    fsc = _compute_fwd_scale(norm, n, 2 * (x.shape[axis] - 1))
-
-    return trycall(
-        mkl_fft.irfft,
-        (x,),
-        {"n": n, "axis": axis, "fwd_scale": fsc, "out": out},
-    )
+    return ifftn(a, s=s, axes=axes, norm=norm, out=out)
 
 
-def hfft(a, n=None, axis=-1, norm=None, out=None):
+def fftn(a, s=None, axes=None, norm=None, out=None):
     """
-    Compute the FFT of a signal which has Hermitian symmetry,
-    i.e., a real spectrum..
+    Compute the N-dimensional discrete Fourier Transform.
 
-    For full documentation refer to `numpy.fft.hfft`.
+    For full documentation refer to `numpy.fft.fftn`.
 
     """
-    norm = _swap_direction(norm)
     x = _downcast_float128_array(a)
-    x = np.array(x, copy=True)
-    np.conjugate(x, out=x)
-    fsc = _compute_fwd_scale(norm, n, 2 * (x.shape[axis] - 1))
+    s, axes = _cook_nd_args(x, s, axes)
+    fsc = _compute_fwd_scale(norm, s, x.shape)
 
-    return trycall(
-        mkl_fft.irfft,
-        (x,),
-        {"n": n, "axis": axis, "fwd_scale": fsc, "out": out},
+    return _trycall(
+        mkl_fft.fftn, (x,), {"s": s, "axes": axes, "fwd_scale": fsc, "out": out}
     )
 
 
-def ihfft(a, n=None, axis=-1, norm=None, out=None):
+def ifftn(a, s=None, axes=None, norm=None, out=None):
     """
-    Compute the inverse FFT of a signal which has Hermitian symmetry.
+    Compute the N-dimensional inverse discrete Fourier Transform.
 
-    For full documentation refer to `numpy.fft.ihfft`.
+    For full documentation refer to `numpy.fft.ifftn`.
 
     """
-    norm = _swap_direction(norm)
     x = _downcast_float128_array(a)
-    fsc = _compute_fwd_scale(norm, n, x.shape[axis])
+    s, axes = _cook_nd_args(x, s, axes)
+    fsc = _compute_fwd_scale(norm, s, x.shape)
 
-    output = trycall(
-        mkl_fft.rfft, (x,), {"n": n, "axis": axis, "fwd_scale": fsc, "out": out}
+    return _trycall(
+        mkl_fft.ifftn,
+        (x,),
+        {"s": s, "axes": axes, "fwd_scale": fsc, "out": out},
     )
 
-    np.conjugate(output, out=output)
-    return output
-
 
-def fftn(a, s=None, axes=None, norm=None, out=None):
+def rfft(a, n=None, axis=-1, norm=None, out=None):
     """
-    Compute the N-dimensional discrete Fourier Transform.
+    Compute the one-dimensional discrete Fourier Transform for real input.
 
-    For full documentation refer to `numpy.fft.fftn`.
+    For full documentation refer to `numpy.fft.rfft`.
 
     """
     x = _downcast_float128_array(a)
-    s, axes = _cook_nd_args(x, s, axes)
-    fsc = _compute_fwd_scale(norm, s, x.shape)
+    fsc = _compute_fwd_scale(norm, n, x.shape[axis])
 
-    return trycall(
-        mkl_fft.fftn, (x,), {"s": s, "axes": axes, "fwd_scale": fsc, "out": out}
+    return _trycall(
+        mkl_fft.rfft, (x,), {"n": n, "axis": axis, "fwd_scale": fsc, "out": out}
     )
 
 
-def ifftn(a, s=None, axes=None, norm=None, out=None):
+def irfft(a, n=None, axis=-1, norm=None, out=None):
     """
-    Compute the N-dimensional inverse discrete Fourier Transform.
+    Compute the inverse of `rfft`.
 
-    For full documentation refer to `numpy.fft.ifftn`.
+    For full documentation refer to `numpy.fft.irfft`.
 
     """
     x = _downcast_float128_array(a)
-    s, axes = _cook_nd_args(x, s, axes)
-    fsc = _compute_fwd_scale(norm, s, x.shape)
+    fsc = _compute_fwd_scale(norm, n, 2 * (x.shape[axis] - 1))
 
-    return trycall(
-        mkl_fft.ifftn,
+    return _trycall(
+        mkl_fft.irfft,
         (x,),
-        {"s": s, "axes": axes, "fwd_scale": fsc, "out": out},
+        {"n": n, "axis": axis, "fwd_scale": fsc, "out": out},
     )
 
 
-def fft2(a, s=None, axes=(-2, -1), norm=None, out=None):
+def rfft2(a, s=None, axes=(-2, -1), norm=None, out=None):
     """
-    Compute the 2-dimensional discrete Fourier Transform
+    Compute the 2-dimensional FFT of a real array.
 
-    For full documentation refer to `numpy.fft.fft2`.
+    For full documentation refer to `numpy.fft.rfft2`.
 
     """
-    return fftn(a, s=s, axes=axes, norm=norm, out=out)
+    return rfftn(a, s=s, axes=axes, norm=norm, out=out)
 
 
-def ifft2(a, s=None, axes=(-2, -1), norm=None, out=None):
+def irfft2(a, s=None, axes=(-2, -1), norm=None, out=None):
     """
-    Compute the 2-dimensional inverse discrete Fourier Transform.
+    Compute the inverse FFT of `rfft2`.
 
-    For full documentation refer to `numpy.fft.ifft2`.
+    For full documentation refer to `numpy.fft.irfft2`.
 
     """
-    return ifftn(a, s=s, axes=axes, norm=norm, out=out)
+    return irfftn(a, s=s, axes=axes, norm=norm, out=out)
 
 
 def rfftn(a, s=None, axes=None, norm=None, out=None):
@@ -278,23 +259,13 @@ def rfftn(a, s=None, axes=None, norm=None, out=None):
     s, axes = _cook_nd_args(x, s, axes)
     fsc = _compute_fwd_scale(norm, s, x.shape)
 
-    return trycall(
+    return _trycall(
         mkl_fft.rfftn,
         (x,),
         {"s": s, "axes": axes, "fwd_scale": fsc, "out": out},
     )
 
 
-def rfft2(a, s=None, axes=(-2, -1), norm=None, out=None):
-    """
-    Compute the 2-dimensional FFT of a real array.
-
-    For full documentation refer to `numpy.fft.rfft2`.
-
-    """
-    return rfftn(a, s=s, axes=axes, norm=norm, out=out)
-
-
 def irfftn(a, s=None, axes=None, norm=None, out=None):
     """
     Compute the inverse of `rfftn`.
@@ -307,18 +278,48 @@ def irfftn(a, s=None, axes=None, norm=None, out=None):
     s, axes = _cook_nd_args(x, s, axes, invreal=True)
     fsc = _compute_fwd_scale(norm, s, x.shape)
 
-    return trycall(
+    return _trycall(
         mkl_fft.irfftn,
         (x,),
         {"s": s, "axes": axes, "fwd_scale": fsc, "out": out},
     )
 
 
-def irfft2(a, s=None, axes=(-2, -1), norm=None, out=None):
+def hfft(a, n=None, axis=-1, norm=None, out=None):
     """
-    Compute the inverse FFT of `rfft2`.
+    Compute the FFT of a signal which has Hermitian symmetry,
+    i.e., a real spectrum..
 
-    For full documentation refer to `numpy.fft.irfft2`.
+    For full documentation refer to `numpy.fft.hfft`.
 
     """
-    return irfftn(a, s=s, axes=axes, norm=norm, out=out)
+    norm = _swap_direction(norm)
+    x = _downcast_float128_array(a)
+    x = np.array(x, copy=True)
+    np.conjugate(x, out=x)
+    fsc = _compute_fwd_scale(norm, n, 2 * (x.shape[axis] - 1))
+
+    return _trycall(
+        mkl_fft.irfft,
+        (x,),
+        {"n": n, "axis": axis, "fwd_scale": fsc, "out": out},
+    )
+
+
+def ihfft(a, n=None, axis=-1, norm=None, out=None):
+    """
+    Compute the inverse FFT of a signal which has Hermitian symmetry.
+
+    For full documentation refer to `numpy.fft.ihfft`.
+
+    """
+    norm = _swap_direction(norm)
+    x = _downcast_float128_array(a)
+    fsc = _compute_fwd_scale(norm, n, x.shape[axis])
+
+    output = _trycall(
+        mkl_fft.rfft, (x,), {"n": n, "axis": axis, "fwd_scale": fsc, "out": out}
+    )
+
+    np.conjugate(output, out=output)
+    return output
diff --git a/mkl_fft/_scipy_fft.py b/mkl_fft/interfaces/_scipy_fft.py
similarity index 94%
rename from mkl_fft/_scipy_fft.py
rename to mkl_fft/interfaces/_scipy_fft.py
index 7a5f658..e310da1 100644
--- a/mkl_fft/_scipy_fft.py
+++ b/mkl_fft/interfaces/_scipy_fft.py
@@ -37,10 +37,36 @@
 import mkl
 import numpy as np
 
-from . import _pydfti as mkl_fft  # pylint: disable=no-name-in-module
-from ._fft_utils import _compute_fwd_scale, _swap_direction
+import mkl_fft
+
+from .._fft_utils import _compute_fwd_scale, _swap_direction
 from ._float_utils import _supported_array_or_not_implemented
 
+__all__ = [
+    "fft",
+    "ifft",
+    "fft2",
+    "ifft2",
+    "fftn",
+    "ifftn",
+    "rfft",
+    "irfft",
+    "rfft2",
+    "irfft2",
+    "rfftn",
+    "irfftn",
+    "hfft",
+    "ihfft",
+    "hfft2",
+    "ihfft2",
+    "hfftn",
+    "ihfftn",
+    "get_workers",
+    "set_workers",
+    "DftiBackend",
+]
+
+
 __doc__ = """
 This module implements interfaces mimicking `scipy.fft` module.
 
@@ -49,12 +75,35 @@
 
 :Example:
     import scipy.fft
-    import mkl_fft._scipy_fft as be
+    import mkl_fft.interfaces._scipy_fft as mkl_be
     # Set mkl_fft to be used as backend of SciPy's FFT functions.
-    scipy.fft.set_global_backend(be)
+    scipy.fft.set_global_backend(mkl_be)
 """
 
 
+__ua_domain__ = "numpy.scipy.fft"
+
+
+def __ua_function__(method, args, kwargs):
+    """Fetch registered UA function."""
+    fn = globals().get(method.__name__, None)
+    if fn is None:
+        return NotImplemented
+    return fn(*args, **kwargs)
+
+
+class DftiBackend:
+    __ua_domain__ = "numpy.scipy.fft"
+
+    @staticmethod
+    def __ua_function__(method, args, kwargs):
+        """Fetch registered UA function."""
+        fn = globals().get(method.__name__, None)
+        if fn is None:
+            return NotImplemented
+        return fn(*args, **kwargs)
+
+
 class _cpu_max_threads_count:
     def __init__(self):
         self.cpu_count = None
@@ -96,74 +145,6 @@ def workers(self, workers_val):
 )
 
 
-def get_workers():
-    """Gets the number of workers used by mkl_fft by default"""
-    return _workers_global_settings.get().workers
-
-
-@contextlib.contextmanager
-def set_workers(n_workers):
-    """Set the value of workers used by default, returns the previous value"""
-    nw = operator.index(n_workers)
-    token = None
-    try:
-        new_wd = _workers_data(nw)
-        token = _workers_global_settings.set(new_wd)
-        yield
-    finally:
-        if token:
-            _workers_global_settings.reset(token)
-        else:
-            raise ValueError
-
-
-__all__ = [
-    "fft",
-    "ifft",
-    "fft2",
-    "ifft2",
-    "fftn",
-    "ifftn",
-    "rfft",
-    "irfft",
-    "rfft2",
-    "irfft2",
-    "rfftn",
-    "irfftn",
-    "hfft",
-    "ihfft",
-    "hfft2",
-    "ihfft2",
-    "hfftn",
-    "ihfftn",
-    "get_workers",
-    "set_workers",
-    "DftiBackend",
-]
-
-__ua_domain__ = "numpy.scipy.fft"
-
-
-def __ua_function__(method, args, kwargs):
-    """Fetch registered UA function."""
-    fn = globals().get(method.__name__, None)
-    if fn is None:
-        return NotImplemented
-    return fn(*args, **kwargs)
-
-
-class DftiBackend:
-    __ua_domain__ = "numpy.scipy.fft"
-
-    @staticmethod
-    def __ua_function__(method, args, kwargs):
-        """Fetch registered UA function."""
-        fn = globals().get(method.__name__, None)
-        if fn is None:
-            return NotImplemented
-        return fn(*args, **kwargs)
-
-
 def _workers_to_num_threads(w):
     """Handle conversion of workers to a positive number of threads in the
     same way as scipy.fft.helpers._workers.
@@ -184,7 +165,7 @@ def _workers_to_num_threads(w):
     return _w
 
 
-class Workers:
+class _Workers:
     def __init__(self, workers):
         self.workers = workers
         self.n_threads = _workers_to_num_threads(workers)
@@ -261,7 +242,7 @@ def fft(
     x = _validate_input(x)
     fsc = _compute_fwd_scale(norm, n, x.shape[axis])
 
-    with Workers(workers):
+    with _Workers(workers):
         return mkl_fft.fft(
             x, n=n, axis=axis, overwrite_x=overwrite_x, fwd_scale=fsc
         )
@@ -280,7 +261,7 @@ def ifft(
     x = _validate_input(x)
     fsc = _compute_fwd_scale(norm, n, x.shape[axis])
 
-    with Workers(workers):
+    with _Workers(workers):
         return mkl_fft.ifft(
             x, n=n, axis=axis, overwrite_x=overwrite_x, fwd_scale=fsc
         )
@@ -358,9 +339,10 @@ def fftn(
     """
     _check_plan(plan)
     x = _validate_input(x)
+    s, axes = _cook_nd_args(x, s, axes)
     fsc = _compute_fwd_scale(norm, s, x.shape)
 
-    with Workers(workers):
+    with _Workers(workers):
         return mkl_fft.fftn(
             x, s=s, axes=axes, overwrite_x=overwrite_x, fwd_scale=fsc
         )
@@ -384,9 +366,10 @@ def ifftn(
     """
     _check_plan(plan)
     x = _validate_input(x)
+    s, axes = _cook_nd_args(x, s, axes)
     fsc = _compute_fwd_scale(norm, s, x.shape)
 
-    with Workers(workers):
+    with _Workers(workers):
         return mkl_fft.ifftn(
             x, s=s, axes=axes, overwrite_x=overwrite_x, fwd_scale=fsc
         )
@@ -410,7 +393,7 @@ def rfft(
     x = _validate_input(x)
     fsc = _compute_fwd_scale(norm, n, x.shape[axis])
 
-    with Workers(workers):
+    with _Workers(workers):
         return mkl_fft.rfft(x, n=n, axis=axis, fwd_scale=fsc)
 
 
@@ -432,7 +415,7 @@ def irfft(
     x = _validate_input(x)
     fsc = _compute_fwd_scale(norm, n, 2 * (x.shape[axis] - 1))
 
-    with Workers(workers):
+    with _Workers(workers):
         return mkl_fft.irfft(x, n=n, axis=axis, fwd_scale=fsc)
 
 
@@ -524,7 +507,7 @@ def rfftn(
     s, axes = _cook_nd_args(x, s, axes)
     fsc = _compute_fwd_scale(norm, s, x.shape)
 
-    with Workers(workers):
+    with _Workers(workers):
         return mkl_fft.rfftn(x, s, axes, fwd_scale=fsc)
 
 
@@ -554,7 +537,7 @@ def irfftn(
     s, axes = _cook_nd_args(x, s, axes, invreal=True)
     fsc = _compute_fwd_scale(norm, s, x.shape)
 
-    with Workers(workers):
+    with _Workers(workers):
         return mkl_fft.irfftn(x, s, axes, fwd_scale=fsc)
 
 
@@ -580,7 +563,7 @@ def hfft(
     np.conjugate(x, out=x)
     fsc = _compute_fwd_scale(norm, n, 2 * (x.shape[axis] - 1))
 
-    with Workers(workers):
+    with _Workers(workers):
         return mkl_fft.irfft(x, n=n, axis=axis, fwd_scale=fsc)
 
 
@@ -603,7 +586,7 @@ def ihfft(
     norm = _swap_direction(norm)
     fsc = _compute_fwd_scale(norm, n, x.shape[axis])
 
-    with Workers(workers):
+    with _Workers(workers):
         result = mkl_fft.rfft(x, n=n, axis=axis, fwd_scale=fsc)
 
     np.conjugate(result, out=result)
@@ -683,7 +666,8 @@ def hfftn(
     plan=None,
 ):
     """
-    Compute the N-D FFT of Hermitian symmetric complex input, i.e., a signal with a real spectrum.
+    Compute the N-D FFT of Hermitian symmetric complex input,
+    i.e., a signal with a real spectrum.
 
     For full documentation refer to `scipy.fft.hfftn`.
 
@@ -701,7 +685,7 @@ def hfftn(
     s, axes = _cook_nd_args(x, s, axes, invreal=True)
     fsc = _compute_fwd_scale(norm, s, x.shape)
 
-    with Workers(workers):
+    with _Workers(workers):
         return mkl_fft.irfftn(x, s, axes, fwd_scale=fsc)
 
 
@@ -732,8 +716,39 @@ def ihfftn(
     s, axes = _cook_nd_args(x, s, axes)
     fsc = _compute_fwd_scale(norm, s, x.shape)
 
-    with Workers(workers):
+    with _Workers(workers):
         result = mkl_fft.rfftn(x, s, axes, fwd_scale=fsc)
 
     np.conjugate(result, out=result)
     return result
+
+
+def get_workers():
+    """
+    Gets the number of workers used by mkl_fft by default.
+
+    For full documentation refer to `scipy.fft.get_workers`.
+
+    """
+    return _workers_global_settings.get().workers
+
+
+@contextlib.contextmanager
+def set_workers(n_workers):
+    """
+    Set the value of workers used by default, returns the previous value.
+
+    For full documentation refer to `scipy.fft.set_workers`.
+
+    """
+    nw = operator.index(n_workers)
+    token = None
+    try:
+        new_wd = _workers_data(nw)
+        token = _workers_global_settings.set(new_wd)
+        yield
+    finally:
+        if token:
+            _workers_global_settings.reset(token)
+        else:
+            raise ValueError
diff --git a/mkl_fft/_scipy_fftpack.py b/mkl_fft/interfaces/_scipy_fftpack.py
similarity index 97%
rename from mkl_fft/_scipy_fftpack.py
rename to mkl_fft/interfaces/_scipy_fftpack.py
index 806dae0..a4cd27d 100644
--- a/mkl_fft/_scipy_fftpack.py
+++ b/mkl_fft/interfaces/_scipy_fftpack.py
@@ -24,7 +24,8 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-from . import _pydfti as mkl_fft  # pylint: disable=no-name-in-module
+import mkl_fft
+
 from ._float_utils import _upcast_float16_array
 
 __all__ = ["fft", "ifft", "fftn", "ifftn", "fft2", "ifft2", "rfft", "irfft"]
diff --git a/mkl_fft/interfaces/numpy_fft.py b/mkl_fft/interfaces/numpy_fft.py
index 4dd420f..e30aa78 100644
--- a/mkl_fft/interfaces/numpy_fft.py
+++ b/mkl_fft/interfaces/numpy_fft.py
@@ -24,4 +24,4 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-from .._numpy_fft import *
+from ._numpy_fft import *
diff --git a/mkl_fft/interfaces/scipy_fft.py b/mkl_fft/interfaces/scipy_fft.py
index a4dd615..0b77e4a 100644
--- a/mkl_fft/interfaces/scipy_fft.py
+++ b/mkl_fft/interfaces/scipy_fft.py
@@ -24,4 +24,4 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-from .._scipy_fft import *
+from ._scipy_fft import *
diff --git a/mkl_fft/tests/test_fftnd.py b/mkl_fft/tests/test_fftnd.py
index f5a744d..ef54f1a 100644
--- a/mkl_fft/tests/test_fftnd.py
+++ b/mkl_fft/tests/test_fftnd.py
@@ -158,8 +158,9 @@ def test_matrix6(self):
                     s = container(d.shape)
                     kwargs = dict(s=s, axes=axes, norm=norm)
                     r_tol, a_tol = _get_rtol_atol(d)
-                    t = mkl_fft._numpy_fft.fftn(
-                        mkl_fft._numpy_fft.ifftn(d, **kwargs), **kwargs
+                    t = mkl_fft.interfaces.numpy_fft.fftn(
+                        mkl_fft.interfaces.numpy_fft.ifftn(d, **kwargs),
+                        **kwargs,
                     )
                     assert_allclose(
                         d,