diff --git a/quaddtype/README.md b/quaddtype/README.md
index e69de29b..60693cb9 100644
--- a/quaddtype/README.md
+++ b/quaddtype/README.md
@@ -0,0 +1,22 @@
+# Numpy-QuadDType
+
+## Installation
+
+```bash
+pip install numpy==2.1.0
+pip install -i https://test.pypi.org/simple/ quaddtype
+```
+
+## Usage
+
+```python
+import numpy as np
+from numpy_quaddtype import QuadPrecDType, QuadPrecision
+
+# using the SLEEF backend (default)
+np.array([1,2,3], dtype=QuadPrecDType())
+np.array([1,2,3], dtype=QuadPrecDType("sleef"))
+
+# using the longdouble backend
+np.array([1,2,3], dtype=QuadPrecDType("longdouble"))
+```
diff --git a/quaddtype/meson.build b/quaddtype/meson.build
index f1f9eab1..d6e651b8 100644
--- a/quaddtype/meson.build
+++ b/quaddtype/meson.build
@@ -1,4 +1,4 @@
-project('quaddtype', 'c', 'cpp', default_options : ['cpp_std=c++17', 'b_pie=true'])
+project('numpy_quaddtype', 'c', 'cpp', default_options : ['cpp_std=c++17', 'b_pie=true'])
 
 py_mod = import('python')
 py = py_mod.find_installation()
@@ -19,30 +19,33 @@ incdir_numpy = run_command(py,
 
 includes = include_directories(
     [
         incdir_numpy,
-        'quaddtype/src',
+        'numpy_quaddtype/src',
     ]
 )
 
 srcs = [
-    'quaddtype/src/casts.h',
-    'quaddtype/src/casts.cpp',
-    'quaddtype/src/scalar.h',
-    'quaddtype/src/scalar.c',
-    'quaddtype/src/dtype.h',
-    'quaddtype/src/dtype.c',
-    'quaddtype/src/quaddtype_main.c',
-    'quaddtype/src/scalar_ops.h',
-    'quaddtype/src/scalar_ops.cpp',
-    'quaddtype/src/ops.hpp',
-    'quaddtype/src/umath.h',
-    'quaddtype/src/umath.cpp'
+    'numpy_quaddtype/src/quad_common.h',
+    'numpy_quaddtype/src/casts.h',
+    'numpy_quaddtype/src/casts.cpp',
+    'numpy_quaddtype/src/scalar.h',
+    'numpy_quaddtype/src/scalar.c',
+    'numpy_quaddtype/src/dtype.h',
+    'numpy_quaddtype/src/dtype.c',
+    'numpy_quaddtype/src/quaddtype_main.c',
+    'numpy_quaddtype/src/scalar_ops.h',
+    'numpy_quaddtype/src/scalar_ops.cpp',
+    'numpy_quaddtype/src/ops.hpp',
+    'numpy_quaddtype/src/umath.h',
+    'numpy_quaddtype/src/umath.cpp',
+    'numpy_quaddtype/src/dragon4.h',
+    'numpy_quaddtype/src/dragon4.c'
 ]
 
 py.install_sources(
     [
-        'quaddtype/__init__.py',
+        'numpy_quaddtype/__init__.py',
     ],
-    subdir: 'quaddtype',
+    subdir: 'numpy_quaddtype',
     pure: false
 )
@@ -51,6 +54,6 @@ srcs,
 c_args: ['-g', '-O0', '-lsleef', '-lsleefquad'],
 dependencies: [sleef_dep, sleefquad_dep],
 install: true,
-subdir: 'quaddtype',
+subdir: 'numpy_quaddtype',
 include_directories: includes
 )
\ No newline at end of file
diff --git a/quaddtype/numpy_quaddtype/__init__.py b/quaddtype/numpy_quaddtype/__init__.py
new file mode 100644
index 00000000..e469a4c1
--- /dev/null
+++ b/quaddtype/numpy_quaddtype/__init__.py
@@ -0,0 +1,34 @@
+from ._quaddtype_main import (
+    QuadPrecision,
+    QuadPrecDType,
+    is_longdouble_128,
+    get_sleef_constant
+)
+
+__all__ = [
+    'QuadPrecision', 'QuadPrecDType', 'SleefQuadPrecision', 'LongDoubleQuadPrecision',
+    'SleefQuadPrecDType', 'LongDoubleQuadPrecDType', 'is_longdouble_128', 'pi', 'e',
+    'log2e', 'log10e', 'ln2', 'ln10', 'max_value', 'min_value', 'epsilon'
+]
+
+def SleefQuadPrecision(value):
+    return QuadPrecision(value, backend='sleef')
+
+def LongDoubleQuadPrecision(value):
+    return QuadPrecision(value, backend='longdouble')
+
+def SleefQuadPrecDType():
+    return QuadPrecDType(backend='sleef')
+
+def LongDoubleQuadPrecDType():
+    return QuadPrecDType(backend='longdouble')
+
+pi = get_sleef_constant("pi")
+e = get_sleef_constant("e")
+log2e = get_sleef_constant("log2e")
+log10e = get_sleef_constant("log10e")
+ln2 = get_sleef_constant("ln2")
+ln10 = get_sleef_constant("ln10")
get_sleef_constant("quad_max") +min_value = get_sleef_constant("quad_min") +epsilon = get_sleef_constant("epsilon") \ No newline at end of file diff --git a/quaddtype/numpy_quaddtype/src/casts.cpp b/quaddtype/numpy_quaddtype/src/casts.cpp new file mode 100644 index 00000000..c662b4d5 --- /dev/null +++ b/quaddtype/numpy_quaddtype/src/casts.cpp @@ -0,0 +1,798 @@ +#define PY_ARRAY_UNIQUE_SYMBOL QuadPrecType_ARRAY_API +#define PY_UFUNC_UNIQUE_SYMBOL QuadPrecType_UFUNC_API +#define NPY_NO_DEPRECATED_API NPY_2_0_API_VERSION +#define NPY_TARGET_VERSION NPY_2_0_API_VERSION +#define NO_IMPORT_ARRAY +#define NO_IMPORT_UFUNC + +extern "C" { +#include + +#include "numpy/arrayobject.h" +#include "numpy/ndarraytypes.h" +#include "numpy/dtype_api.h" +} +#include "sleef.h" +#include "sleefquad.h" + +#include "quad_common.h" +#include "scalar.h" +#include "casts.h" +#include "dtype.h" + +#define NUM_CASTS 29 // 14 to_casts + 14 from_casts + 1 quad_to_quad + +static NPY_CASTING +quad_to_quad_resolve_descriptors(PyObject *NPY_UNUSED(self), + PyArray_DTypeMeta *NPY_UNUSED(dtypes[2]), + QuadPrecDTypeObject *given_descrs[2], + QuadPrecDTypeObject *loop_descrs[2], npy_intp *view_offset) +{ + NPY_CASTING casting = NPY_NO_CASTING; + + if (given_descrs[0]->backend != given_descrs[1]->backend) + casting = NPY_UNSAFE_CASTING; + + Py_INCREF(given_descrs[0]); + loop_descrs[0] = given_descrs[0]; + + if (given_descrs[1] == NULL) { + Py_INCREF(given_descrs[0]); + loop_descrs[1] = given_descrs[0]; + } + else { + Py_INCREF(given_descrs[1]); + loop_descrs[1] = given_descrs[1]; + } + + *view_offset = 0; + return casting; +} + +static int +quad_to_quad_strided_loop_unaligned(PyArrayMethod_Context *context, char *const data[], + npy_intp const dimensions[], npy_intp const strides[], + void *NPY_UNUSED(auxdata)) +{ + npy_intp N = dimensions[0]; + char *in_ptr = data[0]; + char *out_ptr = data[1]; + npy_intp in_stride = strides[0]; + npy_intp out_stride = strides[1]; + + QuadPrecDTypeObject *descr_in = (QuadPrecDTypeObject *)context->descriptors[0]; + QuadPrecDTypeObject *descr_out = (QuadPrecDTypeObject *)context->descriptors[1]; + + // inter-backend casting + if (descr_in->backend != descr_out->backend) { + while (N--) { + quad_value in_val, out_val; + if (descr_in->backend == BACKEND_SLEEF) { + memcpy(&in_val.sleef_value, in_ptr, sizeof(Sleef_quad)); + out_val.longdouble_value = Sleef_cast_to_doubleq1(in_val.sleef_value); + } + else { + memcpy(&in_val.longdouble_value, in_ptr, sizeof(long double)); + out_val.sleef_value = Sleef_cast_from_doubleq1(in_val.longdouble_value); + } + memcpy(out_ptr, &out_val, + (descr_out->backend == BACKEND_SLEEF) ? sizeof(Sleef_quad) + : sizeof(long double)); + in_ptr += in_stride; + out_ptr += out_stride; + } + + return 0; + } + + size_t elem_size = + (descr_in->backend == BACKEND_SLEEF) ? 
+static int
+quad_to_quad_strided_loop_unaligned(PyArrayMethod_Context *context, char *const data[],
+                                    npy_intp const dimensions[], npy_intp const strides[],
+                                    void *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *in_ptr = data[0];
+    char *out_ptr = data[1];
+    npy_intp in_stride = strides[0];
+    npy_intp out_stride = strides[1];
+
+    QuadPrecDTypeObject *descr_in = (QuadPrecDTypeObject *)context->descriptors[0];
+    QuadPrecDTypeObject *descr_out = (QuadPrecDTypeObject *)context->descriptors[1];
+
+    // inter-backend casting
+    if (descr_in->backend != descr_out->backend) {
+        while (N--) {
+            quad_value in_val, out_val;
+            if (descr_in->backend == BACKEND_SLEEF) {
+                memcpy(&in_val.sleef_value, in_ptr, sizeof(Sleef_quad));
+                out_val.longdouble_value = Sleef_cast_to_doubleq1(in_val.sleef_value);
+            }
+            else {
+                memcpy(&in_val.longdouble_value, in_ptr, sizeof(long double));
+                out_val.sleef_value = Sleef_cast_from_doubleq1(in_val.longdouble_value);
+            }
+            memcpy(out_ptr, &out_val,
+                   (descr_out->backend == BACKEND_SLEEF) ? sizeof(Sleef_quad)
+                                                         : sizeof(long double));
+            in_ptr += in_stride;
+            out_ptr += out_stride;
+        }
+
+        return 0;
+    }
+
+    size_t elem_size =
+            (descr_in->backend == BACKEND_SLEEF) ? sizeof(Sleef_quad) : sizeof(long double);
+
+    while (N--) {
+        memcpy(out_ptr, in_ptr, elem_size);
+        in_ptr += in_stride;
+        out_ptr += out_stride;
+    }
+    return 0;
+}
+
+static int
+quad_to_quad_strided_loop_aligned(PyArrayMethod_Context *context, char *const data[],
+                                  npy_intp const dimensions[], npy_intp const strides[],
+                                  void *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *in_ptr = data[0];
+    char *out_ptr = data[1];
+    npy_intp in_stride = strides[0];
+    npy_intp out_stride = strides[1];
+
+    QuadPrecDTypeObject *descr_in = (QuadPrecDTypeObject *)context->descriptors[0];
+    QuadPrecDTypeObject *descr_out = (QuadPrecDTypeObject *)context->descriptors[1];
+
+    // inter-backend casting
+    if (descr_in->backend != descr_out->backend) {
+        if (descr_in->backend == BACKEND_SLEEF) {
+            while (N--) {
+                Sleef_quad in_val = *(Sleef_quad *)in_ptr;
+                *(long double *)out_ptr = Sleef_cast_to_doubleq1(in_val);
+                in_ptr += in_stride;
+                out_ptr += out_stride;
+            }
+        }
+        else {
+            while (N--) {
+                long double in_val = *(long double *)in_ptr;
+                *(Sleef_quad *)out_ptr = Sleef_cast_from_doubleq1(in_val);
+                in_ptr += in_stride;
+                out_ptr += out_stride;
+            }
+        }
+
+        return 0;
+    }
+
+    if (descr_in->backend == BACKEND_SLEEF) {
+        while (N--) {
+            *(Sleef_quad *)out_ptr = *(Sleef_quad *)in_ptr;
+            in_ptr += in_stride;
+            out_ptr += out_stride;
+        }
+    }
+    else {
+        while (N--) {
+            *(long double *)out_ptr = *(long double *)in_ptr;
+            in_ptr += in_stride;
+            out_ptr += out_stride;
+        }
+    }
+
+    return 0;
+}
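The `_aligned`/`_unaligned` pairs above (and throughout this file) exist because the cast specs are registered later with `NPY_METH_SUPPORTS_UNALIGNED`: NumPy may hand the unaligned loop a buffer where `*(Sleef_quad *)ptr` would be undefined behavior, and a byte-wise `memcpy` is the portable way to move such elements. The idiom, distilled into a runnable sketch:

```cpp
// Sketch of the unaligned load/store idiom used by the *_unaligned loops.
#include <cstring>
#include <cstdio>

template <typename T>
static T load_unaligned(const char *p)
{
    T v;
    std::memcpy(&v, p, sizeof(T));  // byte copy imposes no alignment requirement
    return v;
}

template <typename T>
static void store_unaligned(char *p, const T &v)
{
    std::memcpy(p, &v, sizeof(T));
}

int main()
{
    char buf[sizeof(double) + 1];
    store_unaligned(buf + 1, 3.14);  // deliberately misaligned address
    std::printf("%f\n", load_unaligned<double>(buf + 1));  // 3.140000
    return 0;
}
```

The aligned variants can dereference and assign directly, letting the compiler emit plain loads and stores.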
+// Casting from other types to QuadDType
+
+template <typename T>
+static inline quad_value
+to_quad(T x, QuadBackendType backend);
+
+template <>
+inline quad_value
+to_quad(npy_bool x, QuadBackendType backend)
+{
+    quad_value result;
+    if (backend == BACKEND_SLEEF) {
+        result.sleef_value = x ? Sleef_cast_from_doubleq1(1.0) : Sleef_cast_from_doubleq1(0.0);
+    }
+    else {
+        result.longdouble_value = x ? 1.0L : 0.0L;
+    }
+    return result;
+}
+
+template <>
+inline quad_value
+to_quad(npy_byte x, QuadBackendType backend)
+{
+    quad_value result;
+    if (backend == BACKEND_SLEEF) {
+        result.sleef_value = Sleef_cast_from_int64q1(x);
+    }
+    else {
+        result.longdouble_value = (long double)x;
+    }
+    return result;
+}
+
+template <>
+inline quad_value
+to_quad(npy_short x, QuadBackendType backend)
+{
+    quad_value result;
+    if (backend == BACKEND_SLEEF) {
+        result.sleef_value = Sleef_cast_from_int64q1(x);
+    }
+    else {
+        result.longdouble_value = (long double)x;
+    }
+    return result;
+}
+
+template <>
+inline quad_value
+to_quad(npy_ushort x, QuadBackendType backend)
+{
+    quad_value result;
+    if (backend == BACKEND_SLEEF) {
+        result.sleef_value = Sleef_cast_from_uint64q1(x);
+    }
+    else {
+        result.longdouble_value = (long double)x;
+    }
+    return result;
+}
+
+template <>
+inline quad_value
+to_quad(npy_int x, QuadBackendType backend)
+{
+    quad_value result;
+    if (backend == BACKEND_SLEEF) {
+        result.sleef_value = Sleef_cast_from_int64q1(x);
+    }
+    else {
+        result.longdouble_value = (long double)x;
+    }
+    return result;
+}
+
+template <>
+inline quad_value
+to_quad(npy_uint x, QuadBackendType backend)
+{
+    quad_value result;
+    if (backend == BACKEND_SLEEF) {
+        result.sleef_value = Sleef_cast_from_uint64q1(x);
+    }
+    else {
+        result.longdouble_value = (long double)x;
+    }
+    return result;
+}
+
+template <>
+inline quad_value
+to_quad(npy_long x, QuadBackendType backend)
+{
+    quad_value result;
+    if (backend == BACKEND_SLEEF) {
+        result.sleef_value = Sleef_cast_from_int64q1(x);
+    }
+    else {
+        result.longdouble_value = (long double)x;
+    }
+    return result;
+}
+
+template <>
+inline quad_value
+to_quad(npy_ulong x, QuadBackendType backend)
+{
+    quad_value result;
+    if (backend == BACKEND_SLEEF) {
+        result.sleef_value = Sleef_cast_from_uint64q1(x);
+    }
+    else {
+        result.longdouble_value = (long double)x;
+    }
+    return result;
+}
+
+template <>
+inline quad_value
+to_quad(npy_longlong x, QuadBackendType backend)
+{
+    quad_value result;
+    if (backend == BACKEND_SLEEF) {
+        result.sleef_value = Sleef_cast_from_int64q1(x);
+    }
+    else {
+        result.longdouble_value = (long double)x;
+    }
+    return result;
+}
+
+template <>
+inline quad_value
+to_quad(npy_ulonglong x, QuadBackendType backend)
+{
+    quad_value result;
+    if (backend == BACKEND_SLEEF) {
+        result.sleef_value = Sleef_cast_from_uint64q1(x);
+    }
+    else {
+        result.longdouble_value = (long double)x;
+    }
+    return result;
+}
+
+template <>
+inline quad_value
+to_quad(float x, QuadBackendType backend)
+{
+    quad_value result;
+    if (backend == BACKEND_SLEEF) {
+        result.sleef_value = Sleef_cast_from_doubleq1(x);
+    }
+    else {
+        result.longdouble_value = (long double)x;
+    }
+    return result;
+}
+
+template <>
+inline quad_value
+to_quad(double x, QuadBackendType backend)
+{
+    quad_value result;
+    if (backend == BACKEND_SLEEF) {
+        result.sleef_value = Sleef_cast_from_doubleq1(x);
+    }
+    else {
+        result.longdouble_value = (long double)x;
+    }
+    return result;
+}
+
+template <>
+inline quad_value
+to_quad(long double x, QuadBackendType backend)
+{
+    quad_value result;
+    if (backend == BACKEND_SLEEF) {
+        result.sleef_value = Sleef_cast_from_doubleq1(x);
+    }
+    else {
+        result.longdouble_value = x;
+    }
+    return result;
+}
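Each supported NumPy type gets one `to_quad` specialization, and the routing is mechanical: signed integers go through `Sleef_cast_from_int64q1`, unsigned integers through `Sleef_cast_from_uint64q1`, and floating-point types through `Sleef_cast_from_doubleq1`. Note that the `long double` specialization's SLEEF path therefore narrows to `double` on the way in; that is the data loss the TODO in `numpy_to_quad_resolve_descriptors` below alludes to. For illustration, a hypothetical `npy_half` specialization (half is not covered by this PR; `npy_half_to_double` is NumPy's converter from `numpy/halffloat.h`) would follow the same shape:

```cpp
// Hypothetical sketch only -- npy_half is not part of this diff.
template <>
inline quad_value
to_quad(npy_half x, QuadBackendType backend)
{
    quad_value result;
    double wide = npy_half_to_double(x);  // a half always fits in a double exactly
    if (backend == BACKEND_SLEEF) {
        result.sleef_value = Sleef_cast_from_doubleq1(wide);
    }
    else {
        result.longdouble_value = (long double)wide;
    }
    return result;
}
```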
+template <typename T>
+static NPY_CASTING
+numpy_to_quad_resolve_descriptors(PyObject *NPY_UNUSED(self), PyArray_DTypeMeta *dtypes[2],
+                                  PyArray_Descr *given_descrs[2], PyArray_Descr *loop_descrs[2],
+                                  npy_intp *view_offset)
+{
+    // TODO: with no output descriptor this defaults to the SLEEF backend,
+    // losing data and getting 0 for some long double inputs
+    if (given_descrs[1] == NULL) {
+        loop_descrs[1] = (PyArray_Descr *)new_quaddtype_instance(BACKEND_SLEEF);
+        if (loop_descrs[1] == nullptr) {
+            return (NPY_CASTING)-1;
+        }
+    }
+    else {
+        Py_INCREF(given_descrs[1]);
+        loop_descrs[1] = given_descrs[1];
+    }
+
+    loop_descrs[0] = PyArray_GetDefaultDescr(dtypes[0]);
+    return NPY_SAFE_CASTING;
+}
+
+template <typename T>
+static int
+numpy_to_quad_strided_loop_unaligned(PyArrayMethod_Context *context, char *const data[],
+                                     npy_intp const dimensions[], npy_intp const strides[],
+                                     void *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *in_ptr = data[0];
+    char *out_ptr = data[1];
+
+    QuadPrecDTypeObject *descr_out = (QuadPrecDTypeObject *)context->descriptors[1];
+    QuadBackendType backend = descr_out->backend;
+    size_t elem_size = (backend == BACKEND_SLEEF) ? sizeof(Sleef_quad) : sizeof(long double);
+
+    while (N--) {
+        T in_val;
+        quad_value out_val;
+
+        memcpy(&in_val, in_ptr, sizeof(T));
+        out_val = to_quad(in_val, backend);
+        memcpy(out_ptr, &out_val, elem_size);
+
+        in_ptr += strides[0];
+        out_ptr += strides[1];
+    }
+    return 0;
+}
+
+template <typename T>
+static int
+numpy_to_quad_strided_loop_aligned(PyArrayMethod_Context *context, char *const data[],
+                                   npy_intp const dimensions[], npy_intp const strides[],
+                                   void *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *in_ptr = data[0];
+    char *out_ptr = data[1];
+
+    QuadPrecDTypeObject *descr_out = (QuadPrecDTypeObject *)context->descriptors[1];
+    QuadBackendType backend = descr_out->backend;
+
+    while (N--) {
+        T in_val = *(T *)in_ptr;
+        quad_value out_val = to_quad(in_val, backend);
+
+        if (backend == BACKEND_SLEEF) {
+            *(Sleef_quad *)(out_ptr) = out_val.sleef_value;
+        }
+        else {
+            *(long double *)(out_ptr) = out_val.longdouble_value;
+        }
+
+        in_ptr += strides[0];
+        out_ptr += strides[1];
+    }
+    return 0;
+}
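`numpy_to_quad_resolve_descriptors` reports `NPY_SAFE_CASTING` because a binary128 significand (113 bits) holds every source value here exactly, including 64-bit integers that a `double` cannot represent; the lone caveat is the `long double` SLEEF path above, which loses bits only because the implementation funnels through `double`, not because binary128 is too small. A standalone check of the exactness claim, using the same SLEEF casts as this file:

```cpp
// Sketch: int64 -> quad -> int64 round-trips exactly, unlike int64 -> double.
#include <sleefquad.h>
#include <cstdio>

int main()
{
    long long big = 9007199254740993LL;  // 2^53 + 1, not representable as double
    Sleef_quad q = Sleef_cast_from_int64q1(big);
    std::printf("exact? %d\n", Sleef_cast_to_int64q1(q) == big);  // prints 1
    return 0;
}
```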
+// Casting from QuadDType to other types
+
+template <typename T>
+static inline T
+from_quad(quad_value x, QuadBackendType backend);
+
+template <>
+inline npy_bool
+from_quad<npy_bool>(quad_value x, QuadBackendType backend)
+{
+    if (backend == BACKEND_SLEEF) {
+        return Sleef_cast_to_int64q1(x.sleef_value) != 0;
+    }
+    else {
+        return x.longdouble_value != 0;
+    }
+}
+
+template <>
+inline npy_byte
+from_quad<npy_byte>(quad_value x, QuadBackendType backend)
+{
+    if (backend == BACKEND_SLEEF) {
+        return (npy_byte)Sleef_cast_to_int64q1(x.sleef_value);
+    }
+    else {
+        return (npy_byte)x.longdouble_value;
+    }
+}
+
+template <>
+inline npy_short
+from_quad<npy_short>(quad_value x, QuadBackendType backend)
+{
+    if (backend == BACKEND_SLEEF) {
+        return (npy_short)Sleef_cast_to_int64q1(x.sleef_value);
+    }
+    else {
+        return (npy_short)x.longdouble_value;
+    }
+}
+
+template <>
+inline npy_ushort
+from_quad<npy_ushort>(quad_value x, QuadBackendType backend)
+{
+    if (backend == BACKEND_SLEEF) {
+        return (npy_ushort)Sleef_cast_to_uint64q1(x.sleef_value);
+    }
+    else {
+        return (npy_ushort)x.longdouble_value;
+    }
+}
+
+template <>
+inline npy_int
+from_quad<npy_int>(quad_value x, QuadBackendType backend)
+{
+    if (backend == BACKEND_SLEEF) {
+        return (npy_int)Sleef_cast_to_int64q1(x.sleef_value);
+    }
+    else {
+        return (npy_int)x.longdouble_value;
+    }
+}
+
+template <>
+inline npy_uint
+from_quad<npy_uint>(quad_value x, QuadBackendType backend)
+{
+    if (backend == BACKEND_SLEEF) {
+        return (npy_uint)Sleef_cast_to_uint64q1(x.sleef_value);
+    }
+    else {
+        return (npy_uint)x.longdouble_value;
+    }
+}
+
+template <>
+inline npy_long
+from_quad<npy_long>(quad_value x, QuadBackendType backend)
+{
+    if (backend == BACKEND_SLEEF) {
+        return (npy_long)Sleef_cast_to_int64q1(x.sleef_value);
+    }
+    else {
+        return (npy_long)x.longdouble_value;
+    }
+}
+
+template <>
+inline npy_ulong
+from_quad<npy_ulong>(quad_value x, QuadBackendType backend)
+{
+    if (backend == BACKEND_SLEEF) {
+        return (npy_ulong)Sleef_cast_to_uint64q1(x.sleef_value);
+    }
+    else {
+        return (npy_ulong)x.longdouble_value;
+    }
+}
+
+template <>
+inline npy_longlong
+from_quad<npy_longlong>(quad_value x, QuadBackendType backend)
+{
+    if (backend == BACKEND_SLEEF) {
+        return Sleef_cast_to_int64q1(x.sleef_value);
+    }
+    else {
+        return (npy_longlong)x.longdouble_value;
+    }
+}
+
+template <>
+inline npy_ulonglong
+from_quad<npy_ulonglong>(quad_value x, QuadBackendType backend)
+{
+    if (backend == BACKEND_SLEEF) {
+        return Sleef_cast_to_uint64q1(x.sleef_value);
+    }
+    else {
+        return (npy_ulonglong)x.longdouble_value;
+    }
+}
+
+template <>
+inline float
+from_quad<float>(quad_value x, QuadBackendType backend)
+{
+    if (backend == BACKEND_SLEEF) {
+        return (float)Sleef_cast_to_doubleq1(x.sleef_value);
+    }
+    else {
+        return (float)x.longdouble_value;
+    }
+}
+
+template <>
+inline double
+from_quad<double>(quad_value x, QuadBackendType backend)
+{
+    if (backend == BACKEND_SLEEF) {
+        return Sleef_cast_to_doubleq1(x.sleef_value);
+    }
+    else {
+        return (double)x.longdouble_value;
+    }
+}
+
+template <>
+inline long double
+from_quad<long double>(quad_value x, QuadBackendType backend)
+{
+    if (backend == BACKEND_SLEEF) {
+        return (long double)Sleef_cast_to_doubleq1(x.sleef_value);
+    }
+    else {
+        return x.longdouble_value;
+    }
+}
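One behavioral wrinkle in the `npy_bool` specialization above: the two backends disagree on fractional input. The SLEEF path converts to `int64` before comparing with zero, so any value strictly between -1 and 1 (other than 0) maps to false, while the longdouble path compares the value directly and maps it to true. A sketch of the discrepancy (same SLEEF calls as this file):

```cpp
// Sketch: from_quad for npy_bool disagrees across backends on 0.5.
#include <sleefquad.h>
#include <cstdio>

int main()
{
    Sleef_quad half = Sleef_cast_from_doubleq1(0.5);
    int sleef_path = Sleef_cast_to_int64q1(half) != 0;  // 0: integer-converts first
    int ldbl_path = (0.5L != 0.0L);                     // 1: compares directly
    std::printf("%d vs %d\n", sleef_path, ldbl_path);   // prints "0 vs 1"
    return 0;
}
```

Comparing against a quad zero on the SLEEF path would make the backends agree.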
+template <typename T>
+static NPY_CASTING
+quad_to_numpy_resolve_descriptors(PyObject *NPY_UNUSED(self), PyArray_DTypeMeta *dtypes[2],
+                                  PyArray_Descr *given_descrs[2], PyArray_Descr *loop_descrs[2],
+                                  npy_intp *view_offset)
+{
+    Py_INCREF(given_descrs[0]);
+    loop_descrs[0] = given_descrs[0];
+
+    loop_descrs[1] = PyArray_GetDefaultDescr(dtypes[1]);
+    return NPY_UNSAFE_CASTING;
+}
+
+template <typename T>
+static int
+quad_to_numpy_strided_loop_unaligned(PyArrayMethod_Context *context, char *const data[],
+                                     npy_intp const dimensions[], npy_intp const strides[],
+                                     void *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *in_ptr = data[0];
+    char *out_ptr = data[1];
+
+    QuadPrecDTypeObject *quad_descr = (QuadPrecDTypeObject *)context->descriptors[0];
+    QuadBackendType backend = quad_descr->backend;
+
+    size_t elem_size = (backend == BACKEND_SLEEF) ? sizeof(Sleef_quad) : sizeof(long double);
+
+    while (N--) {
+        quad_value in_val;
+        memcpy(&in_val, in_ptr, elem_size);
+
+        T out_val = from_quad<T>(in_val, backend);
+        memcpy(out_ptr, &out_val, sizeof(T));
+
+        in_ptr += strides[0];
+        out_ptr += strides[1];
+    }
+    return 0;
+}
+
+template <typename T>
+static int
+quad_to_numpy_strided_loop_aligned(PyArrayMethod_Context *context, char *const data[],
+                                   npy_intp const dimensions[], npy_intp const strides[],
+                                   void *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *in_ptr = data[0];
+    char *out_ptr = data[1];
+
+    QuadPrecDTypeObject *quad_descr = (QuadPrecDTypeObject *)context->descriptors[0];
+    QuadBackendType backend = quad_descr->backend;
+
+    while (N--) {
+        quad_value in_val;
+        if (backend == BACKEND_SLEEF) {
+            in_val.sleef_value = *(Sleef_quad *)in_ptr;
+        }
+        else {
+            in_val.longdouble_value = *(long double *)in_ptr;
+        }
+
+        T out_val = from_quad<T>(in_val, backend);
+        *(T *)(out_ptr) = out_val;
+
+        in_ptr += strides[0];
+        out_ptr += strides[1];
+    }
+    return 0;
+}
+
+static PyArrayMethod_Spec *specs[NUM_CASTS + 1];  // +1 for NULL terminator
+static size_t spec_count = 0;
+
+void
+add_spec(PyArrayMethod_Spec *spec)
+{
+    if (spec_count < NUM_CASTS) {
+        specs[spec_count++] = spec;
+    }
+    else {
+        delete[] spec->dtypes;
+        delete[] spec->slots;
+        delete spec;
+    }
+}
+
+// functions to add casts
+template <typename T>
+void
+add_cast_from(PyArray_DTypeMeta *to)
+{
+    PyArray_DTypeMeta **dtypes = new PyArray_DTypeMeta *[2]{&QuadPrecDType, to};
+
+    PyType_Slot *slots = new PyType_Slot[4]{
+            {NPY_METH_resolve_descriptors, (void *)&quad_to_numpy_resolve_descriptors<T>},
+            {NPY_METH_strided_loop, (void *)&quad_to_numpy_strided_loop_aligned<T>},
+            {NPY_METH_unaligned_strided_loop, (void *)&quad_to_numpy_strided_loop_unaligned<T>},
+            {0, nullptr}};
+
+    PyArrayMethod_Spec *spec = new PyArrayMethod_Spec{
+            .name = "cast_QuadPrec_to_NumPy",
+            .nin = 1,
+            .nout = 1,
+            .casting = NPY_UNSAFE_CASTING,
+            .flags = NPY_METH_SUPPORTS_UNALIGNED,
+            .dtypes = dtypes,
+            .slots = slots,
+    };
+    add_spec(spec);
+}
+
+template <typename T>
+void
+add_cast_to(PyArray_DTypeMeta *from)
+{
+    PyArray_DTypeMeta **dtypes = new PyArray_DTypeMeta *[2]{from, &QuadPrecDType};
+
+    PyType_Slot *slots = new PyType_Slot[4]{
+            {NPY_METH_resolve_descriptors, (void *)&numpy_to_quad_resolve_descriptors<T>},
+            {NPY_METH_strided_loop, (void *)&numpy_to_quad_strided_loop_aligned<T>},
+            {NPY_METH_unaligned_strided_loop, (void *)&numpy_to_quad_strided_loop_unaligned<T>},
+            {0, nullptr}};
+
+    PyArrayMethod_Spec *spec = new PyArrayMethod_Spec{
+            .name = "cast_NumPy_to_QuadPrec",
+            .nin = 1,
+            .nout = 1,
+            .casting = NPY_SAFE_CASTING,
+            .flags = NPY_METH_SUPPORTS_UNALIGNED,
+            .dtypes = dtypes,
+            .slots = slots,
+    };
+
+    add_spec(spec);
+}
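`add_cast_to`/`add_cast_from` heap-allocate one `PyArrayMethod_Spec` per cast and park it in the NULL-terminated `specs` array that `init_casts()` below returns. The consumer is not part of this diff; with NumPy's public 2.x dtype API the array is normally handed over when the dtype itself is initialized, roughly as sketched here (hedged: `QuadPrecision_Type` and `quad_dt_slots` are assumed names, and the real wiring lives in dtype.c):

```cpp
// Hedged sketch of the consuming side, not shown in this diff.
PyArrayMethod_Spec **casts = init_casts();
if (casts == nullptr) {
    return -1;  // init_casts() has already set a Python error
}

PyArrayDTypeMeta_Spec spec = {
        .typeobj = &QuadPrecision_Type,  // scalar type (assumed name)
        .flags = NPY_DT_NUMERIC,
        .casts = casts,                  // every spec built by this file
        .slots = quad_dt_slots,          // dtype slot table (assumed name)
};
if (PyArrayInitDTypeMeta_FromSpec(&QuadPrecDType, &spec) < 0) {
    return -1;
}
free_casts();  // assuming NumPy copied what it needs during init
```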
+PyArrayMethod_Spec **
+init_casts_internal(void)
+{
+    PyArray_DTypeMeta **quad2quad_dtypes = new PyArray_DTypeMeta *[2]{nullptr, nullptr};
+    PyType_Slot *quad2quad_slots = new PyType_Slot[4]{
+            {NPY_METH_resolve_descriptors, (void *)&quad_to_quad_resolve_descriptors},
+            {NPY_METH_strided_loop, (void *)&quad_to_quad_strided_loop_aligned},
+            {NPY_METH_unaligned_strided_loop, (void *)&quad_to_quad_strided_loop_unaligned},
+            {0, nullptr}};
+
+    PyArrayMethod_Spec *quad2quad_spec = new PyArrayMethod_Spec{
+            .name = "cast_QuadPrec_to_QuadPrec",
+            .nin = 1,
+            .nout = 1,
+            .casting = NPY_UNSAFE_CASTING,  // since SLEEF -> ld might lose precision
+            .flags = NPY_METH_SUPPORTS_UNALIGNED,
+            .dtypes = quad2quad_dtypes,
+            .slots = quad2quad_slots,
+    };
+
+    add_spec(quad2quad_spec);
+
+    add_cast_to<npy_bool>(&PyArray_BoolDType);
+    add_cast_to<npy_byte>(&PyArray_ByteDType);
+    add_cast_to<npy_short>(&PyArray_ShortDType);
+    add_cast_to<npy_ushort>(&PyArray_UShortDType);
+    add_cast_to<npy_int>(&PyArray_IntDType);
+    add_cast_to<npy_uint>(&PyArray_UIntDType);
+    add_cast_to<npy_long>(&PyArray_LongDType);
+    add_cast_to<npy_ulong>(&PyArray_ULongDType);
+    add_cast_to<npy_longlong>(&PyArray_LongLongDType);
+    add_cast_to<npy_ulonglong>(&PyArray_ULongLongDType);
+    add_cast_to<float>(&PyArray_FloatDType);
+    add_cast_to<double>(&PyArray_DoubleDType);
+    add_cast_to<long double>(&PyArray_LongDoubleDType);
+
+    add_cast_from<npy_bool>(&PyArray_BoolDType);
+    add_cast_from<npy_byte>(&PyArray_ByteDType);
+    add_cast_from<npy_short>(&PyArray_ShortDType);
+    add_cast_from<npy_ushort>(&PyArray_UShortDType);
+    add_cast_from<npy_int>(&PyArray_IntDType);
+    add_cast_from<npy_uint>(&PyArray_UIntDType);
+    add_cast_from<npy_long>(&PyArray_LongDType);
+    add_cast_from<npy_ulong>(&PyArray_ULongDType);
+    add_cast_from<npy_longlong>(&PyArray_LongLongDType);
+    add_cast_from<npy_ulonglong>(&PyArray_ULongLongDType);
+    add_cast_from<float>(&PyArray_FloatDType);
+    add_cast_from<double>(&PyArray_DoubleDType);
+    add_cast_from<long double>(&PyArray_LongDoubleDType);
+
+    specs[spec_count] = nullptr;
+    return specs;
+}
+
+PyArrayMethod_Spec **
+init_casts(void)
+{
+    try {
+        return init_casts_internal();
+    }
+    catch (const std::bad_alloc &) {
+        PyErr_NoMemory();
+        return nullptr;
+    }
+}
+
+void
+free_casts(void)
+{
+    for (size_t i = 0; i < spec_count; i++) {
+        if (specs[i]) {
+            delete[] specs[i]->dtypes;
+            delete[] specs[i]->slots;
+            delete specs[i];
+            specs[i] = nullptr;
+        }
+    }
+    spec_count = 0;
+}
\ No newline at end of file
diff --git a/quaddtype/quaddtype/src/casts.h b/quaddtype/numpy_quaddtype/src/casts.h
similarity index 100%
rename from quaddtype/quaddtype/src/casts.h
rename to quaddtype/numpy_quaddtype/src/casts.h
diff --git a/quaddtype/numpy_quaddtype/src/dragon4.c b/quaddtype/numpy_quaddtype/src/dragon4.c
new file mode 100644
index 00000000..34ad4cbb
--- /dev/null
+++ b/quaddtype/numpy_quaddtype/src/dragon4.c
@@ -0,0 +1,2024 @@
+/*
+This code was extracted from NumPy; the original author is Allan Haldane (@ahaldane).
+Modifications are specific to supporting the SLEEF_QUAD type.
+*/
+
+#include <Python.h>
+#include <assert.h>
+#include <float.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define PY_ARRAY_UNIQUE_SYMBOL QuadPrecType_ARRAY_API
+#define PY_UFUNC_UNIQUE_SYMBOL QuadPrecType_UFUNC_API
+#define NPY_NO_DEPRECATED_API NPY_2_0_API_VERSION
+#define NPY_TARGET_VERSION NPY_2_0_API_VERSION
+#define NO_IMPORT_ARRAY
+#define NO_IMPORT_UFUNC
+
+#include "dragon4.h"
+#include "dtype.h"
+#include "scalar.h"
+
+#if 0
+#define DEBUG_ASSERT(stmnt) assert(stmnt)
+#else
+#define DEBUG_ASSERT(stmnt) \
+    do {                    \
+    } while (0)
+#endif
+
+#define c_BigInt_MaxBlocks 1023
+#define BIGINT_DRAGON4_GROUPSIZE 7
+
+typedef struct BigInt {
+    npy_uint32 length;
+    npy_uint32 blocks[c_BigInt_MaxBlocks];
+} BigInt;
+
+typedef struct {
+    BigInt bigints[BIGINT_DRAGON4_GROUPSIZE];
+    char repr[16384];
+} Dragon4_Scratch;
+
+static NPY_TLS Dragon4_Scratch _bigint_static;
+
+static inline npy_uint64
+bitmask_u64(npy_uint32 n)
+{
+    return ~(~((npy_uint64)0) << n);
+}
+
+static inline npy_uint32
+bitmask_u32(npy_uint32 n)
+{
+    return ~(~((npy_uint32)0) << n);
+}
+
+/* result = result * 10 */
+static void
+BigInt_Multiply10(BigInt *result)
+{
+    /* multiply all the blocks */
+    npy_uint64 carry = 0;
+
+    npy_uint32 *cur = result->blocks;
+    npy_uint32 *end = result->blocks + result->length;
+    for (; cur != end; ++cur) {
+        npy_uint64 product = (npy_uint64)(*cur) * 10ull + carry;
+        (*cur) = (npy_uint32)(product & bitmask_u64(32));
+        carry = product >> 32;
+    }
+
+    if (carry != 0) {
+        /* grow the array */
+        DEBUG_ASSERT(result->length + 1 <= c_BigInt_MaxBlocks);
+        *cur = (npy_uint32)carry;
+        ++result->length;
+    }
+} + +static npy_uint32 g_PowerOf10_U32[] = { + 1, /* 10 ^ 0 */ + 10, /* 10 ^ 1 */ + 100, /* 10 ^ 2 */ + 1000, /* 10 ^ 3 */ + 10000, /* 10 ^ 4 */ + 100000, /* 10 ^ 5 */ + 1000000, /* 10 ^ 6 */ + 10000000, /* 10 ^ 7 */ +}; + +/* + * Note: This has a lot of wasted space in the big integer structures of the + * early table entries. It wouldn't be terribly hard to make the multiply + * function work on integer pointers with an array length instead of + * the BigInt struct which would allow us to store a minimal amount of + * data here. + */ +static BigInt g_PowerOf10_Big[] = { + /* 10 ^ 8 */ + {1, {100000000}}, + /* 10 ^ 16 */ + {2, {0x6fc10000, 0x002386f2}}, + /* 10 ^ 32 */ + {4, + { + 0x00000000, + 0x85acef81, + 0x2d6d415b, + 0x000004ee, + }}, + /* 10 ^ 64 */ + {7, + { + 0x00000000, + 0x00000000, + 0xbf6a1f01, + 0x6e38ed64, + 0xdaa797ed, + 0xe93ff9f4, + 0x00184f03, + }}, + /* 10 ^ 128 */ + {14, + { + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + 0x2e953e01, + 0x03df9909, + 0x0f1538fd, + 0x2374e42f, + 0xd3cff5ec, + 0xc404dc08, + 0xbccdb0da, + 0xa6337f19, + 0xe91f2603, + 0x0000024e, + }}, + /* 10 ^ 256 */ + {27, + { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x982e7c01, 0xbed3875b, 0xd8d99f72, 0x12152f87, 0x6bde50c6, 0xcf4a6e70, + 0xd595d80f, 0x26b2716e, 0xadc666b0, 0x1d153624, 0x3c42d35a, 0x63ff540e, 0xcc5573c0, + 0x65f9ef17, 0x55bc28f2, 0x80dcc7f7, 0xf46eeddc, 0x5fdcefce, 0x000553f7, + }}, + /* 10 ^ 512 */ + {54, + { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0xfc6cf801, 0x77f27267, 0x8f9546dc, 0x5d96976f, 0xb83a8a97, + 0xc31e1ad9, 0x46c40513, 0x94e65747, 0xc88976c1, 0x4475b579, 0x28f8733b, 0xaa1da1bf, + 0x703ed321, 0x1e25cfea, 0xb21a2f22, 0xbc51fb2e, 0x96e14f5d, 0xbfa3edac, 0x329c57ae, + 0xe7fc7153, 0xc3fc0695, 0x85a91924, 0xf95f635e, 0xb2908ee0, 0x93abade4, 0x1366732a, + 0x9449775c, 0x69be5b0e, 0x7343afac, 0xb099bc81, 0x45a71d46, 0xa2699748, 0x8cb07303, + 0x8a0b1f13, 0x8cab8a97, 0xc1d238d9, 0x633415d4, 0x0000001c, + }}, + /* 10 ^ 1024 */ + {107, + { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x2919f001, 0xf55b2b72, 0x6e7c215b, + 0x1ec29f86, 0x991c4e87, 0x15c51a88, 0x140ac535, 0x4c7d1e1a, 0xcc2cd819, 0x0ed1440e, + 0x896634ee, 0x7de16cfb, 0x1e43f61f, 0x9fce837d, 0x231d2b9c, 0x233e55c7, 0x65dc60d7, + 0xf451218b, 0x1c5cd134, 0xc9635986, 0x922bbb9f, 0xa7e89431, 0x9f9f2a07, 0x62be695a, + 0x8e1042c4, 0x045b7a74, 0x1abe1de3, 0x8ad822a5, 0xba34c411, 0xd814b505, 0xbf3fdeb3, + 0x8fc51a16, 0xb1b896bc, 0xf56deeec, 0x31fb6bfd, 0xb6f4654b, 0x101a3616, 0x6b7595fb, + 0xdc1a47fe, 0x80d98089, 0x80bda5a5, 0x9a202882, 0x31eb0f66, 0xfc8f1f90, 0x976a3310, + 0xe26a7b7e, 0xdf68368a, 0x3ce3a0b8, 0x8e4262ce, 0x75a351a2, 0x6cb0b6c9, 0x44597583, + 0x31b5653f, 0xc356e38a, 0x35faaba6, 0x0190fba0, 0x9fc4ed52, 0x88bc491b, 0x1640114a, + 0x005b8041, 0xf4f3235e, 0x1e8d4649, 0x36a8de06, 0x73c55349, 0xa7e6bd2a, 0xc1a6970c, + 0x47187094, 0xd2db49ef, 0x926c3f5b, 0xae6209d4, 0x2d433949, 0x34f4a3c6, 0xd4305d94, + 0xd9d61a05, 0x00000325, + }}, + /* 10 ^ 2048 */ + {213, 
+ { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x1333e001, 0xe3096865, 0xb27d4d3f, 0x49e28dcf, 0xec2e4721, 0xee87e354, + 0xb6067584, 0x368b8abb, 0xa5e5a191, 0x2ed56d55, 0xfd827773, 0xea50d142, 0x51b78db2, + 0x98342c9e, 0xc850dabc, 0x866ed6f1, 0x19342c12, 0x92794987, 0xd2f869c2, 0x66912e4a, + 0x71c7fd8f, 0x57a7842d, 0x235552eb, 0xfb7fedcc, 0xf3861ce0, 0x38209ce1, 0x9713b449, + 0x34c10134, 0x8c6c54de, 0xa7a8289c, 0x2dbb6643, 0xe3cb64f3, 0x8074ff01, 0xe3892ee9, + 0x10c17f94, 0xa8f16f92, 0xa8281ed6, 0x967abbb3, 0x5a151440, 0x9952fbed, 0x13b41e44, + 0xafe609c3, 0xa2bca416, 0xf111821f, 0xfb1264b4, 0x91bac974, 0xd6c7d6ab, 0x8e48ff35, + 0x4419bd43, 0xc4a65665, 0x685e5510, 0x33554c36, 0xab498697, 0x0dbd21fe, 0x3cfe491d, + 0x982da466, 0xcbea4ca7, 0x9e110c7b, 0x79c56b8a, 0x5fc5a047, 0x84d80e2e, 0x1aa9f444, + 0x730f203c, 0x6a57b1ab, 0xd752f7a6, 0x87a7dc62, 0x944545ff, 0x40660460, 0x77c1a42f, + 0xc9ac375d, 0xe866d7ef, 0x744695f0, 0x81428c85, 0xa1fc6b96, 0xd7917c7b, 0x7bf03c19, + 0x5b33eb41, 0x5715f791, 0x8f6cae5f, 0xdb0708fd, 0xb125ac8e, 0x785ce6b7, 0x56c6815b, + 0x6f46eadb, 0x4eeebeee, 0x195355d8, 0xa244de3c, 0x9d7389c0, 0x53761abd, 0xcf99d019, + 0xde9ec24b, 0x0d76ce39, 0x70beb181, 0x2e55ecee, 0xd5f86079, 0xf56d9d4b, 0xfb8886fb, + 0x13ef5a83, 0x408f43c5, 0x3f3389a4, 0xfad37943, 0x58ccf45c, 0xf82df846, 0x415c7f3e, + 0x2915e818, 0x8b3d5cf4, 0x6a445f27, 0xf8dbb57a, 0xca8f0070, 0x8ad803ec, 0xb2e87c34, + 0x038f9245, 0xbedd8a6c, 0xc7c9dee0, 0x0eac7d56, 0x2ad3fa14, 0xe0de0840, 0xf775677c, + 0xf1bd0ad5, 0x92be221e, 0x87fa1fb9, 0xce9d04a4, 0xd2c36fa9, 0x3f6f7024, 0xb028af62, + 0x907855ee, 0xd83e49d6, 0x4efac5dc, 0xe7151aab, 0x77cd8c6b, 0x0a753b7d, 0x0af908b4, + 0x8c983623, 0xe50f3027, 0x94222771, 0x1d08e2d6, 0xf7e928e6, 0xf2ee5ca6, 0x1b61b93c, + 0x11eb962b, 0x9648b21c, 0xce2bcba1, 0x34f77154, 0x7bbebe30, 0xe526a319, 0x8ce329ac, + 0xde4a74d2, 0xb5dc53d5, 0x0009e8b3, + }}, + /* 10 ^ 4096 */ + {426, + { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x2a67c001, 0xd4724e8d, 0x8efe7ae7, 0xf89a1e90, 0xef084117, + 0x54e05154, 0x13b1bb51, 0x506be829, 0xfb29b172, 0xe599574e, 0xf0da6146, 0x806c0ed3, + 0xb86ae5be, 0x45155e93, 0xc0591cc2, 0x7e1e7c34, 0x7c4823da, 0x1d1f4cce, 0x9b8ba1e8, + 0xd6bfdf75, 0xe341be10, 0xc2dfae78, 0x016b67b2, 0x0f237f1a, 0x3dbeabcd, 0xaf6a2574, + 0xcab3e6d7, 0x142e0e80, 0x61959127, 0x2c234811, 0x87009701, 0xcb4bf982, 0xf8169c84, + 0x88052f8c, 0x68dde6d4, 0xbc131761, 0xff0b0905, 0x54ab9c41, 0x7613b224, 0x1a1c304e, + 0x3bfe167b, 0x441c2d47, 0x4f6cea9c, 0x78f06181, 0xeb659fb8, 0x30c7ae41, 0x947e0d0e, + 0xa1ebcad7, 0xd97d9556, 0x2130504d, 0x1a8309cb, 0xf2acd507, 0x3f8ec72a, 0xfd82373a, + 0x95a842bc, 0x280f4d32, 0xf3618ac0, 0x811a4f04, 0x6dc3a5b4, 0xd3967a1b, 0x15b8c898, + 0xdcfe388f, 0x454eb2a0, 0x8738b909, 0x10c4e996, 0x2bd9cc11, 0x3297cd0c, 0x655fec30, + 0xae0725b1, 0xf4090ee8, 0x037d19ee, 0x398c6fed, 0x3b9af26b, 0xc994a450, 0xb5341743, + 0x75a697b2, 0xac50b9c1, 0x3ccb5b92, 0xffe06205, 0xa8329761, 0xdfea5242, 0xeb83cadb, + 0xe79dadf7, 0x3c20ee69, 0x1e0a6817, 0x7021b97a, 0x743074fa, 0x176ca776, 0x77fb8af6, + 0xeca19beb, 0x92baf1de, 0xaf63b712, 0xde35c88b, 0xa4eb8f8c, 0xe137d5e9, 0x40b464a0, + 0x87d1cde8, 0x42923bbd, 0xcd8f62ff, 0x2e2690f3, 0x095edc16, 0x59c89f1b, 0x1fa8fd5d, + 0x5138753d, 0x390a2b29, 0x80152f18, 0x2dd8d925, 0xf984d83e, 0x7a872e74, 0xc19e1faf, + 0xed4d542d, 0xecf9b5d0, 0x9462ea75, 0xc53c0adf, 0x0caea134, 0x37a2d439, 0xc8fa2e8a, + 0x2181327e, 0x6e7bb827, 0x2d240820, 0x50be10e0, 0x5893d4b8, 0xab312bb9, 0x1f2b2322, + 0x440b3f25, 0xbf627ede, 0x72dac789, 0xb608b895, 0x78787e2a, 0x86deb3f0, 0x6fee7aab, + 0xbb9373f4, 0x27ecf57b, 0xf7d8b57e, 0xfca26a9f, 0x3d04e8d2, 0xc9df13cb, 0x3172826a, + 0xcd9e8d7c, 0xa8fcd8e0, 0xb2c39497, 0x307641d9, 0x1cc939c1, 0x2608c4cf, 0xb6d1c7bf, + 0x3d326a7e, 0xeeaf19e6, 0x8e13e25f, 0xee63302b, 0x2dfe6d97, 0x25971d58, 0xe41d3cc4, + 0x0a80627c, 0xab8db59a, 0x9eea37c8, 0xe90afb77, 0x90ca19cf, 0x9ee3352c, 0x3613c850, + 0xfe78d682, 0x788f6e50, 0x5b060904, 0xb71bd1a4, 0x3fecb534, 0xb32c450c, 0x20c33857, + 0xa6e9cfda, 0x0239f4ce, 0x48497187, 0xa19adb95, 0xb492ed8a, 0x95aca6a8, 0x4dcd6cd9, + 0xcf1b2350, 0xfbe8b12a, 0x1a67778c, 0x38eb3acc, 0xc32da383, 0xfb126ab1, 0xa03f40a8, + 0xed5bf546, 0xe9ce4724, 0x4c4a74fd, 0x73a130d8, 0xd9960e2d, 0xa2ebd6c1, 0x94ab6feb, + 0x6f233b7c, 0x49126080, 0x8e7b9a73, 0x4b8c9091, 0xd298f999, 0x35e836b5, 0xa96ddeff, + 0x96119b31, 0x6b0dd9bc, 0xc6cc3f8d, 0x282566fb, 0x72b882e7, 0xd6769f3b, 0xa674343d, + 0x00fc509b, 0xdcbf7789, 0xd6266a3f, 0xae9641fd, 0x4e89541b, 0x11953407, 0x53400d03, + 0x8e0dd75a, 0xe5b53345, 0x108f19ad, 0x108b89bc, 0x41a4c954, 0xe03b2b63, 0x437b3d7f, + 0x97aced8e, 0xcbd66670, 0x2c5508c2, 0x650ebc69, 0x5c4f2ef0, 0x904ff6bf, 0x9985a2df, + 0x9faddd9e, 0x5ed8d239, 0x25585832, 0xe3e51cb9, 0x0ff4f1d4, 0x56c02d9a, 0x8c4ef804, + 0xc1a08a13, 0x13fd01c8, 0xe6d27671, 0xa7c234f4, 
0x9d0176cc, 0xd0d73df2, 0x4d8bfa89, + 0x544f10cd, 0x2b17e0b2, 0xb70a5c7d, 0xfd86fe49, 0xdf373f41, 0x214495bb, 0x84e857fd, + 0x00d313d5, 0x0496fcbe, 0xa4ba4744, 0xe8cac982, 0xaec29e6e, 0x87ec7038, 0x7000a519, + 0xaeee333b, 0xff66e42c, 0x8afd6b25, 0x03b4f63b, 0xbd7991dc, 0x5ab8d9c7, 0x2ed4684e, + 0x48741a6c, 0xaf06940d, 0x2fdc6349, 0xb03d7ecd, 0xe974996f, 0xac7867f9, 0x52ec8721, + 0xbcdd9d4a, 0x8edd2d00, 0x3557de06, 0x41c759f8, 0x3956d4b9, 0xa75409f2, 0x123cd8a1, + 0xb6100fab, 0x3e7b21e2, 0x2e8d623b, 0x92959da2, 0xbca35f77, 0x200c03a5, 0x35fcb457, + 0x1bb6c6e4, 0xf74eb928, 0x3d5d0b54, 0x87cc1d21, 0x4964046f, 0x18ae4240, 0xd868b275, + 0x8bd2b496, 0x1c5563f4, 0xc234d8f5, 0xf868e970, 0xf9151fff, 0xae7be4a2, 0x271133ee, + 0xbb0fd922, 0x25254932, 0xa60a9fc0, 0x104bcd64, 0x30290145, 0x00000062, + }}, +}; + +static int +BigInt_IsZero(const BigInt *i) +{ + return i->length == 0; +} + +/* + * Returns 1 if the value is even + */ +static int +BigInt_IsEven(const BigInt *i) +{ + return (i->length == 0) || ((i->blocks[0] % 2) == 0); +} + +static void +BigInt_Copy(BigInt *dst, const BigInt *src) +{ + npy_uint32 length = src->length; + npy_uint32 *dstp = dst->blocks; + const npy_uint32 *srcp; + for (srcp = src->blocks; srcp != src->blocks + length; ++dstp, ++srcp) { + *dstp = *srcp; + } + dst->length = length; +} + +/* result = result << shift */ +static void +BigInt_ShiftLeft(BigInt *result, npy_uint32 shift) +{ + npy_uint32 shiftBlocks = shift / 32; + npy_uint32 shiftBits = shift % 32; + + /* process blocks high to low so that we can safely process in place */ + const npy_uint32 *pInBlocks = result->blocks; + npy_int32 inLength = result->length; + npy_uint32 *pInCur, *pOutCur; + + DEBUG_ASSERT(inLength + shiftBlocks < c_BigInt_MaxBlocks); + DEBUG_ASSERT(shift != 0); + + /* check if the shift is block aligned */ + if (shiftBits == 0) { + npy_uint32 i; + + /* copy blocks from high to low */ + for (pInCur = result->blocks + result->length, pOutCur = pInCur + shiftBlocks; + pInCur >= pInBlocks; --pInCur, --pOutCur) { + *pOutCur = *pInCur; + } + + /* zero the remaining low blocks */ + for (i = 0; i < shiftBlocks; ++i) { + result->blocks[i] = 0; + } + + result->length += shiftBlocks; + } + /* else we need to shift partial blocks */ + else { + npy_uint32 i; + npy_int32 inBlockIdx = inLength - 1; + npy_uint32 outBlockIdx = inLength + shiftBlocks; + + /* output the initial blocks */ + const npy_uint32 lowBitsShift = (32 - shiftBits); + npy_uint32 highBits = 0; + npy_uint32 block = result->blocks[inBlockIdx]; + npy_uint32 lowBits = block >> lowBitsShift; + + /* set the length to hold the shifted blocks */ + DEBUG_ASSERT(outBlockIdx < c_BigInt_MaxBlocks); + result->length = outBlockIdx + 1; + + while (inBlockIdx > 0) { + result->blocks[outBlockIdx] = highBits | lowBits; + highBits = block << shiftBits; + + --inBlockIdx; + --outBlockIdx; + + block = result->blocks[inBlockIdx]; + lowBits = block >> lowBitsShift; + } + + /* output the final blocks */ + DEBUG_ASSERT(outBlockIdx == shiftBlocks + 1); + result->blocks[outBlockIdx] = highBits | lowBits; + result->blocks[outBlockIdx - 1] = block << shiftBits; + + /* zero the remaining low blocks */ + for (i = 0; i < shiftBlocks; ++i) { + result->blocks[i] = 0; + } + + /* check if the terminating block has no set bits */ + if (result->blocks[result->length - 1] == 0) { + --result->length; + } + } +} + +static void +BigInt_Set_uint32(BigInt *i, npy_uint32 val) +{ + if (val != 0) { + i->blocks[0] = val; + i->length = 1; + } + else { + i->length = 0; + } +} + +/* result = 
2^exponent */ +static inline void +BigInt_Pow2(BigInt *result, npy_uint32 exponent) +{ + npy_uint32 bitIdx; + npy_uint32 blockIdx = exponent / 32; + npy_uint32 i; + + DEBUG_ASSERT(blockIdx < c_BigInt_MaxBlocks); + + for (i = 0; i <= blockIdx; ++i) { + result->blocks[i] = 0; + } + + result->length = blockIdx + 1; + + bitIdx = (exponent % 32); + result->blocks[blockIdx] |= ((npy_uint32)1 << bitIdx); +} + +static void +BigInt_Set_2x_uint64(BigInt *i, npy_uint64 hi, npy_uint64 lo) +{ + if (hi > bitmask_u64(32)) { + i->length = 4; + } + else if (hi != 0) { + i->length = 3; + } + else if (lo > bitmask_u64(32)) { + i->length = 2; + } + else if (lo != 0) { + i->length = 1; + } + else { + i->length = 0; + } + + /* Note deliberate fallthrough in this switch */ + switch (i->length) { + case 4: + i->blocks[3] = (hi >> 32) & bitmask_u64(32); + case 3: + i->blocks[2] = hi & bitmask_u64(32); + case 2: + i->blocks[1] = (lo >> 32) & bitmask_u64(32); + case 1: + i->blocks[0] = lo & bitmask_u64(32); + } +} + +/* result = lhs * rhs */ +static void +BigInt_Multiply_int(BigInt *result, const BigInt *lhs, npy_uint32 rhs) +{ + /* perform long multiplication */ + npy_uint32 carry = 0; + npy_uint32 *resultCur = result->blocks; + const npy_uint32 *pLhsCur = lhs->blocks; + const npy_uint32 *pLhsEnd = lhs->blocks + lhs->length; + for (; pLhsCur != pLhsEnd; ++pLhsCur, ++resultCur) { + npy_uint64 product = (npy_uint64)(*pLhsCur) * rhs + carry; + *resultCur = (npy_uint32)(product & bitmask_u64(32)); + carry = product >> 32; + } + + /* if there is a remaining carry, grow the array */ + if (carry != 0) { + /* grow the array */ + DEBUG_ASSERT(lhs->length + 1 <= c_BigInt_MaxBlocks); + *resultCur = (npy_uint32)carry; + result->length = lhs->length + 1; + } + else { + result->length = lhs->length; + } +} + +/* + * result = lhs * rhs + */ +static void +BigInt_Multiply(BigInt *result, const BigInt *lhs, const BigInt *rhs) +{ + const BigInt *large; + const BigInt *small; + npy_uint32 maxResultLen; + npy_uint32 *cur, *end, *resultStart; + const npy_uint32 *smallCur; + + DEBUG_ASSERT(result != lhs && result != rhs); + + /* determine which operand has the smaller length */ + if (lhs->length < rhs->length) { + small = lhs; + large = rhs; + } + else { + small = rhs; + large = lhs; + } + + /* set the maximum possible result length */ + maxResultLen = large->length + small->length; + DEBUG_ASSERT(maxResultLen <= c_BigInt_MaxBlocks); + + /* clear the result data */ + for (cur = result->blocks, end = cur + maxResultLen; cur != end; ++cur) { + *cur = 0; + } + + /* perform standard long multiplication for each small block */ + resultStart = result->blocks; + for (smallCur = small->blocks; smallCur != small->blocks + small->length; + ++smallCur, ++resultStart) { + /* + * if non-zero, multiply against all the large blocks and add into the + * result + */ + const npy_uint32 multiplier = *smallCur; + if (multiplier != 0) { + const npy_uint32 *largeCur = large->blocks; + npy_uint32 *resultCur = resultStart; + npy_uint64 carry = 0; + do { + npy_uint64 product = (*resultCur) + (*largeCur) * (npy_uint64)multiplier + carry; + carry = product >> 32; + *resultCur = product & bitmask_u64(32); + ++largeCur; + ++resultCur; + } while (largeCur != large->blocks + large->length); + + DEBUG_ASSERT(resultCur < result->blocks + maxResultLen); + *resultCur = (npy_uint32)(carry & bitmask_u64(32)); + } + } + + /* check if the terminating block has no set bits */ + if (maxResultLen > 0 && result->blocks[maxResultLen - 1] == 0) { + result->length = maxResultLen - 
1;
+    }
+    else {
+        result->length = maxResultLen;
+    }
+}
+
+/* in = in * 10^exponent */
+static void
+BigInt_MultiplyPow10(BigInt *in, npy_uint32 exponent, BigInt *temp)
+{
+    /* use two temporary values to reduce large integer copy operations */
+    BigInt *curTemp, *pNextTemp;
+    npy_uint32 smallExponent;
+    npy_uint32 tableIdx = 0;
+
+    /* make sure the exponent is within the bounds of the lookup table data */
+    DEBUG_ASSERT(exponent < 8192);
+
+    /*
+     * initialize the result by looking up a 32-bit power of 10 corresponding to
+     * the first 3 bits
+     */
+    smallExponent = exponent & bitmask_u32(3);
+    if (smallExponent != 0) {
+        BigInt_Multiply_int(temp, in, g_PowerOf10_U32[smallExponent]);
+        curTemp = temp;
+        pNextTemp = in;
+    }
+    else {
+        curTemp = in;
+        pNextTemp = temp;
+    }
+
+    /* remove the low bits that we used for the 32-bit lookup table */
+    exponent >>= 3;
+
+    /* while there are remaining bits in the exponent to be processed */
+    while (exponent != 0) {
+        /* if the current bit is set, multiply by this power of 10 */
+        if (exponent & 1) {
+            BigInt *pSwap;
+
+            /* multiply into the next temporary */
+            BigInt_Multiply(pNextTemp, curTemp, &g_PowerOf10_Big[tableIdx]);
+
+            /* swap to the next temporary */
+            pSwap = curTemp;
+            curTemp = pNextTemp;
+            pNextTemp = pSwap;
+        }
+
+        /* advance to the next bit */
+        ++tableIdx;
+        exponent >>= 1;
+    }
+
+    /* output the result */
+    if (curTemp != in) {
+        BigInt_Copy(in, curTemp);
+    }
+}
+
+/* result = 10^exponent */
+static void
+BigInt_Pow10(BigInt *result, npy_uint32 exponent, BigInt *temp)
+{
+    /* use two temporary values to reduce large integer copy operations */
+    BigInt *curTemp = result;
+    BigInt *pNextTemp = temp;
+    npy_uint32 smallExponent;
+    npy_uint32 tableIdx = 0;
+
+    /* make sure the exponent is within the bounds of the lookup table data */
+    DEBUG_ASSERT(exponent < 8192);
+
+    /*
+     * initialize the result by looking up a 32-bit power of 10 corresponding to
+     * the first 3 bits
+     */
+    smallExponent = exponent & bitmask_u32(3);
+    BigInt_Set_uint32(curTemp, g_PowerOf10_U32[smallExponent]);
+
+    /* remove the low bits that we used for the 32-bit lookup table */
+    exponent >>= 3;
+
+    /* while there are remaining bits in the exponent to be processed */
+    while (exponent != 0) {
+        /* if the current bit is set, multiply by this power of 10 */
+        if (exponent & 1) {
+            BigInt *pSwap;
+
+            /* multiply into the next temporary */
+            BigInt_Multiply(pNextTemp, curTemp, &g_PowerOf10_Big[tableIdx]);
+
+            /* swap to the next temporary */
+            pSwap = curTemp;
+            curTemp = pNextTemp;
+            pNextTemp = pSwap;
+        }
+
+        /* advance to the next bit */
+        ++tableIdx;
+        exponent >>= 1;
+    }
+
+    /* output the result */
+    if (curTemp != result) {
+        BigInt_Copy(result, curTemp);
+    }
+}
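`BigInt_MultiplyPow10` and `BigInt_Pow10` above assemble 10^e in O(log e) big-integer multiplications: the low three bits of the exponent are served by the 32-bit table `g_PowerOf10_U32` (10^0..10^7), and each remaining set bit k selects the precomputed entry `g_PowerOf10_Big[k]` = 10^(2^(k+3)), i.e. 10^8, 10^16, 10^32, and so on. The same scheme in miniature, with `double` standing in for the BigInt so it runs standalone:

```cpp
// Sketch of the table-driven square-and-multiply used by BigInt_MultiplyPow10.
#include <cstdint>
#include <cstdio>

static const double pow10_small[8] = {1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7};

static double pow10_demo(uint32_t e)
{
    double result = pow10_small[e & 7];  // low 3 bits: direct table lookup
    double big = 1e8;                    // 10^(2^3), the first "big" entry
    for (e >>= 3; e != 0; e >>= 1) {
        if (e & 1) {
            result *= big;  // fold in 10^(2^(k+3)) when bit k is set
        }
        big *= big;  // advancing the table index doubles the exponent
    }
    return result;
}

int main()
{
    std::printf("%g\n", pow10_demo(19));  // prints 1e+19
    return 0;
}
```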
+/* result = lhs + rhs */
+static void
+BigInt_Add(BigInt *result, const BigInt *lhs, const BigInt *rhs)
+{
+    /* determine which operand has the smaller length */
+    const BigInt *large, *small;
+    npy_uint64 carry = 0;
+    const npy_uint32 *largeCur, *smallCur, *largeEnd, *smallEnd;
+    npy_uint32 *resultCur;
+
+    if (lhs->length < rhs->length) {
+        small = lhs;
+        large = rhs;
+    }
+    else {
+        small = rhs;
+        large = lhs;
+    }
+
+    /* The output will be at least as long as the largest input */
+    result->length = large->length;
+
+    /* Add each block and carry the overflow to the next block */
+    largeCur = large->blocks;
+    largeEnd = largeCur + large->length;
+    smallCur = small->blocks;
+    smallEnd = smallCur + small->length;
+    resultCur = result->blocks;
+    while (smallCur != smallEnd) {
+        npy_uint64 sum = carry + (npy_uint64)(*largeCur) + (npy_uint64)(*smallCur);
+        carry = sum >> 32;
+        *resultCur = sum & bitmask_u64(32);
+        ++largeCur;
+        ++smallCur;
+        ++resultCur;
+    }
+
+    /* Add the carry to any blocks that only exist in the large operand */
+    while (largeCur != largeEnd) {
+        npy_uint64 sum = carry + (npy_uint64)(*largeCur);
+        carry = sum >> 32;
+        (*resultCur) = sum & bitmask_u64(32);
+        ++largeCur;
+        ++resultCur;
+    }
+
+    /* If there's still a carry, append a new block */
+    if (carry != 0) {
+        DEBUG_ASSERT(carry == 1);
+        DEBUG_ASSERT((npy_uint32)(resultCur - result->blocks) == large->length &&
+                     (large->length < c_BigInt_MaxBlocks));
+        *resultCur = 1;
+        result->length = large->length + 1;
+    }
+    else {
+        result->length = large->length;
+    }
+}
+
+/* result = in * 2 */
+static void
+BigInt_Multiply2(BigInt *result, const BigInt *in)
+{
+    /* shift all the blocks by one */
+    npy_uint32 carry = 0;
+
+    npy_uint32 *resultCur = result->blocks;
+    const npy_uint32 *pLhsCur = in->blocks;
+    const npy_uint32 *pLhsEnd = in->blocks + in->length;
+    for (; pLhsCur != pLhsEnd; ++pLhsCur, ++resultCur) {
+        npy_uint32 cur = *pLhsCur;
+        *resultCur = (cur << 1) | carry;
+        carry = cur >> 31;
+    }
+
+    if (carry != 0) {
+        /* grow the array */
+        DEBUG_ASSERT(in->length + 1 <= c_BigInt_MaxBlocks);
+        *resultCur = carry;
+        result->length = in->length + 1;
+    }
+    else {
+        result->length = in->length;
+    }
+}
+
+/* result = result * 2 */
+static void
+BigInt_Multiply2_inplace(BigInt *result)
+{
+    /* shift all the blocks by one */
+    npy_uint32 carry = 0;
+
+    npy_uint32 *cur = result->blocks;
+    npy_uint32 *end = result->blocks + result->length;
+    for (; cur != end; ++cur) {
+        npy_uint32 tmpcur = *cur;
+        *cur = (tmpcur << 1) | carry;
+        carry = tmpcur >> 31;
+    }
+
+    if (carry != 0) {
+        /* grow the array */
+        DEBUG_ASSERT(result->length + 1 <= c_BigInt_MaxBlocks);
+        *cur = carry;
+        ++result->length;
+    }
+}
+
+static npy_int32
+BigInt_Compare(const BigInt *lhs, const BigInt *rhs)
+{
+    int i;
+
+    /* A bigger length implies a bigger number. */
+    npy_int32 lengthDiff = lhs->length - rhs->length;
+    if (lengthDiff != 0) {
+        return lengthDiff;
+    }
+
+    /* Compare blocks one by one from high to low. */
+    for (i = lhs->length - 1; i >= 0; --i) {
+        if (lhs->blocks[i] == rhs->blocks[i]) {
+            continue;
+        }
+        else if (lhs->blocks[i] > rhs->blocks[i]) {
+            return 1;
+        }
+        else {
+            return -1;
+        }
+    }
+
+    /* no blocks differed */
+    return 0;
+}
+
+static npy_uint32
+BigInt_DivideWithRemainder_MaxQuotient9(BigInt *dividend, const BigInt *divisor)
+{
+    npy_uint32 length, quotient;
+    const npy_uint32 *finalDivisorBlock;
+    npy_uint32 *finalDividendBlock;
+
+    /*
+     * Check that the divisor has been correctly shifted into range and that it
+     * is not smaller than the dividend in length.
+     */
+    DEBUG_ASSERT(divisor->length > 0 && divisor->blocks[divisor->length - 1] >= 8 &&
+                 divisor->blocks[divisor->length - 1] < bitmask_u64(32) &&
+                 dividend->length <= divisor->length);
+
+    /*
+     * If the dividend is smaller than the divisor, the quotient is zero and the
+     * dividend is already the remainder.
+     */
+    length = divisor->length;
+    if (dividend->length < divisor->length) {
+        return 0;
+    }
+
+    finalDivisorBlock = divisor->blocks + length - 1;
+    finalDividendBlock = dividend->blocks + length - 1;
+
+    /*
+     * Compute an estimated quotient based on the high block value. This will
+     * either match the actual quotient or undershoot by one.
+ */ + quotient = *finalDividendBlock / (*finalDivisorBlock + 1); + DEBUG_ASSERT(quotient <= 9); + + /* Divide out the estimated quotient */ + if (quotient != 0) { + /* dividend = dividend - divisor*quotient */ + const npy_uint32 *divisorCur = divisor->blocks; + npy_uint32 *dividendCur = dividend->blocks; + + npy_uint64 borrow = 0; + npy_uint64 carry = 0; + do { + npy_uint64 difference, product; + + product = (npy_uint64)*divisorCur * (npy_uint64)quotient + carry; + carry = product >> 32; + + difference = (npy_uint64)*dividendCur - (product & bitmask_u64(32)) - borrow; + borrow = (difference >> 32) & 1; + + *dividendCur = difference & bitmask_u64(32); + + ++divisorCur; + ++dividendCur; + } while (divisorCur <= finalDivisorBlock); + + /* remove all leading zero blocks from dividend */ + while (length > 0 && dividend->blocks[length - 1] == 0) { + --length; + } + + dividend->length = length; + } + + /* + * If the dividend is still larger than the divisor, we overshot our + * estimate quotient. To correct, we increment the quotient and subtract one + * more divisor from the dividend. + */ + if (BigInt_Compare(dividend, divisor) >= 0) { + /* dividend = dividend - divisor */ + const npy_uint32 *divisorCur = divisor->blocks; + npy_uint32 *dividendCur = dividend->blocks; + npy_uint64 borrow = 0; + + ++quotient; + + do { + npy_uint64 difference = (npy_uint64)*dividendCur - (npy_uint64)*divisorCur - borrow; + borrow = (difference >> 32) & 1; + + *dividendCur = difference & bitmask_u64(32); + + ++divisorCur; + ++dividendCur; + } while (divisorCur <= finalDivisorBlock); + + /* remove all leading zero blocks from dividend */ + while (length > 0 && dividend->blocks[length - 1] == 0) { + --length; + } + + dividend->length = length; + } + + return quotient; +} + +static npy_uint32 +LogBase2_32(npy_uint32 val) +{ + static const npy_uint8 logTable[256] = { + 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7}; + + npy_uint32 temp; + + temp = val >> 24; + if (temp) { + return 24 + logTable[temp]; + } + + temp = val >> 16; + if (temp) { + return 16 + logTable[temp]; + } + + temp = val >> 8; + if (temp) { + return 8 + logTable[temp]; + } + + return logTable[val]; +} + +static npy_uint32 +LogBase2_64(npy_uint64 val) +{ + npy_uint64 temp; + + temp = val >> 32; + if (temp) { + return 32 + LogBase2_32((npy_uint32)temp); + } + + return LogBase2_32((npy_uint32)val); +} + +static npy_uint32 +LogBase2_128(npy_uint64 hi, npy_uint64 lo) +{ + if (hi) { + return 64 + LogBase2_64(hi); + } + + return LogBase2_64(lo); +} + +static npy_uint32 +PrintInfNan(char *buffer, npy_uint32 bufferSize, npy_uint64 mantissa, npy_uint32 mantissaHexWidth, + char signbit) +{ + npy_uint32 maxPrintLen = bufferSize - 1; + npy_uint32 pos = 0; + + DEBUG_ASSERT(bufferSize > 0); + + /* Check for infinity */ + if (mantissa == 0) { 
+ npy_uint32 printLen; + + /* only print sign for inf values (though nan can have a sign set) */ + if (signbit == '+') { + if (pos < maxPrintLen - 1) { + buffer[pos++] = '+'; + } + } + else if (signbit == '-') { + if (pos < maxPrintLen - 1) { + buffer[pos++] = '-'; + } + } + + /* copy and make sure the buffer is terminated */ + printLen = (3 < maxPrintLen - pos) ? 3 : maxPrintLen - pos; + memcpy(buffer + pos, "inf", printLen); + buffer[pos + printLen] = '\0'; + return pos + printLen; + } + else { + /* copy and make sure the buffer is terminated */ + npy_uint32 printLen = (3 < maxPrintLen - pos) ? 3 : maxPrintLen - pos; + memcpy(buffer + pos, "nan", printLen); + buffer[pos + printLen] = '\0'; + + /* + * For numpy we ignore unusual mantissa values for nan, but keep this + * code in case we change our mind later. + * + * // append HEX value + * if (maxPrintLen > 3) { + * printLen += PrintHex(buffer+3, bufferSize-3, mantissa, + * mantissaHexWidth); + * } + */ + + return pos + printLen; + } +} + +static npy_uint32 +Dragon4(BigInt *bigints, const npy_int32 exponent, const npy_uint32 mantissaBit, + const npy_bool hasUnequalMargins, const DigitMode digitMode, const CutoffMode cutoffMode, + npy_int32 cutoff_max, npy_int32 cutoff_min, char *pOutBuffer, npy_uint32 bufferSize, + npy_int32 *pOutExponent) +{ + char *curDigit = pOutBuffer; + + /* + * We compute values in integer format by rescaling as + * mantissa = scaledValue / scale + * marginLow = scaledMarginLow / scale + * marginHigh = scaledMarginHigh / scale + * Here, marginLow and marginHigh represent 1/2 of the distance to the next + * floating point value above/below the mantissa. + * + * scaledMarginHigh will point to scaledMarginLow in the case they must be + * equal to each other, otherwise it will point to optionalMarginHigh. + */ + BigInt *mantissa = &bigints[0]; /* the only initialized bigint */ + BigInt *scale = &bigints[1]; + BigInt *scaledValue = &bigints[2]; + BigInt *scaledMarginLow = &bigints[3]; + BigInt *scaledMarginHigh; + BigInt *optionalMarginHigh = &bigints[4]; + + BigInt *temp1 = &bigints[5]; + BigInt *temp2 = &bigints[6]; + + const npy_float64 log10_2 = 0.30102999566398119521373889472449; + npy_int32 digitExponent, hiBlock; + npy_int32 cutoff_max_Exponent, cutoff_min_Exponent; + npy_uint32 outputDigit; /* current digit being output */ + npy_uint32 outputLen; + npy_bool isEven = BigInt_IsEven(mantissa); + npy_int32 cmp; + + /* values used to determine how to round */ + npy_bool low, high, roundDown; + + DEBUG_ASSERT(bufferSize > 0); + + /* if the mantissa is zero, the value is zero regardless of the exponent */ + if (BigInt_IsZero(mantissa)) { + *curDigit = '0'; + *pOutExponent = 0; + return 1; + } + + BigInt_Copy(scaledValue, mantissa); + + if (hasUnequalMargins) { + /* if we have no fractional component */ + if (exponent > 0) { + /* + * 1) Expand the input value by multiplying out the mantissa and + * exponent. This represents the input value in its whole number + * representation. + * 2) Apply an additional scale of 2 such that later comparisons + * against the margin values are simplified. + * 3) Set the margin value to the lowest mantissa bit's scale. 
+ */ + + /* scaledValue = 2 * 2 * mantissa*2^exponent */ + BigInt_ShiftLeft(scaledValue, exponent + 2); + /* scale = 2 * 2 * 1 */ + BigInt_Set_uint32(scale, 4); + /* scaledMarginLow = 2 * 2^(exponent-1) */ + BigInt_Pow2(scaledMarginLow, exponent); + /* scaledMarginHigh = 2 * 2 * 2^(exponent-1) */ + BigInt_Pow2(optionalMarginHigh, exponent + 1); + } + /* else we have a fractional exponent */ + else { + /* + * In order to track the mantissa data as an integer, we store it as + * is with a large scale + */ + + /* scaledValue = 2 * 2 * mantissa */ + BigInt_ShiftLeft(scaledValue, 2); + /* scale = 2 * 2 * 2^(-exponent) */ + BigInt_Pow2(scale, -exponent + 2); + /* scaledMarginLow = 2 * 2^(-1) */ + BigInt_Set_uint32(scaledMarginLow, 1); + /* scaledMarginHigh = 2 * 2 * 2^(-1) */ + BigInt_Set_uint32(optionalMarginHigh, 2); + } + + /* the high and low margins are different */ + scaledMarginHigh = optionalMarginHigh; + } + else { + /* if we have no fractional component */ + if (exponent > 0) { + /* scaledValue = 2 * mantissa*2^exponent */ + BigInt_ShiftLeft(scaledValue, exponent + 1); + /* scale = 2 * 1 */ + BigInt_Set_uint32(scale, 2); + /* scaledMarginLow = 2 * 2^(exponent-1) */ + BigInt_Pow2(scaledMarginLow, exponent); + } + /* else we have a fractional exponent */ + else { + /* + * In order to track the mantissa data as an integer, we store it as + * is with a large scale + */ + + /* scaledValue = 2 * mantissa */ + BigInt_ShiftLeft(scaledValue, 1); + /* scale = 2 * 2^(-exponent) */ + BigInt_Pow2(scale, -exponent + 1); + /* scaledMarginLow = 2 * 2^(-1) */ + BigInt_Set_uint32(scaledMarginLow, 1); + } + + /* the high and low margins are equal */ + scaledMarginHigh = scaledMarginLow; + } + + /* + * Compute an estimate for digitExponent that will be correct or undershoot + * by one. This optimization is based on the paper "Printing Floating-Point + * Numbers Quickly and Accurately" by Burger and Dybvig + * https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.72.4656 + * We perform an additional subtraction of 0.69 to increase the frequency of + * a failed estimate because that lets us take a faster branch in the code. + * 0.69 is chosen because 0.69 + log10(2) is less than one by a reasonable + * epsilon that will account for any floating point error. + * + * We want to set digitExponent to floor(log10(v)) + 1 + * v = mantissa*2^exponent + * log2(v) = log2(mantissa) + exponent; + * log10(v) = log2(v) * log10(2) + * floor(log2(v)) = mantissaBit + exponent; + * log10(v) - log10(2) < (mantissaBit + exponent) * log10(2) <= log10(v) + * log10(v) < (mantissaBit + exponent) * log10(2) + log10(2) + * <= log10(v) + log10(2) + * floor(log10(v)) < ceil((mantissaBit + exponent) * log10(2)) + * <= floor(log10(v)) + 1 + * + * Warning: This calculation assumes npy_float64 is an IEEE-binary64 + * float. This line may need to be updated if this is not the case. + */ + digitExponent = + (npy_int32)(ceil((npy_float64)((npy_int32)mantissaBit + exponent) * log10_2 - 0.69)); + + /* + * if the digit exponent is smaller than the smallest desired digit for + * fractional cutoff, pull the digit back into legal range at which point we + * will round to the appropriate value. Note that while our value for + * digitExponent is still an estimate, this is safe because it only + * increases the number. This will either correct digitExponent to an + * accurate value or it will clamp it above the accurate value. 
+ */ + if (cutoff_max >= 0 && cutoffMode == CutoffMode_FractionLength && + digitExponent <= -cutoff_max) { + digitExponent = -cutoff_max + 1; + } + + /* Divide value by 10^digitExponent. */ + if (digitExponent > 0) { + /* A positive exponent creates a division so we multiply the scale. */ + BigInt_MultiplyPow10(scale, digitExponent, temp1); + } + else if (digitExponent < 0) { + /* + * A negative exponent creates a multiplication so we multiply up the + * scaledValue, scaledMarginLow and scaledMarginHigh. + */ + BigInt *temp = temp1, *pow10 = temp2; + BigInt_Pow10(pow10, -digitExponent, temp); + + BigInt_Multiply(temp, scaledValue, pow10); + BigInt_Copy(scaledValue, temp); + + BigInt_Multiply(temp, scaledMarginLow, pow10); + BigInt_Copy(scaledMarginLow, temp); + + if (scaledMarginHigh != scaledMarginLow) { + BigInt_Multiply2(scaledMarginHigh, scaledMarginLow); + } + } + + /* If (value >= 1), our estimate for digitExponent was too low */ + if (BigInt_Compare(scaledValue, scale) >= 0) { + /* + * The exponent estimate was incorrect. + * Increment the exponent and don't perform the premultiply needed + * for the first loop iteration. + */ + digitExponent = digitExponent + 1; + } + else { + /* + * The exponent estimate was correct. + * Multiply larger by the output base to prepare for the first loop + * iteration. + */ + BigInt_Multiply10(scaledValue); + BigInt_Multiply10(scaledMarginLow); + if (scaledMarginHigh != scaledMarginLow) { + BigInt_Multiply2(scaledMarginHigh, scaledMarginLow); + } + } + + /* + * Compute the cutoff_max exponent (the exponent of the final digit to + * print). Default to the maximum size of the output buffer. + */ + cutoff_max_Exponent = digitExponent - bufferSize; + if (cutoff_max >= 0) { + npy_int32 desiredCutoffExponent; + + if (cutoffMode == CutoffMode_TotalLength) { + desiredCutoffExponent = digitExponent - cutoff_max; + if (desiredCutoffExponent > cutoff_max_Exponent) { + cutoff_max_Exponent = desiredCutoffExponent; + } + } + /* Otherwise it's CutoffMode_FractionLength. Print cutoff_max digits + * past the decimal point or until we reach the buffer size + */ + else { + desiredCutoffExponent = -cutoff_max; + if (desiredCutoffExponent > cutoff_max_Exponent) { + cutoff_max_Exponent = desiredCutoffExponent; + } + } + } + /* Also compute the cutoff_min exponent. */ + cutoff_min_Exponent = digitExponent; + if (cutoff_min >= 0) { + npy_int32 desiredCutoffExponent; + + if (cutoffMode == CutoffMode_TotalLength) { + desiredCutoffExponent = digitExponent - cutoff_min; + if (desiredCutoffExponent < cutoff_min_Exponent) { + cutoff_min_Exponent = desiredCutoffExponent; + } + } + else { + desiredCutoffExponent = -cutoff_min; + if (desiredCutoffExponent < cutoff_min_Exponent) { + cutoff_min_Exponent = desiredCutoffExponent; + } + } + } + + /* Output the exponent of the first digit we will print */ + *pOutExponent = digitExponent - 1; + + /* + * In preparation for calling BigInt_DivideWithRemainder_MaxQuotient9(), we + * need to scale up our values such that the highest block of the + * denominator is greater than or equal to 8. We also need to guarantee that + * the numerator can never have a length greater than the denominator after + * each loop iteration. This requires the highest block of the denominator + * to be less than or equal to 429496729 which is the highest number that + * can be multiplied by 10 without overflowing to a new block. 
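+     *
+     * (429496729 = floor((2^32 - 1) / 10), so 429496729 * 10 = 4294967290
+     * still fits in a single 32-bit block.)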
+ */ + DEBUG_ASSERT(scale->length > 0); + hiBlock = scale->blocks[scale->length - 1]; + if (hiBlock < 8 || hiBlock > 429496729) { + npy_uint32 hiBlockLog2, shift; + + /* + * Perform a bit shift on all values to get the highest block of the + * denominator into the range [8,429496729]. We are more likely to make + * accurate quotient estimations in + * BigInt_DivideWithRemainder_MaxQuotient9() with higher denominator + * values so we shift the denominator to place the highest bit at index + * 27 of the highest block. This is safe because (2^28 - 1) = 268435455 + * which is less than 429496729. This means that all values with a + * highest bit at index 27 are within range. + */ + hiBlockLog2 = LogBase2_32(hiBlock); + DEBUG_ASSERT(hiBlockLog2 < 3 || hiBlockLog2 > 27); + shift = (32 + 27 - hiBlockLog2) % 32; + + BigInt_ShiftLeft(scale, shift); + BigInt_ShiftLeft(scaledValue, shift); + BigInt_ShiftLeft(scaledMarginLow, shift); + if (scaledMarginHigh != scaledMarginLow) { + BigInt_Multiply2(scaledMarginHigh, scaledMarginLow); + } + } + + if (digitMode == DigitMode_Unique) { + /* + * For the unique cutoff mode, we will try to print until we have + * reached a level of precision that uniquely distinguishes this value + * from its neighbors. If we run out of space in the output buffer, we + * terminate early. + */ + for (;;) { + BigInt *scaledValueHigh = temp1; + + digitExponent = digitExponent - 1; + + /* divide out the scale to extract the digit */ + outputDigit = BigInt_DivideWithRemainder_MaxQuotient9(scaledValue, scale); + DEBUG_ASSERT(outputDigit < 10); + + /* update the high end of the value */ + BigInt_Add(scaledValueHigh, scaledValue, scaledMarginHigh); + + /* + * stop looping if we are far enough away from our neighboring + * values (and we have printed at least the requested minimum + * digits) or if we have reached the cutoff digit + */ + cmp = BigInt_Compare(scaledValue, scaledMarginLow); + low = isEven ? (cmp <= 0) : (cmp < 0); + cmp = BigInt_Compare(scaledValueHigh, scale); + high = isEven ? (cmp >= 0) : (cmp > 0); + if (((low | high) & (digitExponent <= cutoff_min_Exponent)) | + (digitExponent == cutoff_max_Exponent)) { + break; + } + + /* store the output digit */ + *curDigit = (char)('0' + outputDigit); + ++curDigit; + + /* multiply larger by the output base */ + BigInt_Multiply10(scaledValue); + BigInt_Multiply10(scaledMarginLow); + if (scaledMarginHigh != scaledMarginLow) { + BigInt_Multiply2(scaledMarginHigh, scaledMarginLow); + } + } + } + else { + /* + * For exact digit mode, we will try to print until we + * have exhausted all precision (i.e. all remaining digits are zeros) or + * until we reach the desired cutoff digit. + */ + low = NPY_FALSE; + high = NPY_FALSE; + + for (;;) { + digitExponent = digitExponent - 1; + + /* divide out the scale to extract the digit */ + outputDigit = BigInt_DivideWithRemainder_MaxQuotient9(scaledValue, scale); + DEBUG_ASSERT(outputDigit < 10); + + if ((scaledValue->length == 0) | (digitExponent == cutoff_max_Exponent)) { + break; + } + + /* store the output digit */ + *curDigit = (char)('0' + outputDigit); + ++curDigit; + + /* multiply larger by the output base */ + BigInt_Multiply10(scaledValue); + } + } + + /* default to rounding down the final digit if value got too close to 0 */ + roundDown = low; + + /* if it is legal to round up and down */ + if (low == high) { + npy_int32 compare; + + /* + * round to the closest digit by comparing value with 0.5. To do this we + * need to convert the inequality to large integer values. 
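+         * Each rewrite below multiplies both sides of the comparison by the
+         * same positive factor, so its outcome is unchanged: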
+ * compare( value, 0.5 ) + * compare( scale * value, scale * 0.5 ) + * compare( 2 * scale * value, scale ) + */ + BigInt_Multiply2_inplace(scaledValue); + compare = BigInt_Compare(scaledValue, scale); + roundDown = compare < 0; + + /* + * if we are directly in the middle, round towards the even digit (i.e. + * IEEE rounding rules) + */ + if (compare == 0) { + roundDown = (outputDigit & 1) == 0; + } + } + + /* print the rounded digit */ + if (roundDown) { + *curDigit = (char)('0' + outputDigit); + ++curDigit; + } + else { + /* handle rounding up */ + if (outputDigit == 9) { + /* find the first non-nine prior digit */ + for (;;) { + /* if we are at the first digit */ + if (curDigit == pOutBuffer) { + /* output 1 at the next highest exponent */ + *curDigit = '1'; + ++curDigit; + *pOutExponent += 1; + break; + } + + --curDigit; + if (*curDigit != '9') { + /* increment the digit */ + *curDigit += 1; + ++curDigit; + break; + } + } + } + else { + /* values in the range [0,8] can perform a simple round up */ + *curDigit = (char)('0' + outputDigit + 1); + ++curDigit; + } + } + + /* return the number of digits output */ + outputLen = (npy_uint32)(curDigit - pOutBuffer); + DEBUG_ASSERT(outputLen <= bufferSize); + return outputLen; +} + +static npy_uint32 +FormatPositional(char *buffer, npy_uint32 bufferSize, BigInt *mantissa, npy_int32 exponent, + char signbit, npy_uint32 mantissaBit, npy_bool hasUnequalMargins, + DigitMode digit_mode, CutoffMode cutoff_mode, npy_int32 precision, + npy_int32 min_digits, TrimMode trim_mode, npy_int32 digits_left, + npy_int32 digits_right) +{ + npy_int32 printExponent; + npy_int32 numDigits, numWholeDigits = 0, has_sign = 0; + npy_int32 add_digits; + + npy_int32 maxPrintLen = (npy_int32)bufferSize - 1, pos = 0; + + /* track the # of digits past the decimal point that have been printed */ + npy_int32 numFractionDigits = 0, desiredFractionalDigits; + + DEBUG_ASSERT(bufferSize > 0); + + if (digit_mode != DigitMode_Unique) { + DEBUG_ASSERT(precision >= 0); + } + + if (signbit == '+' && pos < maxPrintLen) { + buffer[pos++] = '+'; + has_sign = 1; + } + else if (signbit == '-' && pos < maxPrintLen) { + buffer[pos++] = '-'; + has_sign = 1; + } + + numDigits = Dragon4(mantissa, exponent, mantissaBit, hasUnequalMargins, digit_mode, cutoff_mode, + precision, min_digits, buffer + has_sign, maxPrintLen - has_sign, + &printExponent); + + DEBUG_ASSERT(numDigits > 0); + DEBUG_ASSERT(numDigits <= bufferSize); + + /* if output has a whole number */ + if (printExponent >= 0) { + /* leave the whole number at the start of the buffer */ + numWholeDigits = printExponent + 1; + if (numDigits <= numWholeDigits) { + npy_int32 count = numWholeDigits - numDigits; + pos += numDigits; + + /* don't overflow the buffer */ + if (pos + count > maxPrintLen) { + count = maxPrintLen - pos; + } + + /* add trailing zeros up to the decimal point */ + numDigits += count; + for (; count > 0; count--) { + buffer[pos++] = '0'; + } + } + /* insert the decimal point prior to the fraction */ + else if (numDigits > numWholeDigits) { + npy_int32 maxFractionDigits; + + numFractionDigits = numDigits - numWholeDigits; + maxFractionDigits = maxPrintLen - numWholeDigits - 1 - pos; + if (numFractionDigits > maxFractionDigits) { + numFractionDigits = maxFractionDigits; + } + + memmove(buffer + pos + numWholeDigits + 1, buffer + pos + numWholeDigits, + numFractionDigits); + pos += numWholeDigits; + buffer[pos] = '.'; + numDigits = numWholeDigits + 1 + numFractionDigits; + pos += 1 + numFractionDigits; + } + } + else { + /* 
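+         * printExponent < 0 means the value has no whole digits (for
+         * example, 1.23e-3 must print as 0.00123), so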
shift out the fraction to make room for the leading zeros */ + npy_int32 numFractionZeros = 0; + if (pos + 2 < maxPrintLen) { + npy_int32 maxFractionZeros, digitsStartIdx, maxFractionDigits, i; + + maxFractionZeros = maxPrintLen - 2 - pos; + numFractionZeros = -(printExponent + 1); + if (numFractionZeros > maxFractionZeros) { + numFractionZeros = maxFractionZeros; + } + + digitsStartIdx = 2 + numFractionZeros; + + /* + * shift the significant digits right such that there is room for + * leading zeros + */ + numFractionDigits = numDigits; + maxFractionDigits = maxPrintLen - digitsStartIdx - pos; + if (numFractionDigits > maxFractionDigits) { + numFractionDigits = maxFractionDigits; + } + + memmove(buffer + pos + digitsStartIdx, buffer + pos, numFractionDigits); + + /* insert the leading zeros */ + for (i = 2; i < digitsStartIdx; ++i) { + buffer[pos + i] = '0'; + } + + /* update the counts */ + numFractionDigits += numFractionZeros; + numDigits = numFractionDigits; + } + + /* add the decimal point */ + if (pos + 1 < maxPrintLen) { + buffer[pos + 1] = '.'; + } + + /* add the initial zero */ + if (pos < maxPrintLen) { + buffer[pos] = '0'; + numDigits += 1; + } + numWholeDigits = 1; + pos += 2 + numFractionDigits; + } + + /* always add decimal point, except for DprZeros mode */ + if (trim_mode != TrimMode_DptZeros && numFractionDigits == 0 && pos < maxPrintLen) { + buffer[pos++] = '.'; + } + + add_digits = digit_mode == DigitMode_Unique ? min_digits : precision; + desiredFractionalDigits = add_digits < 0 ? 0 : add_digits; + if (cutoff_mode == CutoffMode_TotalLength) { + desiredFractionalDigits = add_digits - numWholeDigits; + } + + if (trim_mode == TrimMode_LeaveOneZero) { + /* if we didn't print any fractional digits, add a trailing 0 */ + if (numFractionDigits == 0 && pos < maxPrintLen) { + buffer[pos++] = '0'; + numFractionDigits++; + } + } + else if (trim_mode == TrimMode_None && desiredFractionalDigits > numFractionDigits && + pos < maxPrintLen) { + /* add trailing zeros up to add_digits length */ + /* compute the number of trailing zeros needed */ + npy_int32 count = desiredFractionalDigits - numFractionDigits; + if (pos + count > maxPrintLen) { + count = maxPrintLen - pos; + } + numFractionDigits += count; + + for (; count > 0; count--) { + buffer[pos++] = '0'; + } + } + /* else, for trim_mode Zeros or DptZeros, there is nothing more to add */ + + /* + * when rounding, we may still end up with trailing zeros. Remove them + * depending on trim settings. + */ + if (trim_mode != TrimMode_None && numFractionDigits > 0) { + while (buffer[pos - 1] == '0') { + pos--; + numFractionDigits--; + } + if (buffer[pos - 1] == '.') { + /* in TrimMode_LeaveOneZero, add trailing 0 back */ + if (trim_mode == TrimMode_LeaveOneZero) { + buffer[pos++] = '0'; + numFractionDigits++; + } + /* in TrimMode_DptZeros, remove trailing decimal point */ + else if (trim_mode == TrimMode_DptZeros) { + pos--; + } + } + } + + /* add any whitespace padding to right side */ + if (digits_right >= numFractionDigits) { + npy_int32 count = digits_right - numFractionDigits; + + /* in trim_mode DptZeros, if right padding, add a space for the . 
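+         * (the space is a stand-in that keeps padded, column-aligned output
+         * lined up with entries that kept their decimal point)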
*/ + if (trim_mode == TrimMode_DptZeros && numFractionDigits == 0 && pos < maxPrintLen) { + buffer[pos++] = ' '; + } + + if (pos + count > maxPrintLen) { + count = maxPrintLen - pos; + } + + for (; count > 0; count--) { + buffer[pos++] = ' '; + } + } + /* add any whitespace padding to left side */ + if (digits_left > numWholeDigits + has_sign) { + npy_int32 shift = digits_left - (numWholeDigits + has_sign); + npy_int32 count = pos; + + if (count + shift > maxPrintLen) { + count = maxPrintLen - shift; + } + + if (count > 0) { + memmove(buffer + shift, buffer, count); + } + pos = shift + count; + for (; shift > 0; shift--) { + buffer[shift - 1] = ' '; + } + } + + /* terminate the buffer */ + DEBUG_ASSERT(pos <= maxPrintLen); + buffer[pos] = '\0'; + + return pos; +} + +static npy_uint32 +FormatScientific(char *buffer, npy_uint32 bufferSize, BigInt *mantissa, npy_int32 exponent, + char signbit, npy_uint32 mantissaBit, npy_bool hasUnequalMargins, + DigitMode digit_mode, npy_int32 precision, npy_int32 min_digits, + TrimMode trim_mode, npy_int32 digits_left, npy_int32 exp_digits) +{ + npy_int32 printExponent; + npy_int32 numDigits; + char *pCurOut; + npy_int32 numFractionDigits; + npy_int32 leftchars; + npy_int32 add_digits; + + if (digit_mode != DigitMode_Unique) { + DEBUG_ASSERT(precision >= 0); + } + + DEBUG_ASSERT(bufferSize > 0); + + pCurOut = buffer; + + /* add any whitespace padding to left side */ + leftchars = 1 + (signbit == '-' || signbit == '+'); + if (digits_left > leftchars) { + int i; + for (i = 0; i < digits_left - leftchars && bufferSize > 1; i++) { + *pCurOut = ' '; + pCurOut++; + --bufferSize; + } + } + + if (signbit == '+' && bufferSize > 1) { + *pCurOut = '+'; + pCurOut++; + --bufferSize; + } + else if (signbit == '-' && bufferSize > 1) { + *pCurOut = '-'; + pCurOut++; + --bufferSize; + } + + numDigits = Dragon4(mantissa, exponent, mantissaBit, hasUnequalMargins, digit_mode, + CutoffMode_TotalLength, precision < 0 ? -1 : precision + 1, + min_digits < 0 ? -1 : min_digits + 1, pCurOut, bufferSize, &printExponent); + + DEBUG_ASSERT(numDigits > 0); + DEBUG_ASSERT(numDigits <= bufferSize); + + /* keep the whole number as the first digit */ + if (bufferSize > 1) { + pCurOut += 1; + bufferSize -= 1; + } + + /* insert the decimal point prior to the fractional number */ + numFractionDigits = numDigits - 1; + if (numFractionDigits > 0 && bufferSize > 1) { + npy_int32 maxFractionDigits = (npy_int32)bufferSize - 2; + + if (numFractionDigits > maxFractionDigits) { + numFractionDigits = maxFractionDigits; + } + + memmove(pCurOut + 1, pCurOut, numFractionDigits); + pCurOut[0] = '.'; + pCurOut += (1 + numFractionDigits); + bufferSize -= (1 + numFractionDigits); + } + + /* always add decimal point, except for DprZeros mode */ + if (trim_mode != TrimMode_DptZeros && numFractionDigits == 0 && bufferSize > 1) { + *pCurOut = '.'; + ++pCurOut; + --bufferSize; + } + + add_digits = digit_mode == DigitMode_Unique ? min_digits : precision; + add_digits = add_digits < 0 ? 
0 : add_digits; + if (trim_mode == TrimMode_LeaveOneZero) { + /* if we didn't print any fractional digits, add the 0 */ + if (numFractionDigits == 0 && bufferSize > 1) { + *pCurOut = '0'; + ++pCurOut; + --bufferSize; + ++numFractionDigits; + } + } + else if (trim_mode == TrimMode_None) { + /* add trailing zeros up to add_digits length */ + if (add_digits > (npy_int32)numFractionDigits) { + char *pEnd; + /* compute the number of trailing zeros needed */ + npy_int32 numZeros = (add_digits - numFractionDigits); + + if (numZeros > (npy_int32)bufferSize - 1) { + numZeros = (npy_int32)bufferSize - 1; + } + + for (pEnd = pCurOut + numZeros; pCurOut < pEnd; ++pCurOut) { + *pCurOut = '0'; + ++numFractionDigits; + } + } + } + /* else, for trim_mode Zeros or DptZeros, there is nothing more to add */ + + /* + * when rounding, we may still end up with trailing zeros. Remove them + * depending on trim settings. + */ + if (trim_mode != TrimMode_None && numFractionDigits > 0) { + --pCurOut; + while (*pCurOut == '0') { + --pCurOut; + ++bufferSize; + --numFractionDigits; + } + if (trim_mode == TrimMode_LeaveOneZero && *pCurOut == '.') { + ++pCurOut; + *pCurOut = '0'; + --bufferSize; + ++numFractionDigits; + } + ++pCurOut; + } + + /* print the exponent into a local buffer and copy into output buffer */ + if (bufferSize > 1) { + char exponentBuffer[7]; + npy_int32 digits[5]; + npy_int32 i, exp_size, count; + + if (exp_digits > 5) { + exp_digits = 5; + } + if (exp_digits < 0) { + exp_digits = 2; + } + + exponentBuffer[0] = 'e'; + if (printExponent >= 0) { + exponentBuffer[1] = '+'; + } + else { + exponentBuffer[1] = '-'; + printExponent = -printExponent; + } + + DEBUG_ASSERT(printExponent < 100000); + + /* get exp digits */ + for (i = 0; i < 5; i++) { + digits[i] = printExponent % 10; + printExponent /= 10; + } + /* count back over leading zeros */ + for (i = 5; i > exp_digits && digits[i - 1] == 0; i--) { + } + exp_size = i; + /* write remaining digits to tmp buf */ + for (i = exp_size; i > 0; i--) { + exponentBuffer[2 + (exp_size - i)] = (char)('0' + digits[i - 1]); + } + + /* copy the exponent buffer into the output */ + count = exp_size + 2; + if (count > (npy_int32)bufferSize - 1) { + count = (npy_int32)bufferSize - 1; + } + memcpy(pCurOut, exponentBuffer, count); + pCurOut += count; + bufferSize -= count; + } + + DEBUG_ASSERT(bufferSize > 0); + pCurOut[0] = '\0'; + + return pCurOut - buffer; +} + +static npy_uint32 +Format_floatbits(char *buffer, npy_uint32 bufferSize, BigInt *mantissa, npy_int32 exponent, + char signbit, npy_uint32 mantissaBit, npy_bool hasUnequalMargins, + Dragon4_Options *opt) +{ + /* format the value */ + if (opt->scientific) { + return FormatScientific(buffer, bufferSize, mantissa, exponent, signbit, mantissaBit, + hasUnequalMargins, opt->digit_mode, opt->precision, opt->min_digits, + opt->trim_mode, opt->digits_left, opt->exp_digits); + } + else { + return FormatPositional(buffer, bufferSize, mantissa, exponent, signbit, mantissaBit, + hasUnequalMargins, opt->digit_mode, opt->cutoff_mode, + opt->precision, opt->min_digits, opt->trim_mode, opt->digits_left, + opt->digits_right); + } +} + +static npy_uint32 +Dragon4_PrintFloat_Sleef_quad(Sleef_quad *value, Dragon4_Options *opt) +{ + char *buffer = _bigint_static.repr; + const npy_uint32 bufferSize = sizeof(_bigint_static.repr); + BigInt *bigints = _bigint_static.bigints; + + npy_uint32 floatExponent, floatSign; + npy_uint64 mantissa_hi, mantissa_lo; + npy_int32 exponent; + npy_uint32 mantissaBit; + npy_bool hasUnequalMargins; + char 
signbit = '\0';
+
+    // Extract the bits from the SLEEF quad value.
+    // IEEE binary128 layout: 1 sign bit, 15 exponent bits (bias 16383),
+    // 112 stored fraction bits; the lo/hi split assumes little-endian storage.
+    union {
+        Sleef_quad q;
+        struct {
+            npy_uint64 lo;
+            npy_uint64 hi;
+        } i;
+    } u;
+    u.q = *value;
+
+    // Extract mantissa, exponent, and sign
+    mantissa_hi = u.i.hi & bitmask_u64(48);
+    mantissa_lo = u.i.lo;
+    floatExponent = (u.i.hi >> 48) & bitmask_u32(15);
+    floatSign = u.i.hi >> 63;
+
+    /* output the sign */
+    if (floatSign != 0) {
+        signbit = '-';
+    }
+    // else if (opt->sign) {
+    //     signbit = '+';
+    // }
+
+    /* if this is a special value */
+    if (floatExponent == bitmask_u32(15)) {
+        npy_uint64 mantissa_zero = mantissa_hi == 0 && mantissa_lo == 0;
+        return PrintInfNan(buffer, bufferSize, !mantissa_zero, 16, signbit);
+    }
+    /* else this is a number */
+
+    /* factor the value into its parts */
+    if (floatExponent != 0) {
+        /*
+         * normal
+         * Decide the margins from the stored fraction bits before the
+         * implicit leading bit is OR-ed in: a zero fraction means the value
+         * sits on a power-of-two boundary and has a larger margin above
+         * than below.
+         */
+        hasUnequalMargins = (floatExponent != 1) && (mantissa_hi == 0 && mantissa_lo == 0);
+        mantissa_hi = (1ull << 48) | mantissa_hi;
+        /* mantissa_lo is unchanged */
+        exponent = floatExponent - 16383 - 112;
+        mantissaBit = 112;
+    }
+    else {
+        /* subnormal */
+        exponent = 1 - 16383 - 112;
+        mantissaBit = LogBase2_128(mantissa_hi, mantissa_lo);
+        hasUnequalMargins = NPY_FALSE;
+    }
+
+    BigInt_Set_2x_uint64(&bigints[0], mantissa_hi, mantissa_lo);
+    return Format_floatbits(buffer, bufferSize, bigints, exponent, signbit, mantissaBit,
+                            hasUnequalMargins, opt);
+}
+
+PyObject *
+Dragon4_Positional_QuadDType_opt(Sleef_quad *val, Dragon4_Options *opt)
+{
+    PyObject *ret;
+    if (Dragon4_PrintFloat_Sleef_quad(val, opt) < 0) {
+        return NULL;
+    }
+    ret = PyUnicode_FromString(_bigint_static.repr);
+    return ret;
+}
+
+PyObject *
+Dragon4_Positional_QuadDType(Sleef_quad *val, DigitMode digit_mode, CutoffMode cutoff_mode,
+                             int precision, int min_digits, int sign, TrimMode trim, int pad_left,
+                             int pad_right)
+{
+    Dragon4_Options opt;
+
+    opt.scientific = 0;
+    opt.digit_mode = digit_mode;
+    opt.cutoff_mode = cutoff_mode;
+    opt.precision = precision;
+    opt.min_digits = min_digits;
+    opt.sign = sign;
+    opt.trim_mode = trim;
+    opt.digits_left = pad_left;
+    opt.digits_right = pad_right;
+    opt.exp_digits = -1;
+
+    return Dragon4_Positional_QuadDType_opt(val, &opt);
+}
+
+PyObject *
+Dragon4_Scientific_QuadDType_opt(Sleef_quad *val, Dragon4_Options *opt)
+{
+    PyObject *ret;
+    if (Dragon4_PrintFloat_Sleef_quad(val, opt) < 0) {
+        return NULL;
+    }
+    ret = PyUnicode_FromString(_bigint_static.repr);
+    return ret;
+}
+
+PyObject *
+Dragon4_Scientific_QuadDType(Sleef_quad *val, DigitMode digit_mode, int precision, int min_digits,
+                             int sign, TrimMode trim, int pad_left, int exp_digits)
+{
+    Dragon4_Options opt;
+
+    opt.scientific = 1;
+    opt.digit_mode = digit_mode;
+    opt.cutoff_mode = CutoffMode_TotalLength;
+    opt.precision = precision;
+    opt.min_digits = min_digits;
+    opt.sign = sign;
+    opt.trim_mode = trim;
+    opt.digits_left = pad_left;
+    opt.digits_right = -1;
+    opt.exp_digits = exp_digits;
+
+    return Dragon4_Scientific_QuadDType_opt(val, &opt);
+}
+
+PyObject *
+Dragon4_Positional(PyObject *obj, DigitMode digit_mode, CutoffMode cutoff_mode, int precision,
+                   int min_digits, int sign, TrimMode trim, int pad_left, int pad_right)
+{
+    if (PyArray_IsScalar(obj, QuadPrecDType)) {
+        QuadPrecisionObject *quad_obj = (QuadPrecisionObject *)obj;
+        if (quad_obj->backend == BACKEND_SLEEF) {
+            return Dragon4_Positional_QuadDType(&quad_obj->value.sleef_value, digit_mode,
+                                                cutoff_mode, precision, min_digits, sign, trim,
+                                                pad_left, pad_right);
+        }
+        else {
+            Sleef_quad sleef_val =
Sleef_cast_from_doubleq1(quad_obj->value.longdouble_value); + return Dragon4_Positional_QuadDType(&sleef_val, digit_mode, cutoff_mode, precision, + min_digits, sign, trim, pad_left, pad_right); + } + } + + return NULL; +} + +PyObject * +Dragon4_Scientific(PyObject *obj, DigitMode digit_mode, int precision, int min_digits, int sign, + TrimMode trim, int pad_left, int exp_digits) +{ + npy_double val; + + if (PyArray_IsScalar(obj, QuadPrecDType)) { + QuadPrecisionObject *quad_obj = (QuadPrecisionObject *)obj; + if (quad_obj->backend == BACKEND_SLEEF) { + return Dragon4_Scientific_QuadDType(&quad_obj->value.sleef_value, digit_mode, precision, + min_digits, sign, trim, pad_left, exp_digits); + } + else { + Sleef_quad sleef_val = Sleef_cast_from_doubleq1(quad_obj->value.longdouble_value); + return Dragon4_Scientific_QuadDType(&sleef_val, digit_mode, precision, min_digits, sign, + trim, pad_left, exp_digits); + } + } + + return NULL; +} \ No newline at end of file diff --git a/quaddtype/numpy_quaddtype/src/dragon4.h b/quaddtype/numpy_quaddtype/src/dragon4.h new file mode 100644 index 00000000..1977595e --- /dev/null +++ b/quaddtype/numpy_quaddtype/src/dragon4.h @@ -0,0 +1,70 @@ +#ifndef _QUADDTYPE_DRAGON4_H +#define _QUADDTYPE_DRAGON4_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include "numpy/arrayobject.h" +#include +#include "quad_common.h" + +typedef enum DigitMode +{ + /* Round digits to print shortest uniquely identifiable number. */ + DigitMode_Unique, + /* Output the digits of the number as if with infinite precision */ + DigitMode_Exact, +} DigitMode; + +typedef enum CutoffMode +{ + /* up to cutoffNumber significant digits */ + CutoffMode_TotalLength, + /* up to cutoffNumber significant digits past the decimal point */ + CutoffMode_FractionLength, +} CutoffMode; + +typedef enum TrimMode +{ + TrimMode_None, /* don't trim zeros, always leave a decimal point */ + TrimMode_LeaveOneZero, /* trim all but the zero before the decimal point */ + TrimMode_Zeros, /* trim all trailing zeros, leave decimal point */ + TrimMode_DptZeros, /* trim trailing zeros & trailing decimal point */ +} TrimMode; + +typedef struct { + int scientific; + DigitMode digit_mode; + CutoffMode cutoff_mode; + int precision; + int min_digits; + int sign; + TrimMode trim_mode; + int digits_left; + int digits_right; + int exp_digits; +} Dragon4_Options; + +PyObject *Dragon4_Positional_QuadDType(Sleef_quad *val, DigitMode digit_mode, + CutoffMode cutoff_mode, int precision, int min_digits, + int sign, TrimMode trim, int pad_left, int pad_right); + +PyObject *Dragon4_Scientific_QuadDType(Sleef_quad *val, DigitMode digit_mode, + int precision, int min_digits, int sign, TrimMode trim, + int pad_left, int exp_digits); + +PyObject *Dragon4_Positional(PyObject *obj, DigitMode digit_mode, + CutoffMode cutoff_mode, int precision, int min_digits, + int sign, TrimMode trim, int pad_left, int pad_right); + +PyObject *Dragon4_Scientific(PyObject *obj, DigitMode digit_mode, int precision, + int min_digits, int sign, TrimMode trim, int pad_left, + int exp_digits); + +#ifdef __cplusplus +} +#endif + +#endif /* _QUADDTYPE_DRAGON4_H */ \ No newline at end of file diff --git a/quaddtype/numpy_quaddtype/src/dtype.c b/quaddtype/numpy_quaddtype/src/dtype.c new file mode 100644 index 00000000..7d0fad37 --- /dev/null +++ b/quaddtype/numpy_quaddtype/src/dtype.c @@ -0,0 +1,266 @@ +#include +#include +#include + +#define PY_ARRAY_UNIQUE_SYMBOL QuadPrecType_ARRAY_API +#define PY_UFUNC_UNIQUE_SYMBOL QuadPrecType_UFUNC_API +#define 
NPY_NO_DEPRECATED_API NPY_2_0_API_VERSION +#define NPY_TARGET_VERSION NPY_2_0_API_VERSION +#define NO_IMPORT_ARRAY +#define NO_IMPORT_UFUNC +#include "numpy/arrayobject.h" +#include "numpy/ndarraytypes.h" +#include "numpy/dtype_api.h" + +#include "scalar.h" +#include "casts.h" +#include "dtype.h" +#include "dragon4.h" + +static inline int +quad_load(void *x, char *data_ptr, QuadBackendType backend) +{ + if (data_ptr == NULL || x == NULL) { + return -1; + } + if (backend == BACKEND_SLEEF) { + *(Sleef_quad *)x = *(Sleef_quad *)data_ptr; + } + else { + *(long double *)x = *(long double *)data_ptr; + } + return 0; +} + +static inline int +quad_store(char *data_ptr, void *x, QuadBackendType backend) +{ + if (data_ptr == NULL || x == NULL) { + return -1; + } + if (backend == BACKEND_SLEEF) { + *(Sleef_quad *)data_ptr = *(Sleef_quad *)x; + } + else { + *(long double *)data_ptr = *(long double *)x; + } + return 0; +} + +QuadPrecDTypeObject * +new_quaddtype_instance(QuadBackendType backend) +{ + QuadBackendType target_backend = backend; + if (backend != BACKEND_SLEEF && backend != BACKEND_LONGDOUBLE) { + PyErr_SetString(PyExc_TypeError, "Backend must be sleef or longdouble"); + return NULL; + // target_backend = BACKEND_SLEEF; + } + + QuadPrecDTypeObject *new = (QuadPrecDTypeObject *)PyArrayDescr_Type.tp_new( + (PyTypeObject *)&QuadPrecDType, NULL, NULL); + if (new == NULL) { + return NULL; + } + new->base.elsize = (target_backend == BACKEND_SLEEF) ? sizeof(Sleef_quad) : sizeof(long double); + new->base.alignment = + (target_backend == BACKEND_SLEEF) ? _Alignof(Sleef_quad) : _Alignof(long double); + new->backend = target_backend; + return new; +} + +static QuadPrecDTypeObject * +ensure_canonical(QuadPrecDTypeObject *self) +{ + Py_INCREF(self); + return self; +} + +static QuadPrecDTypeObject * +common_instance(QuadPrecDTypeObject *dtype1, QuadPrecDTypeObject *dtype2) +{ + // if backend mismatch then return SLEEF one (safe to cast ld to quad) + if (dtype1->backend != dtype2->backend) { + if (dtype1->backend == BACKEND_SLEEF) { + Py_INCREF(dtype1); + return dtype1; + } + + Py_INCREF(dtype2); + return dtype2; + } + Py_INCREF(dtype1); + return dtype1; +} + +static PyArray_DTypeMeta * +common_dtype(PyArray_DTypeMeta *cls, PyArray_DTypeMeta *other) +{ + // Promote integer and floating-point types to QuadPrecDType + if (other->type_num >= 0 && + (PyTypeNum_ISINTEGER(other->type_num) || PyTypeNum_ISFLOAT(other->type_num))) { + Py_INCREF(cls); + return cls; + } + // Don't promote complex types + if (PyTypeNum_ISCOMPLEX(other->type_num)) { + Py_INCREF(Py_NotImplemented); + return (PyArray_DTypeMeta *)Py_NotImplemented; + } + + Py_INCREF(Py_NotImplemented); + return (PyArray_DTypeMeta *)Py_NotImplemented; +} + +static PyArray_Descr * +quadprec_discover_descriptor_from_pyobject(PyArray_DTypeMeta *NPY_UNUSED(cls), PyObject *obj) +{ + if (Py_TYPE(obj) != &QuadPrecision_Type) { + PyErr_SetString(PyExc_TypeError, "Can only store QuadPrecision in a QuadPrecDType array."); + return NULL; + } + + QuadPrecisionObject *quad_obj = (QuadPrecisionObject *)obj; + + return (PyArray_Descr *)new_quaddtype_instance(quad_obj->backend); +} + +static int +quadprec_setitem(QuadPrecDTypeObject *descr, PyObject *obj, char *dataptr) +{ + QuadPrecisionObject *value; + if (PyObject_TypeCheck(obj, &QuadPrecision_Type)) { + Py_INCREF(obj); + value = (QuadPrecisionObject *)obj; + } + else { + value = QuadPrecision_from_object(obj, descr->backend); + if (value == NULL) { + return -1; + } + } + + if (quad_store(dataptr, &value->value, 
descr->backend) < 0) {
+        Py_DECREF(value);
+        char error_msg[100];
+        snprintf(error_msg, sizeof(error_msg), "Invalid memory location %p", (void *)dataptr);
+        PyErr_SetString(PyExc_ValueError, error_msg);
+        return -1;
+    }
+
+    Py_DECREF(value);
+    return 0;
+}
+
+static PyObject *
+quadprec_getitem(QuadPrecDTypeObject *descr, char *dataptr)
+{
+    QuadPrecisionObject *new = QuadPrecision_raw_new(descr->backend);
+    if (!new) {
+        return NULL;
+    }
+    if (quad_load(&new->value, dataptr, descr->backend) < 0) {
+        Py_DECREF(new);
+        char error_msg[100];
+        snprintf(error_msg, sizeof(error_msg), "Invalid memory location %p", (void *)dataptr);
+        PyErr_SetString(PyExc_ValueError, error_msg);
+        return NULL;
+    }
+    return (PyObject *)new;
+}
+
+static PyArray_Descr *
+quadprec_default_descr(PyArray_DTypeMeta *NPY_UNUSED(cls))
+{
+    return (PyArray_Descr *)new_quaddtype_instance(BACKEND_SLEEF);
+}
+
+static PyType_Slot QuadPrecDType_Slots[] = {
+        {NPY_DT_ensure_canonical, &ensure_canonical},
+        {NPY_DT_common_instance, &common_instance},
+        {NPY_DT_common_dtype, &common_dtype},
+        {NPY_DT_discover_descr_from_pyobject, &quadprec_discover_descriptor_from_pyobject},
+        {NPY_DT_setitem, &quadprec_setitem},
+        {NPY_DT_getitem, &quadprec_getitem},
+        {NPY_DT_default_descr, &quadprec_default_descr},
+        {NPY_DT_PyArray_ArrFuncs_dotfunc, NULL},
+        {0, NULL}};
+
+static PyObject *
+QuadPrecDType_new(PyTypeObject *NPY_UNUSED(cls), PyObject *args, PyObject *kwds)
+{
+    static char *kwlist[] = {"backend", NULL};
+    const char *backend_str = "sleef";
+
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|s", kwlist, &backend_str)) {
+        return NULL;
+    }
+
+    QuadBackendType backend = BACKEND_SLEEF;
+    if (strcmp(backend_str, "longdouble") == 0) {
+        backend = BACKEND_LONGDOUBLE;
+    }
+    else if (strcmp(backend_str, "sleef") != 0) {
+        PyErr_SetString(PyExc_ValueError, "Invalid backend. Use 'sleef' or 'longdouble'.");
+        return NULL;
+    }
+
+    /* construct the descriptor directly rather than routing through a
+     * temporary QuadPrecision scalar, which would have to be checked for
+     * allocation failure and released again */
+    return (PyObject *)new_quaddtype_instance(backend);
+}
+
+static PyObject *
+QuadPrecDType_repr(QuadPrecDTypeObject *self)
+{
+    const char *backend_str = (self->backend == BACKEND_SLEEF) ? "sleef" : "longdouble";
+    return PyUnicode_FromFormat("QuadPrecDType(backend='%s')", backend_str);
+}
+
+static PyObject *
+QuadPrecDType_str(QuadPrecDTypeObject *self)
+{
+    const char *backend_str = (self->backend == BACKEND_SLEEF) ?
"sleef" : "longdouble"; + return PyUnicode_FromFormat("QuadPrecDType(backend='%s')", backend_str); +} + +PyArray_DTypeMeta QuadPrecDType = { + {{ + PyVarObject_HEAD_INIT(NULL, 0).tp_name = "numpy_quaddtype.QuadPrecDType", + .tp_basicsize = sizeof(QuadPrecDTypeObject), + .tp_new = QuadPrecDType_new, + .tp_repr = (reprfunc)QuadPrecDType_repr, + .tp_str = (reprfunc)QuadPrecDType_str, + }}, +}; + +int +init_quadprec_dtype(void) +{ + PyArrayMethod_Spec **casts = init_casts(); + if (!casts) + return -1; + + PyArrayDTypeMeta_Spec QuadPrecDType_DTypeSpec = { + .flags = NPY_DT_PARAMETRIC | NPY_DT_NUMERIC, + .casts = casts, + .typeobj = &QuadPrecision_Type, + .slots = QuadPrecDType_Slots, + }; + + ((PyObject *)&QuadPrecDType)->ob_type = &PyArrayDTypeMeta_Type; + + ((PyTypeObject *)&QuadPrecDType)->tp_base = &PyArrayDescr_Type; + + if (PyType_Ready((PyTypeObject *)&QuadPrecDType) < 0) { + return -1; + } + + if (PyArrayInitDTypeMeta_FromSpec(&QuadPrecDType, &QuadPrecDType_DTypeSpec) < 0) { + return -1; + } + + free_casts(); + + return 0; +} \ No newline at end of file diff --git a/quaddtype/numpy_quaddtype/src/dtype.h b/quaddtype/numpy_quaddtype/src/dtype.h new file mode 100644 index 00000000..77a65449 --- /dev/null +++ b/quaddtype/numpy_quaddtype/src/dtype.h @@ -0,0 +1,30 @@ +#ifndef _QUADDTYPE_DTYPE_H +#define _QUADDTYPE_DTYPE_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include +#include "quad_common.h" + +typedef struct { + PyArray_Descr base; + QuadBackendType backend; +} QuadPrecDTypeObject; + +extern PyArray_DTypeMeta QuadPrecDType; + +QuadPrecDTypeObject * +new_quaddtype_instance(QuadBackendType backend); + +int +init_quadprec_dtype(void); + +#ifdef __cplusplus +} +#endif + +#endif \ No newline at end of file diff --git a/quaddtype/numpy_quaddtype/src/ops.hpp b/quaddtype/numpy_quaddtype/src/ops.hpp new file mode 100644 index 00000000..b32ae386 --- /dev/null +++ b/quaddtype/numpy_quaddtype/src/ops.hpp @@ -0,0 +1,453 @@ +#include +#include +#include + +// Unary Quad Operations +typedef Sleef_quad (*unary_op_quad_def)(Sleef_quad *); + +static Sleef_quad +quad_negative(Sleef_quad *op) +{ + return Sleef_negq1(*op); +} + +static Sleef_quad +quad_positive(Sleef_quad *op) +{ + return *op; +} + +static inline Sleef_quad +quad_absolute(Sleef_quad *op) +{ + return Sleef_fabsq1(*op); +} + +static inline Sleef_quad +quad_rint(Sleef_quad *op) +{ + return Sleef_rintq1(*op); +} + +static inline Sleef_quad +quad_trunc(Sleef_quad *op) +{ + return Sleef_truncq1(*op); +} + +static inline Sleef_quad +quad_floor(Sleef_quad *op) +{ + return Sleef_floorq1(*op); +} + +static inline Sleef_quad +quad_ceil(Sleef_quad *op) +{ + return Sleef_ceilq1(*op); +} + +static inline Sleef_quad +quad_sqrt(Sleef_quad *op) +{ + return Sleef_sqrtq1_u05(*op); +} + +static inline Sleef_quad +quad_square(Sleef_quad *op) +{ + return Sleef_mulq1_u05(*op, *op); +} + +static inline Sleef_quad +quad_log(Sleef_quad *op) +{ + return Sleef_logq1_u10(*op); +} + +static inline Sleef_quad +quad_log2(Sleef_quad *op) +{ + return Sleef_log2q1_u10(*op); +} + +static inline Sleef_quad +quad_log10(Sleef_quad *op) +{ + return Sleef_log10q1_u10(*op); +} + +static inline Sleef_quad +quad_log1p(Sleef_quad *op) +{ + return Sleef_log1pq1_u10(*op); +} + +static inline Sleef_quad +quad_exp(Sleef_quad *op) +{ + return Sleef_expq1_u10(*op); +} + +static inline Sleef_quad +quad_exp2(Sleef_quad *op) +{ + return Sleef_exp2q1_u10(*op); +} + +static inline Sleef_quad +quad_sin(Sleef_quad *op) +{ + return Sleef_sinq1_u10(*op); +} + +static inline 
Sleef_quad +quad_cos(Sleef_quad *op) +{ + return Sleef_cosq1_u10(*op); +} + +static inline Sleef_quad +quad_tan(Sleef_quad *op) +{ + return Sleef_tanq1_u10(*op); +} + +static inline Sleef_quad +quad_asin(Sleef_quad *op) +{ + return Sleef_asinq1_u10(*op); +} + +static inline Sleef_quad +quad_acos(Sleef_quad *op) +{ + return Sleef_acosq1_u10(*op); +} + +static inline Sleef_quad +quad_atan(Sleef_quad *op) +{ + return Sleef_atanq1_u10(*op); +} + +// Unary long double operations +typedef long double (*unary_op_longdouble_def)(long double *); + +static inline long double +ld_negative(long double *op) +{ + return -(*op); +} + +static inline long double +ld_positive(long double *op) +{ + return *op; +} + +static inline long double +ld_absolute(long double *op) +{ + return fabsl(*op); +} + +static inline long double +ld_rint(long double *op) +{ + return rintl(*op); +} + +static inline long double +ld_trunc(long double *op) +{ + return truncl(*op); +} + +static inline long double +ld_floor(long double *op) +{ + return floorl(*op); +} + +static inline long double +ld_ceil(long double *op) +{ + return ceill(*op); +} + +static inline long double +ld_sqrt(long double *op) +{ + return sqrtl(*op); +} + +static inline long double +ld_square(long double *op) +{ + return (*op) * (*op); +} + +static inline long double +ld_log(long double *op) +{ + return logl(*op); +} + +static inline long double +ld_log2(long double *op) +{ + return log2l(*op); +} + +static inline long double +ld_log10(long double *op) +{ + return log10l(*op); +} + +static inline long double +ld_log1p(long double *op) +{ + return log1pl(*op); +} + +static inline long double +ld_exp(long double *op) +{ + return expl(*op); +} + +static inline long double +ld_exp2(long double *op) +{ + return exp2l(*op); +} + +static inline long double +ld_sin(long double *op) +{ + return sinl(*op); +} + +static inline long double +ld_cos(long double *op) +{ + return cosl(*op); +} + +static inline long double +ld_tan(long double *op) +{ + return tanl(*op); +} + +static inline long double +ld_asin(long double *op) +{ + return asinl(*op); +} + +static inline long double +ld_acos(long double *op) +{ + return acosl(*op); +} + +static inline long double +ld_atan(long double *op) +{ + return atanl(*op); +} + +// Binary Quad operations +typedef Sleef_quad (*binary_op_quad_def)(Sleef_quad *, Sleef_quad *); + +static inline Sleef_quad +quad_add(Sleef_quad *in1, Sleef_quad *in2) +{ + return Sleef_addq1_u05(*in1, *in2); +} + +static inline Sleef_quad +quad_sub(Sleef_quad *in1, Sleef_quad *in2) +{ + return Sleef_subq1_u05(*in1, *in2); +} + +static inline Sleef_quad +quad_mul(Sleef_quad *a, Sleef_quad *b) +{ + return Sleef_mulq1_u05(*a, *b); +} + +static inline Sleef_quad +quad_div(Sleef_quad *a, Sleef_quad *b) +{ + return Sleef_divq1_u05(*a, *b); +} + +static inline Sleef_quad +quad_pow(Sleef_quad *a, Sleef_quad *b) +{ + return Sleef_powq1_u10(*a, *b); +} + +static inline Sleef_quad +quad_mod(Sleef_quad *a, Sleef_quad *b) +{ + return Sleef_fmodq1(*a, *b); +} + +static inline Sleef_quad +quad_minimum(Sleef_quad *in1, Sleef_quad *in2) +{ + return Sleef_icmpleq1(*in1, *in2) ? *in1 : *in2; +} + +static inline Sleef_quad +quad_maximum(Sleef_quad *in1, Sleef_quad *in2) +{ + return Sleef_icmpgeq1(*in1, *in2) ? 
*in1 : *in2; +} + +static inline Sleef_quad +quad_atan2(Sleef_quad *in1, Sleef_quad *in2) +{ + return Sleef_atan2q1_u10(*in1, *in2); +} + +// Binary long double operations +typedef long double (*binary_op_longdouble_def)(long double *, long double *); + +static inline long double +ld_add(long double *in1, long double *in2) +{ + return (*in1) + (*in2); +} + +static inline long double +ld_sub(long double *in1, long double *in2) +{ + return (*in1) - (*in2); +} + +static inline long double +ld_mul(long double *a, long double *b) +{ + return (*a) * (*b); +} + +static inline long double +ld_div(long double *a, long double *b) +{ + return (*a) / (*b); +} + +static inline long double +ld_pow(long double *a, long double *b) +{ + return powl(*a, *b); +} + +static inline long double +ld_mod(long double *a, long double *b) +{ + return fmodl(*a, *b); +} + +static inline long double +ld_minimum(long double *in1, long double *in2) +{ + return (*in1 < *in2) ? *in1 : *in2; +} + +static inline long double +ld_maximum(long double *in1, long double *in2) +{ + return (*in1 > *in2) ? *in1 : *in2; +} + +static inline long double +ld_atan2(long double *in1, long double *in2) +{ + return atan2l(*in1, *in2); +} + +// comparison quad functions +typedef npy_bool (*cmp_quad_def)(const Sleef_quad *, const Sleef_quad *); + +static inline npy_bool +quad_equal(const Sleef_quad *a, const Sleef_quad *b) +{ + return Sleef_icmpeqq1(*a, *b); +} + +static inline npy_bool +quad_notequal(const Sleef_quad *a, const Sleef_quad *b) +{ + return Sleef_icmpneq1(*a, *b); +} + +static inline npy_bool +quad_less(const Sleef_quad *a, const Sleef_quad *b) +{ + return Sleef_icmpltq1(*a, *b); +} + +static inline npy_bool +quad_lessequal(const Sleef_quad *a, const Sleef_quad *b) +{ + return Sleef_icmpleq1(*a, *b); +} + +static inline npy_bool +quad_greater(const Sleef_quad *a, const Sleef_quad *b) +{ + return Sleef_icmpgtq1(*a, *b); +} + +static inline npy_bool +quad_greaterequal(const Sleef_quad *a, const Sleef_quad *b) +{ + return Sleef_icmpgeq1(*a, *b); +} + +// comparison quad functions +typedef npy_bool (*cmp_londouble_def)(const long double *, const long double *); + +static inline npy_bool +ld_equal(const long double *a, const long double *b) +{ + return *a == *b; +} + +static inline npy_bool +ld_notequal(const long double *a, const long double *b) +{ + return *a != *b; +} + +static inline npy_bool +ld_less(const long double *a, const long double *b) +{ + return *a < *b; +} + +static inline npy_bool +ld_lessequal(const long double *a, const long double *b) +{ + return *a <= *b; +} + +static inline npy_bool +ld_greater(const long double *a, const long double *b) +{ + return *a > *b; +} + +static inline npy_bool +ld_greaterequal(const long double *a, const long double *b) +{ + return *a >= *b; +} \ No newline at end of file diff --git a/quaddtype/numpy_quaddtype/src/quad_common.h b/quaddtype/numpy_quaddtype/src/quad_common.h new file mode 100644 index 00000000..bc578a29 --- /dev/null +++ b/quaddtype/numpy_quaddtype/src/quad_common.h @@ -0,0 +1,18 @@ +#ifndef _QUADDTYPE_COMMON_H +#define _QUADDTYPE_COMMON_H + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum { + BACKEND_INVALID = -1, + BACKEND_SLEEF, + BACKEND_LONGDOUBLE +} QuadBackendType; + +#ifdef __cplusplus +} +#endif + +#endif \ No newline at end of file diff --git a/quaddtype/numpy_quaddtype/src/quaddtype_main.c b/quaddtype/numpy_quaddtype/src/quaddtype_main.c new file mode 100644 index 00000000..f2935299 --- /dev/null +++ b/quaddtype/numpy_quaddtype/src/quaddtype_main.c @@ 
-0,0 +1,116 @@ +#include +#include +#include +#include + +#define PY_ARRAY_UNIQUE_SYMBOL QuadPrecType_ARRAY_API +#define PY_UFUNC_UNIQUE_SYMBOL QuadPrecType_UFUNC_API +#define NPY_NO_DEPRECATED_API NPY_2_0_API_VERSION +#define NPY_TARGET_VERSION NPY_2_0_API_VERSION + +#include "numpy/arrayobject.h" +#include "numpy/dtype_api.h" +#include "numpy/ufuncobject.h" + +#include "scalar.h" +#include "dtype.h" +#include "umath.h" +#include "quad_common.h" +#include "float.h" + + +static PyObject* py_is_longdouble_128(PyObject* self, PyObject* args) { + if(sizeof(long double) == 16 && + LDBL_MANT_DIG == 113 && + LDBL_MAX_EXP == 16384) { + Py_RETURN_TRUE; + } else { + Py_RETURN_FALSE; + } +} + +static PyObject* get_sleef_constant(PyObject* self, PyObject* args) { + const char* constant_name; + if (!PyArg_ParseTuple(args, "s", &constant_name)) { + return NULL; + } + + QuadPrecisionObject* result = QuadPrecision_raw_new(BACKEND_SLEEF); + if (result == NULL) { + return NULL; + } + + if (strcmp(constant_name, "pi") == 0) { + result->value.sleef_value = SLEEF_M_PIq; + } else if (strcmp(constant_name, "e") == 0) { + result->value.sleef_value = SLEEF_M_Eq; + } else if (strcmp(constant_name, "log2e") == 0) { + result->value.sleef_value = SLEEF_M_LOG2Eq; + } else if (strcmp(constant_name, "log10e") == 0) { + result->value.sleef_value = SLEEF_M_LOG10Eq; + } else if (strcmp(constant_name, "ln2") == 0) { + result->value.sleef_value = SLEEF_M_LN2q; + } else if (strcmp(constant_name, "ln10") == 0) { + result->value.sleef_value = SLEEF_M_LN10q; + } else if (strcmp(constant_name, "quad_max") == 0) { + result->value.sleef_value = SLEEF_QUAD_MAX; + } else if (strcmp(constant_name, "quad_min") == 0) { + result->value.sleef_value = SLEEF_QUAD_MIN; + } else if (strcmp(constant_name, "epsilon") == 0) { + result->value.sleef_value = SLEEF_QUAD_EPSILON; + } + else { + PyErr_SetString(PyExc_ValueError, "Unknown constant name"); + Py_DECREF(result); + return NULL; + } + + return (PyObject*)result; +} + +static PyMethodDef module_methods[] = { + {"is_longdouble_128", py_is_longdouble_128, METH_NOARGS, "Check if long double is 128-bit"}, + {"get_sleef_constant", get_sleef_constant, METH_VARARGS, "Get Sleef constant by name"}, + {NULL, NULL, 0, NULL} +}; + +static struct PyModuleDef moduledef = { + PyModuleDef_HEAD_INIT, + .m_name = "_quaddtype_main", + .m_doc = "Quad (128-bit) floating point Data Type for NumPy with multiple backends", + .m_size = -1, + .m_methods = module_methods +}; + +PyMODINIT_FUNC +PyInit__quaddtype_main(void) +{ + import_array(); + import_umath(); + PyObject *m = PyModule_Create(&moduledef); + if (!m) { + return NULL; + } + + if (init_quadprecision_scalar() < 0) + goto error; + + if (PyModule_AddObject(m, "QuadPrecision", (PyObject *)&QuadPrecision_Type) < 0) + goto error; + + if (init_quadprec_dtype() < 0) + goto error; + + if (PyModule_AddObject(m, "QuadPrecDType", (PyObject *)&QuadPrecDType) < 0) + goto error; + + if (init_quad_umath() < 0) { + goto error; + } + + return m; + +error: + Py_XDECREF(m); + return NULL; +} \ No newline at end of file diff --git a/quaddtype/numpy_quaddtype/src/scalar.c b/quaddtype/numpy_quaddtype/src/scalar.c new file mode 100644 index 00000000..9bec08a6 --- /dev/null +++ b/quaddtype/numpy_quaddtype/src/scalar.c @@ -0,0 +1,254 @@ +#include +#include +#include +#include + +#define PY_ARRAY_UNIQUE_SYMBOL QuadPrecType_ARRAY_API +#define NPY_NO_DEPRECATED_API NPY_2_0_API_VERSION +#define NO_IMPORT_ARRAY + +#include "numpy/arrayobject.h" +#include "numpy/ndarraytypes.h" +#include 
"numpy/dtype_api.h" + +#include "scalar.h" +#include "scalar_ops.h" +#include "dragon4.h" + +QuadPrecisionObject * +QuadPrecision_raw_new(QuadBackendType backend) +{ + QuadPrecisionObject *new = PyObject_New(QuadPrecisionObject, &QuadPrecision_Type); + if (!new) + return NULL; + new->backend = backend; + if (backend == BACKEND_SLEEF) { + new->value.sleef_value = Sleef_cast_from_doubleq1(0.0); + } + else { + new->value.longdouble_value = 0.0L; + } + return new; +} + +QuadPrecisionObject * +QuadPrecision_from_object(PyObject *value, QuadBackendType backend) +{ + QuadPrecisionObject *self = QuadPrecision_raw_new(backend); + if (!self) + return NULL; + + if (PyFloat_Check(value)) { + double dval = PyFloat_AsDouble(value); + if (backend == BACKEND_SLEEF) { + self->value.sleef_value = Sleef_cast_from_doubleq1(dval); + } + else { + self->value.longdouble_value = (long double)dval; + } + } + else if (PyUnicode_CheckExact(value)) { + const char *s = PyUnicode_AsUTF8(value); + char *endptr = NULL; + if (backend == BACKEND_SLEEF) { + self->value.sleef_value = Sleef_strtoq(s, &endptr); + } + else { + self->value.longdouble_value = strtold(s, &endptr); + } + if (*endptr != '\0' || endptr == s) { + PyErr_SetString(PyExc_ValueError, "Unable to parse string to QuadPrecision"); + Py_DECREF(self); + return NULL; + } + } + else if (PyLong_Check(value)) { + long long val = PyLong_AsLongLong(value); + if (val == -1 && PyErr_Occurred()) { + PyErr_SetString(PyExc_OverflowError, "Overflow Error, value out of range"); + Py_DECREF(self); + return NULL; + } + if (backend == BACKEND_SLEEF) { + self->value.sleef_value = Sleef_cast_from_int64q1(val); + } + else { + self->value.longdouble_value = (long double)val; + } + } + else if (Py_TYPE(value) == &QuadPrecision_Type) { + Py_DECREF(self); // discard the default one + QuadPrecisionObject *quad_obj = (QuadPrecisionObject *)value; + // create a new one with the same backend + QuadPrecisionObject *self = QuadPrecision_raw_new(quad_obj->backend); + if (quad_obj->backend == BACKEND_SLEEF) { + self->value.sleef_value = quad_obj->value.sleef_value; + } + else { + self->value.longdouble_value = quad_obj->value.longdouble_value; + } + + return self; + } + else { + PyObject *type_str = PyObject_Str((PyObject *)Py_TYPE(value)); + if (type_str != NULL) { + const char *type_cstr = PyUnicode_AsUTF8(type_str); + if (type_cstr != NULL) { + PyErr_Format(PyExc_TypeError, + "QuadPrecision value must be a quad, float, int or string, but got %s " + "instead", + type_cstr); + } + else { + PyErr_SetString( + PyExc_TypeError, + "QuadPrecision value must be a quad, float, int or string, but got an " + "unknown type instead"); + } + Py_DECREF(type_str); + } + else { + PyErr_SetString(PyExc_TypeError, + "QuadPrecision value must be a quad, float, int or string, but got an " + "unknown type instead"); + } + Py_DECREF(self); + return NULL; + } + + return self; +} + +static PyObject * +QuadPrecision_new(PyTypeObject *cls, PyObject *args, PyObject *kwargs) +{ + PyObject *value; + const char *backend_str = "sleef"; + static char *kwlist[] = {"value", "backend", NULL}; + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|s", kwlist, &value, &backend_str)) { + return NULL; + } + + QuadBackendType backend = BACKEND_SLEEF; + if (strcmp(backend_str, "longdouble") == 0) { + backend = BACKEND_LONGDOUBLE; + } + else if (strcmp(backend_str, "sleef") != 0) { + PyErr_SetString(PyExc_ValueError, "Invalid backend. 
Use 'sleef' or 'longdouble'."); + return NULL; + } + + return (PyObject *)QuadPrecision_from_object(value, backend); +} + +static PyObject * +QuadPrecision_str_dragon4(QuadPrecisionObject *self) +{ + Dragon4_Options opt = {.scientific = 0, + .digit_mode = DigitMode_Unique, + .cutoff_mode = CutoffMode_TotalLength, + .precision = SLEEF_QUAD_DIG, + .sign = 1, + .trim_mode = TrimMode_LeaveOneZero, + .digits_left = 1, + .digits_right = SLEEF_QUAD_DIG}; + + if (self->backend == BACKEND_SLEEF) { + return Dragon4_Positional_QuadDType( + &self->value.sleef_value, opt.digit_mode, opt.cutoff_mode, opt.precision, + opt.min_digits, opt.sign, opt.trim_mode, opt.digits_left, opt.digits_right); + } + else { + Sleef_quad sleef_val = Sleef_cast_from_doubleq1(self->value.longdouble_value); + return Dragon4_Positional_QuadDType(&sleef_val, opt.digit_mode, opt.cutoff_mode, + opt.precision, opt.min_digits, opt.sign, opt.trim_mode, + opt.digits_left, opt.digits_right); + } +} + +static PyObject * +QuadPrecision_str(QuadPrecisionObject *self) +{ + char buffer[128]; + if (self->backend == BACKEND_SLEEF) { + Sleef_snprintf(buffer, sizeof(buffer), "%.*Qe", SLEEF_QUAD_DIG, self->value.sleef_value); + } + else { + snprintf(buffer, sizeof(buffer), "%.35Le", self->value.longdouble_value); + } + return PyUnicode_FromString(buffer); +} + +static PyObject * +QuadPrecision_repr(QuadPrecisionObject *self) +{ + PyObject *str = QuadPrecision_str(self); + if (str == NULL) { + return NULL; + } + const char *backend_str = (self->backend == BACKEND_SLEEF) ? "sleef" : "longdouble"; + PyObject *res = PyUnicode_FromFormat("QuadPrecision('%S', backend='%s')", str, backend_str); + Py_DECREF(str); + return res; +} + +static PyObject * +QuadPrecision_repr_dragon4(QuadPrecisionObject *self) +{ + Dragon4_Options opt = {.scientific = 1, + .digit_mode = DigitMode_Unique, + .cutoff_mode = CutoffMode_TotalLength, + .precision = SLEEF_QUAD_DIG, + .sign = 1, + .trim_mode = TrimMode_LeaveOneZero, + .digits_left = 1, + .exp_digits = 3}; + + PyObject *str; + if (self->backend == BACKEND_SLEEF) { + str = Dragon4_Scientific_QuadDType(&self->value.sleef_value, opt.digit_mode, opt.precision, + opt.min_digits, opt.sign, opt.trim_mode, opt.digits_left, + opt.exp_digits); + } + else { + Sleef_quad sleef_val = Sleef_cast_from_doubleq1(self->value.longdouble_value); + str = Dragon4_Scientific_QuadDType(&sleef_val, opt.digit_mode, opt.precision, + opt.min_digits, opt.sign, opt.trim_mode, opt.digits_left, + opt.exp_digits); + } + + if (str == NULL) { + return NULL; + } + + const char *backend_str = (self->backend == BACKEND_SLEEF) ? 
"sleef" : "longdouble"; + PyObject *res = PyUnicode_FromFormat("QuadPrecision('%S', backend='%s')", str, backend_str); + Py_DECREF(str); + return res; +} + +static void +QuadPrecision_dealloc(QuadPrecisionObject *self) +{ + Py_TYPE(self)->tp_free((PyObject *)self); +} + +PyTypeObject QuadPrecision_Type = { + PyVarObject_HEAD_INIT(NULL, 0).tp_name = "numpy_quaddtype.QuadPrecision", + .tp_basicsize = sizeof(QuadPrecisionObject), + .tp_itemsize = 0, + .tp_new = QuadPrecision_new, + .tp_dealloc = (destructor)QuadPrecision_dealloc, + .tp_repr = (reprfunc)QuadPrecision_repr_dragon4, + .tp_str = (reprfunc)QuadPrecision_str_dragon4, + .tp_as_number = &quad_as_scalar, + .tp_richcompare = (richcmpfunc)quad_richcompare, +}; + +int +init_quadprecision_scalar(void) +{ + return PyType_Ready(&QuadPrecision_Type); +} \ No newline at end of file diff --git a/quaddtype/numpy_quaddtype/src/scalar.h b/quaddtype/numpy_quaddtype/src/scalar.h new file mode 100644 index 00000000..4fac1adf --- /dev/null +++ b/quaddtype/numpy_quaddtype/src/scalar.h @@ -0,0 +1,41 @@ +#ifndef _QUADDTYPE_SCALAR_H +#define _QUADDTYPE_SCALAR_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include "quad_common.h" + +typedef union { + Sleef_quad sleef_value; + long double longdouble_value; +} quad_value; + +typedef struct { + PyObject_HEAD + quad_value value; + QuadBackendType backend; +} QuadPrecisionObject; + +extern PyTypeObject QuadPrecision_Type; + +QuadPrecisionObject * +QuadPrecision_raw_new(QuadBackendType backend); + +QuadPrecisionObject * +QuadPrecision_from_object(PyObject *value, QuadBackendType backend); + +int +init_quadprecision_scalar(void); + +#define PyArray_IsScalar(obj, QuadPrecDType) PyObject_TypeCheck(obj, &QuadPrecision_Type) +#define PyArrayScalar_VAL(obj, QuadPrecDType) (((QuadPrecisionObject *)obj)->value) + +#ifdef __cplusplus +} +#endif + +#endif \ No newline at end of file diff --git a/quaddtype/numpy_quaddtype/src/scalar_ops.cpp b/quaddtype/numpy_quaddtype/src/scalar_ops.cpp new file mode 100644 index 00000000..5888ad60 --- /dev/null +++ b/quaddtype/numpy_quaddtype/src/scalar_ops.cpp @@ -0,0 +1,243 @@ +#define PY_ARRAY_UNIQUE_SYMBOL QuadPrecType_ARRAY_API +#define NPY_NO_DEPRECATED_API NPY_2_0_API_VERSION +#define NPY_TARGET_VERSION NPY_2_0_API_VERSION +#define NO_IMPORT_ARRAY + +extern "C" { +#include + +#include "numpy/arrayobject.h" +#include "numpy/ndarraytypes.h" +#include "numpy/ufuncobject.h" + +#include "numpy/dtype_api.h" +} + +#include "scalar.h" +#include "ops.hpp" +#include "scalar_ops.h" +#include "quad_common.h" + +template +static PyObject * +quad_unary_func(QuadPrecisionObject *self) +{ + QuadPrecisionObject *res = QuadPrecision_raw_new(self->backend); + if (!res) { + return NULL; + } + + if (self->backend == BACKEND_SLEEF) { + res->value.sleef_value = sleef_op(&self->value.sleef_value); + } + else { + res->value.longdouble_value = longdouble_op(&self->value.longdouble_value); + } + return (PyObject *)res; +} + +PyObject * +quad_nonzero(QuadPrecisionObject *self) +{ + if (self->backend == BACKEND_SLEEF) { + return PyBool_FromLong(Sleef_icmpneq1(self->value.sleef_value, Sleef_cast_from_int64q1(0))); + } + else { + return PyBool_FromLong(self->value.longdouble_value != 0.0L); + } +} + +template +static PyObject * +quad_binary_func(PyObject *op1, PyObject *op2) +{ + QuadPrecisionObject *self; + PyObject *other; + QuadPrecisionObject *other_quad = NULL; + int is_forward; + QuadBackendType backend; + + if (PyObject_TypeCheck(op1, &QuadPrecision_Type)) { + is_forward = 1; + 
self = (QuadPrecisionObject *)op1; + other = Py_NewRef(op2); + backend = self->backend; + } + else { + is_forward = 0; + self = (QuadPrecisionObject *)op2; + other = Py_NewRef(op1); + backend = self->backend; + } + + if (PyObject_TypeCheck(other, &QuadPrecision_Type)) { + Py_INCREF(other); + other_quad = (QuadPrecisionObject *)other; + if (other_quad->backend != backend) { + PyErr_SetString(PyExc_TypeError, "Cannot mix QuadPrecision backends"); + Py_DECREF(other); + return NULL; + } + } + else if (PyLong_Check(other) || PyFloat_Check(other)) { + other_quad = QuadPrecision_from_object(other, backend); + if (!other_quad) { + Py_DECREF(other); + return NULL; + } + } + else { + Py_DECREF(other); + Py_RETURN_NOTIMPLEMENTED; + } + + QuadPrecisionObject *res = QuadPrecision_raw_new(backend); + if (!res) { + Py_DECREF(other_quad); + Py_DECREF(other); + return NULL; + } + + if (backend == BACKEND_SLEEF) { + if (is_forward) { + res->value.sleef_value = + sleef_op(&self->value.sleef_value, &other_quad->value.sleef_value); + } + else { + res->value.sleef_value = + sleef_op(&other_quad->value.sleef_value, &self->value.sleef_value); + } + } + else { + if (is_forward) { + res->value.longdouble_value = longdouble_op(&self->value.longdouble_value, + &other_quad->value.longdouble_value); + } + else { + res->value.longdouble_value = longdouble_op(&other_quad->value.longdouble_value, + &self->value.longdouble_value); + } + } + + Py_DECREF(other_quad); + Py_DECREF(other); + return (PyObject *)res; +} + +PyObject * +quad_richcompare(QuadPrecisionObject *self, PyObject *other, int cmp_op) +{ + QuadPrecisionObject *other_quad = NULL; + QuadBackendType backend = self->backend; + + if (PyObject_TypeCheck(other, &QuadPrecision_Type)) { + Py_INCREF(other); + other_quad = (QuadPrecisionObject *)other; + if (other_quad->backend != backend) { + PyErr_SetString(PyExc_TypeError, + "Cannot compare QuadPrecision objects with different backends"); + Py_DECREF(other_quad); + return NULL; + } + } + else if (PyLong_CheckExact(other) || PyFloat_CheckExact(other)) { + other_quad = QuadPrecision_from_object(other, backend); + if (other_quad == NULL) { + return NULL; + } + } + else { + Py_RETURN_NOTIMPLEMENTED; + } + + int cmp; + if (backend == BACKEND_SLEEF) { + switch (cmp_op) { + case Py_LT: + cmp = Sleef_icmpltq1(self->value.sleef_value, other_quad->value.sleef_value); + break; + case Py_LE: + cmp = Sleef_icmpleq1(self->value.sleef_value, other_quad->value.sleef_value); + break; + case Py_EQ: + cmp = Sleef_icmpeqq1(self->value.sleef_value, other_quad->value.sleef_value); + break; + case Py_NE: + cmp = Sleef_icmpneq1(self->value.sleef_value, other_quad->value.sleef_value); + break; + case Py_GT: + cmp = Sleef_icmpgtq1(self->value.sleef_value, other_quad->value.sleef_value); + break; + case Py_GE: + cmp = Sleef_icmpgeq1(self->value.sleef_value, other_quad->value.sleef_value); + break; + default: + Py_DECREF(other_quad); + Py_RETURN_NOTIMPLEMENTED; + } + } + else { + switch (cmp_op) { + case Py_LT: + cmp = self->value.longdouble_value < other_quad->value.longdouble_value; + break; + case Py_LE: + cmp = self->value.longdouble_value <= other_quad->value.longdouble_value; + break; + case Py_EQ: + cmp = self->value.longdouble_value == other_quad->value.longdouble_value; + break; + case Py_NE: + cmp = self->value.longdouble_value != other_quad->value.longdouble_value; + break; + case Py_GT: + cmp = self->value.longdouble_value > other_quad->value.longdouble_value; + break; + case Py_GE: + cmp = self->value.longdouble_value >= 
other_quad->value.longdouble_value; + break; + default: + Py_DECREF(other_quad); + Py_RETURN_NOTIMPLEMENTED; + } + } + Py_DECREF(other_quad); + + return PyBool_FromLong(cmp); +} + +static PyObject * +QuadPrecision_float(QuadPrecisionObject *self) +{ + if (self->backend == BACKEND_SLEEF) { + return PyFloat_FromDouble(Sleef_cast_to_doubleq1(self->value.sleef_value)); + } + else { + return PyFloat_FromDouble((double)self->value.longdouble_value); + } +} + +static PyObject * +QuadPrecision_int(QuadPrecisionObject *self) +{ + if (self->backend == BACKEND_SLEEF) { + return PyLong_FromLongLong(Sleef_cast_to_int64q1(self->value.sleef_value)); + } + else { + return PyLong_FromLongLong((long long)self->value.longdouble_value); + } +} + +PyNumberMethods quad_as_scalar = { + .nb_add = (binaryfunc)quad_binary_func, + .nb_subtract = (binaryfunc)quad_binary_func, + .nb_multiply = (binaryfunc)quad_binary_func, + .nb_power = (ternaryfunc)quad_binary_func, + .nb_negative = (unaryfunc)quad_unary_func, + .nb_positive = (unaryfunc)quad_unary_func, + .nb_absolute = (unaryfunc)quad_unary_func, + .nb_bool = (inquiry)quad_nonzero, + .nb_int = (unaryfunc)QuadPrecision_int, + .nb_float = (unaryfunc)QuadPrecision_float, + .nb_true_divide = (binaryfunc)quad_binary_func, +}; \ No newline at end of file diff --git a/quaddtype/quaddtype/src/scalar_ops.h b/quaddtype/numpy_quaddtype/src/scalar_ops.h similarity index 100% rename from quaddtype/quaddtype/src/scalar_ops.h rename to quaddtype/numpy_quaddtype/src/scalar_ops.h diff --git a/quaddtype/numpy_quaddtype/src/umath.cpp b/quaddtype/numpy_quaddtype/src/umath.cpp new file mode 100644 index 00000000..0058236a --- /dev/null +++ b/quaddtype/numpy_quaddtype/src/umath.cpp @@ -0,0 +1,806 @@ +#define PY_ARRAY_UNIQUE_SYMBOL QuadPrecType_ARRAY_API +#define PY_UFUNC_UNIQUE_SYMBOL QuadPrecType_UFUNC_API +#define NPY_NO_DEPRECATED_API NPY_2_0_API_VERSION +#define NPY_TARGET_VERSION NPY_2_0_API_VERSION +#define NO_IMPORT_ARRAY +#define NO_IMPORT_UFUNC + +extern "C" { +#include +#include + +#include "numpy/arrayobject.h" +#include "numpy/ndarraytypes.h" +#include "numpy/ufuncobject.h" + +#include "numpy/dtype_api.h" +} +#include "quad_common.h" +#include "scalar.h" +#include "dtype.h" +#include "umath.h" +#include "ops.hpp" + +// helper debugging function +static const char * +get_dtype_name(PyArray_DTypeMeta *dtype) +{ + if (dtype == &QuadPrecDType) { + return "QuadPrecDType"; + } + else if (dtype == &PyArray_BoolDType) { + return "BoolDType"; + } + else if (dtype == &PyArray_ByteDType) { + return "ByteDType"; + } + else if (dtype == &PyArray_UByteDType) { + return "UByteDType"; + } + else if (dtype == &PyArray_ShortDType) { + return "ShortDType"; + } + else if (dtype == &PyArray_UShortDType) { + return "UShortDType"; + } + else if (dtype == &PyArray_IntDType) { + return "IntDType"; + } + else if (dtype == &PyArray_UIntDType) { + return "UIntDType"; + } + else if (dtype == &PyArray_LongDType) { + return "LongDType"; + } + else if (dtype == &PyArray_ULongDType) { + return "ULongDType"; + } + else if (dtype == &PyArray_LongLongDType) { + return "LongLongDType"; + } + else if (dtype == &PyArray_ULongLongDType) { + return "ULongLongDType"; + } + else if (dtype == &PyArray_FloatDType) { + return "FloatDType"; + } + else if (dtype == &PyArray_DoubleDType) { + return "DoubleDType"; + } + else if (dtype == &PyArray_LongDoubleDType) { + return "LongDoubleDType"; + } + else { + return "UnknownDType"; + } +} + +static NPY_CASTING +quad_unary_op_resolve_descriptors(PyObject *self, 
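+                                  /* resolver: propagates the operand
+                                     descriptor to the output and reports
+                                     NPY_UNSAFE_CASTING when the input and
+                                     output backends differ, so NumPy can
+                                     refuse the loop under stricter casting
+                                     rules */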
PyArray_DTypeMeta *const dtypes[], + PyArray_Descr *const given_descrs[], PyArray_Descr *loop_descrs[], + npy_intp *NPY_UNUSED(view_offset)) +{ + Py_INCREF(given_descrs[0]); + loop_descrs[0] = given_descrs[0]; + + if (given_descrs[1] == NULL) { + Py_INCREF(given_descrs[0]); + loop_descrs[1] = given_descrs[0]; + } + else { + Py_INCREF(given_descrs[1]); + loop_descrs[1] = given_descrs[1]; + } + + QuadPrecDTypeObject *descr_in = (QuadPrecDTypeObject *)given_descrs[0]; + QuadPrecDTypeObject *descr_out = (QuadPrecDTypeObject *)loop_descrs[1]; + + if (descr_in->backend != descr_out->backend) { + return NPY_UNSAFE_CASTING; + } + + return NPY_NO_CASTING; +} + +template +int +quad_generic_unary_op_strided_loop_unaligned(PyArrayMethod_Context *context, char *const data[], + npy_intp const dimensions[], npy_intp const strides[], + NpyAuxData *auxdata) +{ + npy_intp N = dimensions[0]; + char *in_ptr = data[0]; + char *out_ptr = data[1]; + npy_intp in_stride = strides[0]; + npy_intp out_stride = strides[1]; + + QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0]; + QuadBackendType backend = descr->backend; + size_t elem_size = (backend == BACKEND_SLEEF) ? sizeof(Sleef_quad) : sizeof(long double); + + quad_value in, out; + while (N--) { + memcpy(&in, in_ptr, elem_size); + if (backend == BACKEND_SLEEF) { + out.sleef_value = sleef_op(&in.sleef_value); + } + else { + out.longdouble_value = longdouble_op(&in.longdouble_value); + } + memcpy(out_ptr, &out, elem_size); + + in_ptr += in_stride; + out_ptr += out_stride; + } + return 0; +} + +template +int +quad_generic_unary_op_strided_loop_aligned(PyArrayMethod_Context *context, char *const data[], + npy_intp const dimensions[], npy_intp const strides[], + NpyAuxData *auxdata) +{ + npy_intp N = dimensions[0]; + char *in_ptr = data[0]; + char *out_ptr = data[1]; + npy_intp in_stride = strides[0]; + npy_intp out_stride = strides[1]; + + QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0]; + QuadBackendType backend = descr->backend; + + while (N--) { + if (backend == BACKEND_SLEEF) { + *(Sleef_quad *)out_ptr = sleef_op((Sleef_quad *)in_ptr); + } + else { + *(long double *)out_ptr = longdouble_op((long double *)in_ptr); + } + in_ptr += in_stride; + out_ptr += out_stride; + } + return 0; +} + +template +int +create_quad_unary_ufunc(PyObject *numpy, const char *ufunc_name) +{ + PyObject *ufunc = PyObject_GetAttrString(numpy, ufunc_name); + if (ufunc == NULL) { + return -1; + } + + PyArray_DTypeMeta *dtypes[2] = {&QuadPrecDType, &QuadPrecDType}; + + PyType_Slot slots[] = { + {NPY_METH_resolve_descriptors, (void *)&quad_unary_op_resolve_descriptors}, + {NPY_METH_strided_loop, + (void *)&quad_generic_unary_op_strided_loop_aligned}, + {NPY_METH_unaligned_strided_loop, + (void *)&quad_generic_unary_op_strided_loop_unaligned}, + {0, NULL}}; + + PyArrayMethod_Spec Spec = { + .name = "quad_unary_op", + .nin = 1, + .nout = 1, + .casting = NPY_NO_CASTING, + .flags = NPY_METH_SUPPORTS_UNALIGNED, + .dtypes = dtypes, + .slots = slots, + }; + + if (PyUFunc_AddLoopFromSpec(ufunc, &Spec) < 0) { + return -1; + } + + return 0; +} + +int +init_quad_unary_ops(PyObject *numpy) +{ + if (create_quad_unary_ufunc(numpy, "negative") < 0) { + return -1; + } + if (create_quad_unary_ufunc(numpy, "positive") < 0) { + return -1; + } + if (create_quad_unary_ufunc(numpy, "absolute") < 0) { + return -1; + } + if (create_quad_unary_ufunc(numpy, "rint") < 0) { + return -1; + } + if (create_quad_unary_ufunc(numpy, "trunc") < 0) { + return -1; + } + if 
(create_quad_unary_ufunc(numpy, "floor") < 0) { + return -1; + } + if (create_quad_unary_ufunc(numpy, "ceil") < 0) { + return -1; + } + if (create_quad_unary_ufunc(numpy, "sqrt") < 0) { + return -1; + } + if (create_quad_unary_ufunc(numpy, "square") < 0) { + return -1; + } + if (create_quad_unary_ufunc(numpy, "log") < 0) { + return -1; + } + if (create_quad_unary_ufunc(numpy, "log2") < 0) { + return -1; + } + if (create_quad_unary_ufunc(numpy, "log10") < 0) { + return -1; + } + if (create_quad_unary_ufunc(numpy, "log1p") < 0) { + return -1; + } + if (create_quad_unary_ufunc(numpy, "exp") < 0) { + return -1; + } + if (create_quad_unary_ufunc(numpy, "exp2") < 0) { + return -1; + } + if (create_quad_unary_ufunc(numpy, "sin") < 0) { + return -1; + } + if (create_quad_unary_ufunc(numpy, "cos") < 0) { + return -1; + } + if (create_quad_unary_ufunc(numpy, "tan") < 0) { + return -1; + } + if (create_quad_unary_ufunc(numpy, "arcsin") < 0) { + return -1; + } + if (create_quad_unary_ufunc(numpy, "arccos") < 0) { + return -1; + } + if (create_quad_unary_ufunc(numpy, "arctan") < 0) { + return -1; + } + return 0; +} + +// Binary ufuncs + +static NPY_CASTING +quad_binary_op_resolve_descriptors(PyObject *self, PyArray_DTypeMeta *const dtypes[], + PyArray_Descr *const given_descrs[], + PyArray_Descr *loop_descrs[], npy_intp *NPY_UNUSED(view_offset)) +{ + QuadPrecDTypeObject *descr_in1 = (QuadPrecDTypeObject *)given_descrs[0]; + QuadPrecDTypeObject *descr_in2 = (QuadPrecDTypeObject *)given_descrs[1]; + QuadBackendType target_backend; + + // Determine target backend and if casting is needed + NPY_CASTING casting = NPY_NO_CASTING; + if (descr_in1->backend != descr_in2->backend) { + target_backend = BACKEND_LONGDOUBLE; + casting = NPY_SAFE_CASTING; + } + else { + target_backend = descr_in1->backend; + } + + // Set up input descriptors, casting if necessary + for (int i = 0; i < 2; i++) { + if (((QuadPrecDTypeObject *)given_descrs[i])->backend != target_backend) { + loop_descrs[i] = (PyArray_Descr *)new_quaddtype_instance(target_backend); + if (!loop_descrs[i]) { + return (NPY_CASTING)-1; + } + } + else { + Py_INCREF(given_descrs[i]); + loop_descrs[i] = given_descrs[i]; + } + } + + // Set up output descriptor + if (given_descrs[2] == NULL) { + loop_descrs[2] = (PyArray_Descr *)new_quaddtype_instance(target_backend); + if (!loop_descrs[2]) { + return (NPY_CASTING)-1; + } + } + else { + QuadPrecDTypeObject *descr_out = (QuadPrecDTypeObject *)given_descrs[2]; + if (descr_out->backend != target_backend) { + loop_descrs[2] = (PyArray_Descr *)new_quaddtype_instance(target_backend); + if (!loop_descrs[2]) { + return (NPY_CASTING)-1; + } + } + else { + Py_INCREF(given_descrs[2]); + loop_descrs[2] = given_descrs[2]; + } + } + return casting; +} + +template +int +quad_generic_binop_strided_loop_unaligned(PyArrayMethod_Context *context, char *const data[], + npy_intp const dimensions[], npy_intp const strides[], + NpyAuxData *auxdata) +{ + npy_intp N = dimensions[0]; + char *in1_ptr = data[0], *in2_ptr = data[1]; + char *out_ptr = data[2]; + npy_intp in1_stride = strides[0]; + npy_intp in2_stride = strides[1]; + npy_intp out_stride = strides[2]; + + QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0]; + QuadBackendType backend = descr->backend; + size_t elem_size = (backend == BACKEND_SLEEF) ? 
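+                /* only the active member of the quad_value union is copied:
+                   Sleef_quad is always 16 bytes, while sizeof(long double)
+                   varies by platform (16 on x86-64 SysV, 8 under MSVC) */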
sizeof(Sleef_quad) : sizeof(long double); + + quad_value in1, in2, out; + while (N--) { + memcpy(&in1, in1_ptr, elem_size); + memcpy(&in2, in2_ptr, elem_size); + if (backend == BACKEND_SLEEF) { + out.sleef_value = sleef_op(&in1.sleef_value, &in2.sleef_value); + } + else { + out.longdouble_value = longdouble_op(&in1.longdouble_value, &in2.longdouble_value); + } + memcpy(out_ptr, &out, elem_size); + + in1_ptr += in1_stride; + in2_ptr += in2_stride; + out_ptr += out_stride; + } + return 0; +} + +template +int +quad_generic_binop_strided_loop_aligned(PyArrayMethod_Context *context, char *const data[], + npy_intp const dimensions[], npy_intp const strides[], + NpyAuxData *auxdata) +{ + npy_intp N = dimensions[0]; + char *in1_ptr = data[0], *in2_ptr = data[1]; + char *out_ptr = data[2]; + npy_intp in1_stride = strides[0]; + npy_intp in2_stride = strides[1]; + npy_intp out_stride = strides[2]; + + QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0]; + QuadBackendType backend = descr->backend; + + while (N--) { + if (backend == BACKEND_SLEEF) { + *(Sleef_quad *)out_ptr = sleef_op((Sleef_quad *)in1_ptr, (Sleef_quad *)in2_ptr); + } + else { + *(long double *)out_ptr = longdouble_op((long double *)in1_ptr, (long double *)in2_ptr); + } + + in1_ptr += in1_stride; + in2_ptr += in2_stride; + out_ptr += out_stride; + } + return 0; +} + +static int +quad_ufunc_promoter(PyUFuncObject *ufunc, PyArray_DTypeMeta *op_dtypes[], + PyArray_DTypeMeta *signature[], PyArray_DTypeMeta *new_op_dtypes[]) +{ + int nin = ufunc->nin; + int nargs = ufunc->nargs; + PyArray_DTypeMeta *common = NULL; + bool has_quad = false; + + // Handle the special case for reductions + if (op_dtypes[0] == NULL) { + assert(nin == 2 && ufunc->nout == 1); /* must be reduction */ + for (int i = 0; i < 3; i++) { + Py_INCREF(op_dtypes[1]); + new_op_dtypes[i] = op_dtypes[1]; + } + return 0; + } + + // Check if any input or signature is QuadPrecision + for (int i = 0; i < nin; i++) { + if (op_dtypes[i] == &QuadPrecDType) { + has_quad = true; + } + } + + if (has_quad) { + common = &QuadPrecDType; + } + else { + for (int i = nin; i < nargs; i++) { + if (signature[i] != NULL) { + if (common == NULL) { + Py_INCREF(signature[i]); + common = signature[i]; + } + else if (common != signature[i]) { + Py_CLEAR(common); // Not homogeneous, unset common + break; + } + } + } + } + // If no common output dtype, use standard promotion for inputs + if (common == NULL) { + common = PyArray_PromoteDTypeSequence(nin, op_dtypes); + if (common == NULL) { + if (PyErr_ExceptionMatches(PyExc_TypeError)) { + PyErr_Clear(); // Do not propagate normal promotion errors + } + + return -1; + } + } + + // Set all new_op_dtypes to the common dtype + for (int i = 0; i < nargs; i++) { + if (signature[i]) { + // If signature is specified for this argument, use it + Py_INCREF(signature[i]); + new_op_dtypes[i] = signature[i]; + } + else { + // Otherwise, use the common dtype + Py_INCREF(common); + + new_op_dtypes[i] = common; + } + } + + Py_XDECREF(common); + + return 0; +} + +template +int +create_quad_binary_ufunc(PyObject *numpy, const char *ufunc_name) +{ + PyObject *ufunc = PyObject_GetAttrString(numpy, ufunc_name); + if (ufunc == NULL) { + return -1; + } + + PyArray_DTypeMeta *dtypes[3] = {&QuadPrecDType, &QuadPrecDType, &QuadPrecDType}; + + PyType_Slot slots[] = { + {NPY_METH_resolve_descriptors, (void *)&quad_binary_op_resolve_descriptors}, + {NPY_METH_strided_loop, + (void *)&quad_generic_binop_strided_loop_aligned}, + {NPY_METH_unaligned_strided_loop, 
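+         /* memcpy-based fallback; NumPy falls back to this loop when it
+            cannot guarantee element-aligned operand buffers */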
+                 (void *)&quad_generic_binop_strided_loop_unaligned},
+            {0, NULL}};
+
+    PyArrayMethod_Spec Spec = {
+            .name = "quad_binop",
+            .nin = 2,
+            .nout = 1,
+            .casting = NPY_NO_CASTING,
+            .flags = (NPY_ARRAYMETHOD_FLAGS)(NPY_METH_SUPPORTS_UNALIGNED | NPY_METH_IS_REORDERABLE),
+            .dtypes = dtypes,
+            .slots = slots,
+    };
+
+    if (PyUFunc_AddLoopFromSpec(ufunc, &Spec) < 0) {
+        return -1;
+    }
+
+    PyObject *promoter_capsule =
+            PyCapsule_New((void *)&quad_ufunc_promoter, "numpy._ufunc_promoter", NULL);
+    if (promoter_capsule == NULL) {
+        return -1;
+    }
+
+    PyObject *DTypes = PyTuple_Pack(3, &PyArrayDescr_Type, &PyArrayDescr_Type, &PyArrayDescr_Type);
+    if (DTypes == 0) {
+        Py_DECREF(promoter_capsule);
+        return -1;
+    }
+
+    if (PyUFunc_AddPromoter(ufunc, DTypes, promoter_capsule) < 0) {
+        Py_DECREF(promoter_capsule);
+        Py_DECREF(DTypes);
+        return -1;
+    }
+    Py_DECREF(promoter_capsule);
+    Py_DECREF(DTypes);
+    return 0;
+}
+
+int
+init_quad_binary_ops(PyObject *numpy)
+{
+    if (create_quad_binary_ufunc(numpy, "add") < 0) {
+        return -1;
+    }
+    if (create_quad_binary_ufunc(numpy, "subtract") < 0) {
+        return -1;
+    }
+    if (create_quad_binary_ufunc(numpy, "multiply") < 0) {
+        return -1;
+    }
+    if (create_quad_binary_ufunc(numpy, "divide") < 0) {
+        return -1;
+    }
+    if (create_quad_binary_ufunc(numpy, "power") < 0) {
+        return -1;
+    }
+    if (create_quad_binary_ufunc(numpy, "mod") < 0) {
+        return -1;
+    }
+    if (create_quad_binary_ufunc(numpy, "minimum") < 0) {
+        return -1;
+    }
+    if (create_quad_binary_ufunc(numpy, "maximum") < 0) {
+        return -1;
+    }
+    if (create_quad_binary_ufunc(numpy, "arctan2") < 0) {
+        return -1;
+    }
+    return 0;
+}
+
+// comparison functions
+
+static NPY_CASTING
+quad_comparison_op_resolve_descriptors(PyObject *self, PyArray_DTypeMeta *const dtypes[],
+                                       PyArray_Descr *const given_descrs[],
+                                       PyArray_Descr *loop_descrs[],
+                                       npy_intp *NPY_UNUSED(view_offset))
+{
+    QuadPrecDTypeObject *descr_in1 = (QuadPrecDTypeObject *)given_descrs[0];
+    QuadPrecDTypeObject *descr_in2 = (QuadPrecDTypeObject *)given_descrs[1];
+    QuadBackendType target_backend;
+
+    // If the input backends differ, promote both operands to the long double
+    // backend; the output descriptor is boolean either way
+    NPY_CASTING casting = NPY_NO_CASTING;
+    if (descr_in1->backend != descr_in2->backend) {
+        target_backend = BACKEND_LONGDOUBLE;
+        casting = NPY_SAFE_CASTING;
+    }
+    else {
+        target_backend = descr_in1->backend;
+    }
+
+    // Set up input descriptors, casting if necessary
+    for (int i = 0; i < 2; i++) {
+        if (((QuadPrecDTypeObject *)given_descrs[i])->backend != target_backend) {
+            loop_descrs[i] = (PyArray_Descr *)new_quaddtype_instance(target_backend);
+            if (!loop_descrs[i]) {
+                return (NPY_CASTING)-1;
+            }
+        }
+        else {
+            Py_INCREF(given_descrs[i]);
+            loop_descrs[i] = given_descrs[i];
+        }
+    }
+
+    // Set up output descriptor
+    loop_descrs[2] = PyArray_DescrFromType(NPY_BOOL);
+    if (!loop_descrs[2]) {
+        return (NPY_CASTING)-1;
+    }
+    return casting;
+}
+
+template
+int
+quad_generic_comp_strided_loop(PyArrayMethod_Context *context, char *const data[],
+                               npy_intp const dimensions[], npy_intp const strides[],
+                               NpyAuxData *auxdata)
+{
+    npy_intp N = dimensions[0];
+    char *in1_ptr = data[0], *in2_ptr = data[1];
+    char *out_ptr = data[2];
+    npy_intp in1_stride = strides[0];
+    npy_intp in2_stride = strides[1];
+    npy_intp out_stride = strides[2];
+
+    QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0];
+    QuadBackendType backend = descr->backend;
+    size_t elem_size = (backend == BACKEND_SLEEF) ?
sizeof(Sleef_quad) : sizeof(long double); + + quad_value in1, in2; + while (N--) { + memcpy(&in1, in1_ptr, elem_size); + memcpy(&in2, in2_ptr, elem_size); + npy_bool result; + + if (backend == BACKEND_SLEEF) { + result = sleef_comp(&in1.sleef_value, &in2.sleef_value); + } + else { + result = ld_comp(&in1.longdouble_value, &in2.longdouble_value); + } + + memcpy(out_ptr, &result, sizeof(npy_bool)); + + in1_ptr += in1_stride; + in2_ptr += in2_stride; + out_ptr += out_stride; + } + return 0; +} + +template +int +quad_generic_comp_strided_loop_aligned(PyArrayMethod_Context *context, char *const data[], + npy_intp const dimensions[], npy_intp const strides[], + NpyAuxData *auxdata) +{ + npy_intp N = dimensions[0]; + char *in1_ptr = data[0], *in2_ptr = data[1]; + char *out_ptr = data[2]; + npy_intp in1_stride = strides[0]; + npy_intp in2_stride = strides[1]; + npy_intp out_stride = strides[2]; + + QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0]; + QuadBackendType backend = descr->backend; + while (N--) { + quad_value in1 = *(quad_value *)in1_ptr; + quad_value in2 = *(quad_value *)in2_ptr; + + npy_bool result; + + if (backend == BACKEND_SLEEF) { + result = sleef_comp(&in1.sleef_value, &in2.sleef_value); + } + else { + result = ld_comp(&in1.longdouble_value, &in2.longdouble_value); + } + + *(npy_bool *)out_ptr = result; + + in1_ptr += in1_stride; + in2_ptr += in2_stride; + out_ptr += out_stride; + } + return 0; +} + +NPY_NO_EXPORT int +comparison_ufunc_promoter(PyUFuncObject *ufunc, PyArray_DTypeMeta *op_dtypes[], + PyArray_DTypeMeta *signature[], PyArray_DTypeMeta *new_op_dtypes[]) +{ + PyArray_DTypeMeta *new_signature[NPY_MAXARGS]; + memcpy(new_signature, signature, 3 * sizeof(PyArray_DTypeMeta *)); + new_signature[2] = NULL; + int res = quad_ufunc_promoter(ufunc, op_dtypes, new_signature, new_op_dtypes); + if (res < 0) { + return -1; + } + Py_XSETREF(new_op_dtypes[2], &PyArray_BoolDType); + return 0; +} + +template +int +create_quad_comparison_ufunc(PyObject *numpy, const char *ufunc_name) +{ + PyObject *ufunc = PyObject_GetAttrString(numpy, ufunc_name); + if (ufunc == NULL) { + return -1; + } + + PyArray_DTypeMeta *dtypes[3] = {&QuadPrecDType, &QuadPrecDType, &PyArray_BoolDType}; + + PyType_Slot slots[] = { + {NPY_METH_resolve_descriptors, (void *)&quad_comparison_op_resolve_descriptors}, + {NPY_METH_strided_loop, + (void *)&quad_generic_comp_strided_loop_aligned}, + {NPY_METH_unaligned_strided_loop, + (void *)&quad_generic_comp_strided_loop}, + {0, NULL}}; + + PyArrayMethod_Spec Spec = { + .name = "quad_comp", + .nin = 2, + .nout = 1, + .casting = NPY_SAFE_CASTING, + .flags = NPY_METH_SUPPORTS_UNALIGNED, + .dtypes = dtypes, + .slots = slots, + }; + + if (PyUFunc_AddLoopFromSpec(ufunc, &Spec) < 0) { + return -1; + } + + PyObject *promoter_capsule = + PyCapsule_New((void *)&comparison_ufunc_promoter, "numpy._ufunc_promoter", NULL); + if (promoter_capsule == NULL) { + return -1; + } + + PyObject *DTypes = PyTuple_Pack(3, &PyArrayDescr_Type, &PyArrayDescr_Type, &PyArray_BoolDType); + if (DTypes == 0) { + Py_DECREF(promoter_capsule); + return -1; + } + + if (PyUFunc_AddPromoter(ufunc, DTypes, promoter_capsule) < 0) { + Py_DECREF(promoter_capsule); + Py_DECREF(DTypes); + return -1; + } + Py_DECREF(promoter_capsule); + Py_DECREF(DTypes); + + return 0; +} + +int +init_quad_comps(PyObject *numpy) +{ + if (create_quad_comparison_ufunc(numpy, "equal") < 0) { + return -1; + } + if (create_quad_comparison_ufunc(numpy, "not_equal") < 0) { + return -1; + } + if 
(create_quad_comparison_ufunc(numpy, "less") < 0) { + return -1; + } + if (create_quad_comparison_ufunc(numpy, "less_equal") < 0) { + return -1; + } + if (create_quad_comparison_ufunc(numpy, "greater") < 0) { + return -1; + } + if (create_quad_comparison_ufunc(numpy, "greater_equal") < + 0) { + return -1; + } + + return 0; +} + +int +init_quad_umath(void) +{ + PyObject *numpy = PyImport_ImportModule("numpy"); + if (!numpy) { + PyErr_SetString(PyExc_ImportError, "Failed to import numpy module"); + return -1; + } + + if (init_quad_unary_ops(numpy) < 0) { + PyErr_SetString(PyExc_RuntimeError, "Failed to initialize quad unary operations"); + goto err; + } + + if (init_quad_binary_ops(numpy) < 0) { + PyErr_SetString(PyExc_RuntimeError, "Failed to initialize quad binary operations"); + goto err; + } + + if (init_quad_comps(numpy) < 0) { + PyErr_SetString(PyExc_RuntimeError, "Failed to initialize quad comparison operations"); + goto err; + } + + Py_DECREF(numpy); + return 0; + +err: + Py_DECREF(numpy); + return -1; +} \ No newline at end of file diff --git a/quaddtype/quaddtype/src/umath.h b/quaddtype/numpy_quaddtype/src/umath.h similarity index 100% rename from quaddtype/quaddtype/src/umath.h rename to quaddtype/numpy_quaddtype/src/umath.h diff --git a/quaddtype/pyproject.toml b/quaddtype/pyproject.toml index 68b61e7f..836b73f2 100644 --- a/quaddtype/pyproject.toml +++ b/quaddtype/pyproject.toml @@ -9,7 +9,7 @@ requires = [ build-backend = "mesonpy" [project] -name = "quaddtype" +name = "numpy_quaddtype" description = "Quad (128-bit) float dtype for numpy" version = "0.0.1" readme = 'README.md' @@ -22,4 +22,4 @@ dependencies = [ [project.optional-dependencies] test = [ "pytest", -] +] \ No newline at end of file diff --git a/quaddtype/quaddtype/__init__.py b/quaddtype/quaddtype/__init__.py deleted file mode 100644 index 9f246f73..00000000 --- a/quaddtype/quaddtype/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from ._quaddtype_main import QuadPrecDType, QuadPrecision \ No newline at end of file diff --git a/quaddtype/quaddtype/src/casts.cpp b/quaddtype/quaddtype/src/casts.cpp deleted file mode 100644 index c096fe5c..00000000 --- a/quaddtype/quaddtype/src/casts.cpp +++ /dev/null @@ -1,468 +0,0 @@ -#define PY_ARRAY_UNIQUE_SYMBOL QuadPrecType_ARRAY_API -#define PY_UFUNC_UNIQUE_SYMBOL QuadPrecType_UFUNC_API -#define NPY_NO_DEPRECATED_API NPY_2_0_API_VERSION -#define NPY_TARGET_VERSION NPY_2_0_API_VERSION -#define NO_IMPORT_ARRAY -#define NO_IMPORT_UFUNC - -extern "C" { -#include - -#include "numpy/arrayobject.h" -#include "numpy/ndarraytypes.h" -#include "numpy/dtype_api.h" -} -#include "sleef.h" -#include "sleefquad.h" - -#include "scalar.h" -#include "casts.h" -#include "dtype.h" - -#define NUM_CASTS 29 // 14 to_casts + 14 from_casts + 1 quad_to_quad - -static NPY_CASTING -quad_to_quad_resolve_descriptors(PyObject *NPY_UNUSED(self), - PyArray_DTypeMeta *NPY_UNUSED(dtypes[2]), - QuadPrecDTypeObject *given_descrs[2], - QuadPrecDTypeObject *loop_descrs[2], npy_intp *view_offset) -{ - Py_INCREF(given_descrs[0]); - loop_descrs[0] = given_descrs[0]; - - if (given_descrs[1] == NULL) { - Py_INCREF(given_descrs[0]); - loop_descrs[1] = given_descrs[0]; - } - else { - Py_INCREF(given_descrs[1]); - loop_descrs[1] = given_descrs[1]; - } - - *view_offset = 0; - return NPY_NO_CASTING; -} - -static int -quad_to_quad_strided_loop(PyArrayMethod_Context *context, char *const data[], - npy_intp const dimensions[], npy_intp const strides[], - void *NPY_UNUSED(auxdata)) -{ - npy_intp N = dimensions[0]; - char *in_ptr = 
data[0]; - char *out_ptr = data[1]; - - npy_intp in_stride = strides[0]; - npy_intp out_stride = strides[1]; - - while (N--) { - memcpy(out_ptr, in_ptr, sizeof(Sleef_quad)); - in_ptr += in_stride; - out_ptr += out_stride; - } - return 0; -} - -// Casting from other types to QuadDType - -template -static inline Sleef_quad -to_quad(T x); - -template <> -inline Sleef_quad -to_quad(npy_bool x) -{ - return x ? Sleef_cast_from_doubleq1(1.0) : Sleef_cast_from_doubleq1(0.0); -} -template <> -inline Sleef_quad -to_quad(npy_byte x) -{ - return Sleef_cast_from_int64q1(x); -} -// template <> -// inline Sleef_quad -// to_quad(npy_ubyte x) -// { -// return Sleef_cast_from_uint64q1(x); -// } -template <> -inline Sleef_quad -to_quad(npy_short x) -{ - return Sleef_cast_from_int64q1(x); -} -template <> -inline Sleef_quad -to_quad(npy_ushort x) -{ - return Sleef_cast_from_uint64q1(x); -} -template <> -inline Sleef_quad -to_quad(npy_int x) -{ - return Sleef_cast_from_int64q1(x); -} -template <> -inline Sleef_quad -to_quad(npy_uint x) -{ - return Sleef_cast_from_uint64q1(x); -} -template <> -inline Sleef_quad -to_quad(npy_long x) -{ - return Sleef_cast_from_int64q1(x); -} -template <> -inline Sleef_quad -to_quad(npy_ulong x) -{ - return Sleef_cast_from_uint64q1(x); -} -template <> -inline Sleef_quad -to_quad(npy_longlong x) -{ - return Sleef_cast_from_int64q1(x); -} -template <> -inline Sleef_quad -to_quad(npy_ulonglong x) -{ - return Sleef_cast_from_uint64q1(x); -} -template <> -inline Sleef_quad -to_quad(float x) -{ - return Sleef_cast_from_doubleq1(x); -} -template <> -inline Sleef_quad -to_quad(double x) -{ - return Sleef_cast_from_doubleq1(x); -} -template <> -inline Sleef_quad -to_quad(long double x) -{ - return Sleef_cast_from_doubleq1(x); -} - -template -static NPY_CASTING -numpy_to_quad_resolve_descriptors(PyObject *NPY_UNUSED(self), PyArray_DTypeMeta *dtypes[2], - PyArray_Descr *given_descrs[2], PyArray_Descr *loop_descrs[2], - npy_intp *view_offset) -{ - if (given_descrs[1] == NULL) { - loop_descrs[1] = (PyArray_Descr *)new_quaddtype_instance(); - if (loop_descrs[1] == nullptr) { - return (NPY_CASTING)-1; - } - } - else { - Py_INCREF(given_descrs[1]); - loop_descrs[1] = given_descrs[1]; - } - - loop_descrs[0] = PyArray_GetDefaultDescr(dtypes[0]); - // *view_offset = 0; - return NPY_SAFE_CASTING; -} - -template -static int -numpy_to_quad_strided_loop(PyArrayMethod_Context *context, char *const data[], - npy_intp const dimensions[], npy_intp const strides[], - void *NPY_UNUSED(auxdata)) -{ - npy_intp N = dimensions[0]; - char *in_ptr = data[0]; - char *out_ptr = data[1]; - - while (N--) { - T in_val; - Sleef_quad out_val; - - memcpy(&in_val, in_ptr, sizeof(T)); - out_val = to_quad(in_val); - memcpy(out_ptr, &out_val, sizeof(Sleef_quad)); - - in_ptr += strides[0]; - out_ptr += strides[1]; - } - return 0; -} - -// Casting from QuadDType to other types - -template -static inline T -from_quad(Sleef_quad x); - -template <> -inline npy_bool -from_quad(Sleef_quad x) -{ - return Sleef_cast_to_int64q1(x) != 0; -} -template <> -inline npy_byte -from_quad(Sleef_quad x) -{ - return (npy_byte)Sleef_cast_to_int64q1(x); -} -// template <> -// inline npy_ubyte -// from_quad(Sleef_quad x) -// { -// return (npy_ubyte)Sleef_cast_to_uint64q1(x); -// } -template <> -inline npy_short -from_quad(Sleef_quad x) -{ - return (npy_short)Sleef_cast_to_int64q1(x); -} -template <> -inline npy_ushort -from_quad(Sleef_quad x) -{ - return (npy_ushort)Sleef_cast_to_uint64q1(x); -} -template <> -inline npy_int -from_quad(Sleef_quad x) 
-{ - return (npy_int)Sleef_cast_to_int64q1(x); -} -template <> -inline npy_uint -from_quad(Sleef_quad x) -{ - return (npy_uint)Sleef_cast_to_uint64q1(x); -} -template <> -inline npy_long -from_quad(Sleef_quad x) -{ - return (npy_long)Sleef_cast_to_int64q1(x); -} -template <> -inline npy_ulong -from_quad(Sleef_quad x) -{ - return (npy_ulong)Sleef_cast_to_uint64q1(x); -} -template <> -inline npy_longlong -from_quad(Sleef_quad x) -{ - return Sleef_cast_to_int64q1(x); -} -template <> -inline npy_ulonglong -from_quad(Sleef_quad x) -{ - return Sleef_cast_to_uint64q1(x); -} -template <> -inline float -from_quad(Sleef_quad x) -{ - return Sleef_cast_to_doubleq1(x); -} -template <> -inline double -from_quad(Sleef_quad x) -{ - return Sleef_cast_to_doubleq1(x); -} -template <> -inline long double -from_quad(Sleef_quad x) -{ - return Sleef_cast_to_doubleq1(x); -} - -template -static NPY_CASTING -quad_to_numpy_resolve_descriptors(PyObject *NPY_UNUSED(self), PyArray_DTypeMeta *dtypes[2], - PyArray_Descr *given_descrs[2], PyArray_Descr *loop_descrs[2], - npy_intp *view_offset) -{ - Py_INCREF(given_descrs[0]); - loop_descrs[0] = given_descrs[0]; - - loop_descrs[1] = PyArray_GetDefaultDescr(dtypes[1]); - // *view_offset = 0; - return NPY_UNSAFE_CASTING; -} - -template -static int -quad_to_numpy_strided_loop(PyArrayMethod_Context *context, char *const data[], - npy_intp const dimensions[], npy_intp const strides[], - void *NPY_UNUSED(auxdata)) -{ - npy_intp N = dimensions[0]; - char *in_ptr = data[0]; - char *out_ptr = data[1]; - - while (N--) { - Sleef_quad in_val = *(Sleef_quad *)in_ptr; - T *out_val = (T *)out_ptr; - *out_val = from_quad(in_val); - - in_ptr += strides[0]; - out_ptr += strides[1]; - } - return 0; -} - -static PyArrayMethod_Spec *specs[NUM_CASTS + 1]; // +1 for NULL terminator -static size_t spec_count = 0; - -void -add_spec(PyArrayMethod_Spec *spec) -{ - if (spec_count < NUM_CASTS) { - specs[spec_count++] = spec; - } -} - -// functions to add casts -template -void -add_cast_from(PyArray_DTypeMeta *to) -{ - PyArray_DTypeMeta **dtypes = new PyArray_DTypeMeta *[2]{nullptr, to}; - - PyType_Slot *slots = new PyType_Slot[3]{ - {NPY_METH_resolve_descriptors, (void *)&quad_to_numpy_resolve_descriptors}, - {NPY_METH_strided_loop, (void *)&quad_to_numpy_strided_loop}, - {0, nullptr}}; - - PyArrayMethod_Spec *spec = new PyArrayMethod_Spec{ - .name = "cast_QuadPrec_to_NumPy", - .nin = 1, - .nout = 1, - .casting = NPY_UNSAFE_CASTING, - .flags = (NPY_ARRAYMETHOD_FLAGS)0, - .dtypes = dtypes, - .slots = slots, - }; - add_spec(spec); -} - -template -void -add_cast_to(PyArray_DTypeMeta *from) -{ - PyArray_DTypeMeta **dtypes = new PyArray_DTypeMeta *[2]{from, nullptr}; - - PyType_Slot *slots = new PyType_Slot[3]{ - {NPY_METH_resolve_descriptors, (void *)&numpy_to_quad_resolve_descriptors}, - {NPY_METH_strided_loop, (void *)&numpy_to_quad_strided_loop}, - {0, nullptr}}; - - PyArrayMethod_Spec *spec = new PyArrayMethod_Spec{ - .name = "cast_NumPy_to_QuadPrec", - .nin = 1, - .nout = 1, - .casting = NPY_SAFE_CASTING, - .flags = (NPY_ARRAYMETHOD_FLAGS)0, - .dtypes = dtypes, - .slots = slots, - }; - - add_spec(spec); -} - -PyArrayMethod_Spec ** -init_casts_internal(void) -{ - PyArray_DTypeMeta **quad2quad_dtypes = new PyArray_DTypeMeta *[2]{nullptr, nullptr}; - PyType_Slot *quad2quad_slots = new PyType_Slot[4]{ - {NPY_METH_resolve_descriptors, (void *)&quad_to_quad_resolve_descriptors}, - {NPY_METH_strided_loop, (void *)&quad_to_quad_strided_loop}, - {NPY_METH_unaligned_strided_loop, (void 
*)&quad_to_quad_strided_loop}, - {0, nullptr}}; - - PyArrayMethod_Spec *quad2quad_spec = new PyArrayMethod_Spec{ - .name = "cast_QuadPrec_to_QuadPrec", - .nin = 1, - .nout = 1, - .casting = NPY_NO_CASTING, - .flags = NPY_METH_SUPPORTS_UNALIGNED, - .dtypes = quad2quad_dtypes, - .slots = quad2quad_slots, - }; - - add_spec(quad2quad_spec); - - add_cast_to(&PyArray_BoolDType); - add_cast_to(&PyArray_ByteDType); - add_cast_to(&PyArray_UByteDType); - add_cast_to(&PyArray_ShortDType); - add_cast_to(&PyArray_UShortDType); - add_cast_to(&PyArray_IntDType); - add_cast_to(&PyArray_UIntDType); - add_cast_to(&PyArray_LongDType); - add_cast_to(&PyArray_ULongDType); - add_cast_to(&PyArray_LongLongDType); - add_cast_to(&PyArray_ULongLongDType); - add_cast_to(&PyArray_FloatDType); - add_cast_to(&PyArray_DoubleDType); - add_cast_to(&PyArray_LongDoubleDType); - - add_cast_from(&PyArray_BoolDType); - add_cast_from(&PyArray_ByteDType); - add_cast_from(&PyArray_UByteDType); - add_cast_from(&PyArray_ShortDType); - add_cast_from(&PyArray_UShortDType); - add_cast_from(&PyArray_IntDType); - add_cast_from(&PyArray_UIntDType); - add_cast_from(&PyArray_LongDType); - add_cast_from(&PyArray_ULongDType); - add_cast_from(&PyArray_LongLongDType); - add_cast_from(&PyArray_ULongLongDType); - add_cast_from(&PyArray_FloatDType); - add_cast_from(&PyArray_DoubleDType); - add_cast_from(&PyArray_LongDoubleDType); - - specs[spec_count] = nullptr; - return specs; -} - -PyArrayMethod_Spec ** -init_casts(void) -{ - try { - return init_casts_internal(); - } - catch (int e) { - PyErr_NoMemory(); - return nullptr; - } -} - -void -free_casts(void) -{ - for (auto cast : specs) { - if (cast == nullptr) { - continue; - } - delete cast->dtypes; - delete cast->slots; - delete cast; - } - spec_count = 0; -} diff --git a/quaddtype/quaddtype/src/dtype.c b/quaddtype/quaddtype/src/dtype.c deleted file mode 100644 index a6f485bf..00000000 --- a/quaddtype/quaddtype/src/dtype.c +++ /dev/null @@ -1,216 +0,0 @@ -#include -#include -#include - -#define PY_ARRAY_UNIQUE_SYMBOL QuadPrecType_ARRAY_API -#define PY_UFUNC_UNIQUE_SYMBOL QuadPrecType_UFUNC_API -#define NPY_NO_DEPRECATED_API NPY_2_0_API_VERSION -#define NPY_TARGET_VERSION NPY_2_0_API_VERSION -#define NO_IMPORT_ARRAY -#define NO_IMPORT_UFUNC -#include "numpy/arrayobject.h" -#include "numpy/ndarraytypes.h" -#include "numpy/dtype_api.h" - -#include "scalar.h" -#include "casts.h" -#include "dtype.h" - -static inline int quad_load(Sleef_quad *x, char *data_ptr) -{ - if (data_ptr == NULL || x == NULL) - { - return -1; - } - *x = *(Sleef_quad *)data_ptr; - return 0; -} - -static inline int quad_store(char *data_ptr, Sleef_quad x) -{ - if (data_ptr == NULL) - { - return -1; - } - *(Sleef_quad *)data_ptr = x; - return 0; -} - -QuadPrecDTypeObject * new_quaddtype_instance(void) -{ - QuadPrecDTypeObject *new = (QuadPrecDTypeObject *)PyArrayDescr_Type.tp_new((PyTypeObject *)&QuadPrecDType, NULL, NULL); - if (new == NULL) { - return NULL; - } - new->base.elsize = sizeof(Sleef_quad); - new->base.alignment = _Alignof(Sleef_quad); - - return new; -} - -static QuadPrecDTypeObject * ensure_canonical(QuadPrecDTypeObject *self) -{ - Py_INCREF(self); - return self; -} - -static QuadPrecDTypeObject * common_instance(QuadPrecDTypeObject *dtype1, QuadPrecDTypeObject *dtype2) -{ - Py_INCREF(dtype1); - return dtype1; -} - - -static PyArray_DTypeMeta * common_dtype(PyArray_DTypeMeta *cls, PyArray_DTypeMeta *other) -{ - // Promote integer and floating-point types to QuadPrecDType - if (other->type_num >= 0 && - 
(PyTypeNum_ISINTEGER(other->type_num) || - PyTypeNum_ISFLOAT(other->type_num))) { - Py_INCREF(cls); - return cls; - } - // Don't promote complex types - if (PyTypeNum_ISCOMPLEX(other->type_num)) { - Py_INCREF(Py_NotImplemented); - return (PyArray_DTypeMeta *)Py_NotImplemented; - } - - Py_INCREF(Py_NotImplemented); - return (PyArray_DTypeMeta *)Py_NotImplemented; -} - -static PyArray_Descr * -quadprec_discover_descriptor_from_pyobject(PyArray_DTypeMeta *NPY_UNUSED(cls), PyObject *obj) -{ - if (Py_TYPE(obj) != &QuadPrecision_Type) - { - PyErr_SetString(PyExc_TypeError, "Can only store QuadPrecision in a QuadPrecDType array."); - return NULL; - } - return (PyArray_Descr *)new_quaddtype_instance(); -} - -static int quadprec_setitem(QuadPrecDTypeObject *descr, PyObject *obj, char *dataptr) -{ - QuadPrecisionObject *value; - if (PyObject_TypeCheck(obj, &QuadPrecision_Type)) - { - Py_INCREF(obj); - value = (QuadPrecisionObject *)obj; - } - else - { - value = QuadPrecision_from_object(obj); - if (value == NULL) { - return -1; - } - } - - if (quad_store(dataptr, value->quad.value) < 0) - { - Py_DECREF(value); - char error_msg[100]; - snprintf(error_msg, sizeof(error_msg), "Invalid memory location %p", (void*)dataptr); - PyErr_SetString(PyExc_ValueError, error_msg); - return -1; - } - - Py_DECREF(value); - return 0; -} - -static PyObject * quadprec_getitem(QuadPrecDTypeObject *descr, char *dataptr) -{ - QuadPrecisionObject *new = QuadPrecision_raw_new(); - if (!new) - { - return NULL; - } - if (quad_load(&new->quad.value, dataptr) < 0) - { - Py_DECREF(new); - char error_msg[100]; - snprintf(error_msg, sizeof(error_msg), "Invalid memory location %p", (void*)dataptr); - PyErr_SetString(PyExc_ValueError, error_msg); - return NULL; - } - return (PyObject *)new; -} - -static PyArray_Descr *quadprec_default_descr(PyArray_DTypeMeta *NPY_UNUSED(cls)) -{ - return (PyArray_Descr *)new_quaddtype_instance(); -} - -static PyType_Slot QuadPrecDType_Slots[] = -{ - {NPY_DT_ensure_canonical, &ensure_canonical}, - {NPY_DT_common_instance, &common_instance}, - {NPY_DT_common_dtype, &common_dtype}, - {NPY_DT_discover_descr_from_pyobject, &quadprec_discover_descriptor_from_pyobject}, - {NPY_DT_setitem, &quadprec_setitem}, - {NPY_DT_getitem, &quadprec_getitem}, - {NPY_DT_default_descr, &quadprec_default_descr}, - {0, NULL} -}; - - -static PyObject * QuadPrecDType_new(PyTypeObject *NPY_UNUSED(cls), PyObject *args, PyObject *kwds) -{ - if (PyTuple_GET_SIZE(args) != 0 || (kwds != NULL && PyDict_Size(kwds) != 0)) { - PyErr_SetString(PyExc_TypeError, - "QuadPrecDType takes no arguments"); - return NULL; - } - - return (PyObject *)new_quaddtype_instance(); -} - -static PyObject * QuadPrecDType_repr(QuadPrecDTypeObject *self) -{ - return PyUnicode_FromString("QuadPrecDType()"); -} - -PyArray_DTypeMeta QuadPrecDType = { - {{ - PyVarObject_HEAD_INIT(NULL, 0) - .tp_name = "QuadPrecDType.QuadPrecDType", - .tp_basicsize = sizeof(QuadPrecDTypeObject), - .tp_new = QuadPrecDType_new, - .tp_repr = (reprfunc)QuadPrecDType_repr, - .tp_str = (reprfunc)QuadPrecDType_repr, - }}, -}; - -int init_quadprec_dtype(void) -{ - PyArrayMethod_Spec **casts = init_casts(); - if (!casts) - return -1; - - PyArrayDTypeMeta_Spec QuadPrecDType_DTypeSpec = { - .flags = NPY_DT_NUMERIC, - .casts = casts, - .typeobj = &QuadPrecision_Type, - .slots = QuadPrecDType_Slots, - }; - - ((PyObject *)&QuadPrecDType)->ob_type = &PyArrayDTypeMeta_Type; - - ((PyTypeObject *)&QuadPrecDType)->tp_base = &PyArrayDescr_Type; - - if (PyType_Ready((PyTypeObject *)&QuadPrecDType) 
< 0) - { - return -1; - } - - if (PyArrayInitDTypeMeta_FromSpec(&QuadPrecDType, &QuadPrecDType_DTypeSpec) < 0) - { - return -1; - } - - free_casts(); - - return 0; -} \ No newline at end of file diff --git a/quaddtype/quaddtype/src/dtype.h b/quaddtype/quaddtype/src/dtype.h deleted file mode 100644 index 162d5714..00000000 --- a/quaddtype/quaddtype/src/dtype.h +++ /dev/null @@ -1,30 +0,0 @@ -#ifndef _QUADDTYPE_DTYPE_H -#define _QUADDTYPE_DTYPE_H -#ifdef __cplusplus -extern "C" { -#endif - -#include -#include -#include -#include - -#include "scalar.h" - -typedef struct -{ - PyArray_Descr base; - -} QuadPrecDTypeObject; - -extern PyArray_DTypeMeta QuadPrecDType; - -QuadPrecDTypeObject * new_quaddtype_instance(void); - -int init_quadprec_dtype(void); - -#ifdef __cplusplus -} -#endif - -#endif \ No newline at end of file diff --git a/quaddtype/quaddtype/src/ops.hpp b/quaddtype/quaddtype/src/ops.hpp deleted file mode 100644 index 6a3511c4..00000000 --- a/quaddtype/quaddtype/src/ops.hpp +++ /dev/null @@ -1,207 +0,0 @@ -#include -#include - -typedef int (*unary_op_def)(Sleef_quad *, Sleef_quad *); - -static inline int -quad_negative(Sleef_quad *op, Sleef_quad *out) -{ - *out = Sleef_negq1(*op); - return 0; -} - -static int -quad_positive(Sleef_quad *op, Sleef_quad *out) -{ - *out = *op; - return 0; -} - -static inline int -quad_absolute(Sleef_quad *op, Sleef_quad *out) -{ - *out = Sleef_fabsq1(*op); - return 0; -} - -static inline int -quad_rint(Sleef_quad *op, Sleef_quad *out) -{ - *out = Sleef_rintq1(*op); - return 0; -} - -static inline int -quad_trunc(Sleef_quad *op, Sleef_quad *out) -{ - *out = Sleef_truncq1(*op); - return 0; -} - -static inline int -quad_floor(Sleef_quad *op, Sleef_quad *out) -{ - *out = Sleef_floorq1(*op); - return 0; -} - -static inline int -quad_ceil(Sleef_quad *op, Sleef_quad *out) -{ - *out = Sleef_ceilq1(*op); - return 0; -} - -static inline int -quad_sqrt(Sleef_quad *op, Sleef_quad *out) -{ - *out = Sleef_sqrtq1_u05(*op); - return 0; -} - -static inline int -quad_square(Sleef_quad *op, Sleef_quad *out) -{ - *out = Sleef_mulq1_u05(*op, *op); - return 0; -} - -static inline int -quad_log(Sleef_quad *op, Sleef_quad *out) -{ - *out = Sleef_logq1_u10(*op); - return 0; -} - -static inline int -quad_log2(Sleef_quad *op, Sleef_quad *out) -{ - *out = Sleef_log2q1_u10(*op); - return 0; -} - -static inline int -quad_log10(Sleef_quad *op, Sleef_quad *out) -{ - *out = Sleef_log10q1_u10(*op); - return 0; -} - -static inline int -quad_log1p(Sleef_quad *op, Sleef_quad *out) -{ - *out = Sleef_log1pq1_u10(*op); - return 0; -} - -static inline int -quad_exp(Sleef_quad *op, Sleef_quad *out) -{ - *out = Sleef_expq1_u10(*op); - return 0; -} - -static inline int -quad_exp2(Sleef_quad *op, Sleef_quad *out) -{ - *out = Sleef_exp2q1_u10(*op); - return 0; -} - -// binary ops -typedef int (*binop_def)(Sleef_quad *, Sleef_quad *, Sleef_quad *); - -static inline int -quad_add(Sleef_quad *out, Sleef_quad *in1, Sleef_quad *in2) -{ - *out = Sleef_addq1_u05(*in1, *in2); - return 0; -} - -static inline int -quad_sub(Sleef_quad *out, Sleef_quad *in1, Sleef_quad *in2) -{ - *out = Sleef_subq1_u05(*in1, *in2); - return 0; -} - -static inline int -quad_mul(Sleef_quad *res, Sleef_quad *a, Sleef_quad *b) -{ - *res = Sleef_mulq1_u05(*a, *b); - return 0; -} - -static inline int -quad_div(Sleef_quad *res, Sleef_quad *a, Sleef_quad *b) -{ - *res = Sleef_divq1_u05(*a, *b); - return 0; -} - -static inline int -quad_pow(Sleef_quad *res, Sleef_quad *a, Sleef_quad *b) -{ - *res = Sleef_powq1_u10(*a, *b); - return 0; 
-} - -static inline int -quad_mod(Sleef_quad *res, Sleef_quad *a, Sleef_quad *b) -{ - *res = Sleef_fmodq1(*a, *b); - return 0; -} - -static inline int -quad_minimum(Sleef_quad *out, Sleef_quad *in1, Sleef_quad *in2) -{ - *out = Sleef_icmpleq1(*in1, *in2) ? *in1 : *in2; - return 0; -} - -static inline int -quad_maximum(Sleef_quad *out, Sleef_quad *in1, Sleef_quad *in2) -{ - *out = Sleef_icmpgeq1(*in1, *in2) ? *in1 : *in2; - return 0; -} - -// comparison functions -typedef npy_bool (*cmp_def)(const Sleef_quad *, const Sleef_quad *); - -static inline npy_bool -quad_equal(const Sleef_quad *a, const Sleef_quad *b) -{ - return Sleef_icmpeqq1(*a, *b); -} - -static inline npy_bool -quad_notequal(const Sleef_quad *a, const Sleef_quad *b) -{ - return Sleef_icmpneq1(*a, *b); -} - -static inline npy_bool -quad_less(const Sleef_quad *a, const Sleef_quad *b) -{ - return Sleef_icmpltq1(*a, *b); -} - -static inline npy_bool -quad_lessequal(const Sleef_quad *a, const Sleef_quad *b) -{ - return Sleef_icmpleq1(*a, *b); -} - -static inline npy_bool -quad_greater(const Sleef_quad *a, const Sleef_quad *b) -{ - return Sleef_icmpgtq1(*a, *b); -} - -static inline npy_bool -quad_greaterequal(const Sleef_quad *a, const Sleef_quad *b) -{ - return Sleef_icmpgeq1(*a, *b); -} \ No newline at end of file diff --git a/quaddtype/quaddtype/src/quaddtype_main.c b/quaddtype/quaddtype/src/quaddtype_main.c deleted file mode 100644 index 098c0749..00000000 --- a/quaddtype/quaddtype/src/quaddtype_main.c +++ /dev/null @@ -1,55 +0,0 @@ -#include - -#define PY_ARRAY_UNIQUE_SYMBOL QuadPrecType_ARRAY_API -#define PY_UFUNC_UNIQUE_SYMBOL QuadPrecType_UFUNC_API -#define NPY_NO_DEPRECATED_API NPY_2_0_API_VERSION -#define NPY_TARGET_VERSION NPY_2_0_API_VERSION - -#include "numpy/arrayobject.h" -#include "numpy/dtype_api.h" -#include "numpy/ufuncobject.h" - -#include "dtype.h" -#include "umath.h" - -static struct PyModuleDef moduledef = { - PyModuleDef_HEAD_INIT, - .m_name = "_quaddtype_main", - .m_doc = "Quad (128-bit) floating point Data Type for Numpy", - .m_size = -1, -}; - -PyMODINIT_FUNC -PyInit__quaddtype_main(void) -{ - import_array(); - import_umath(); - PyObject *m = PyModule_Create(&moduledef); - if (!m) - { - return NULL; - } - - if (init_quadprecision_scalar() < 0) - goto error; - - if(PyModule_AddObject(m, "QuadPrecision", (PyObject *)&QuadPrecision_Type) < 0) - goto error; - - if(init_quadprec_dtype() < 0) - goto error; - - if(PyModule_AddObject(m, "QuadPrecDType", (PyObject *)&QuadPrecDType) < 0) - goto error; - - if (init_quad_umath() < 0) { - goto error; - } - - return m; - - -error: - Py_DECREF(m); - return NULL; -} \ No newline at end of file diff --git a/quaddtype/quaddtype/src/scalar.c b/quaddtype/quaddtype/src/scalar.c deleted file mode 100644 index 15f727d8..00000000 --- a/quaddtype/quaddtype/src/scalar.c +++ /dev/null @@ -1,126 +0,0 @@ -#include -#include -#include - -#define PY_ARRAY_UNIQUE_SYMBOL QuadPrecType_ARRAY_API -#define NPY_NO_DEPRECATED_API NPY_2_0_API_VERSION -#define NO_IMPORT_ARRAY - -#include "numpy/arrayobject.h" -#include "numpy/ndarraytypes.h" -#include "numpy/dtype_api.h" - -#include "scalar.h" -#include "scalar_ops.h" - - -// static PyTypeObject QuadPrecision_Type; - - -QuadPrecisionObject * QuadPrecision_raw_new(void) -{ - QuadPrecisionObject * new = PyObject_New(QuadPrecisionObject, &QuadPrecision_Type); - if(!new) - return NULL; - new->quad.value = Sleef_cast_from_doubleq1(0.0); // initialize to 0 - return new; -} - -QuadPrecisionObject * QuadPrecision_from_object(PyObject * value) -{ - 
QuadPrecisionObject *self = QuadPrecision_raw_new(); - if(!self) - return NULL; - - if(PyFloat_Check(value)) - self->quad.value = Sleef_cast_from_doubleq1(PyFloat_AsDouble(value)); - - else if(PyUnicode_CheckExact(value)) - { - const char * s = PyUnicode_AsUTF8(value); - char *endptr = NULL; - self->quad.value = Sleef_strtoq(s, &endptr); - if (*endptr != '\0' || endptr == s) - { - PyErr_SetString(PyExc_ValueError, "Unable to parse string to QuadPrecision"); - Py_DECREF(self); - return NULL; - } - } - else if(PyLong_Check(value)) - { - long int val = PyLong_AsLong(value); - if (val == -1) - { - PyErr_SetString(PyExc_OverflowError, "Overflow Error, value out of range"); - Py_DECREF(self); - return NULL; - } - self->quad.value = Sleef_cast_from_int64q1(val); - } - else - { - PyErr_SetString(PyExc_TypeError, "QuadPrecision value must be a float, int or string"); - Py_DECREF(self); - return NULL; - } - - return self; -} - -static PyObject * -QuadPrecision_new(PyTypeObject *cls, PyObject *args, PyObject *kwargs) -{ - PyObject *value; - - if (!PyArg_ParseTuple(args, "O", &value)) { - return NULL; - } - - return (PyObject *)QuadPrecision_from_object(value); -} - -static PyObject * QuadPrecision_str(QuadPrecisionObject * self) -{ - char buffer[128]; - Sleef_snprintf(buffer, sizeof(buffer), "%.*Qe", SLEEF_QUAD_DIG, self->quad.value); - return PyUnicode_FromString(buffer); -} - -static PyObject * QuadPrecision_repr(QuadPrecisionObject* self) -{ - PyObject *str = QuadPrecision_str(self); - if (str == NULL) { - return NULL; - } - PyObject *res = PyUnicode_FromFormat("QuadPrecision('%S')", str); - Py_DECREF(str); - return res; -} - -static void -quad_dealloc(QuadPrecDTypeObject *self) -{ - Py_TYPE(self)->tp_free((PyObject *)self); -} - -PyTypeObject QuadPrecision_Type = -{ - PyVarObject_HEAD_INIT(NULL, 0) - .tp_name = "QuadPrecType.QuadPrecision", - .tp_basicsize = sizeof(QuadPrecisionObject), - .tp_itemsize = 0, - .tp_new = QuadPrecision_new, - .tp_dealloc = (destructor)quad_dealloc, - .tp_repr = (reprfunc)QuadPrecision_repr, - .tp_str = (reprfunc)QuadPrecision_str, - .tp_as_number = &quad_as_scalar, - .tp_richcompare = (richcmpfunc)quad_richcompare - -}; - -int -init_quadprecision_scalar(void) -{ - return PyType_Ready(&QuadPrecision_Type); -} \ No newline at end of file diff --git a/quaddtype/quaddtype/src/scalar.h b/quaddtype/quaddtype/src/scalar.h deleted file mode 100644 index 962a1fec..00000000 --- a/quaddtype/quaddtype/src/scalar.h +++ /dev/null @@ -1,35 +0,0 @@ -#ifndef _QUADDTYPE_SCALAR_H -#define _QUADDTYPE_SCALAR_H - -#ifdef __cplusplus -extern "C" { -#endif - -#include -#include - -typedef struct { - Sleef_quad value; -} quad_field; - -typedef struct { - PyObject_HEAD - quad_field quad; -} QuadPrecisionObject; - -extern PyTypeObject QuadPrecision_Type; - -QuadPrecisionObject * -QuadPrecision_raw_new(void); - -QuadPrecisionObject * -QuadPrecision_from_object(PyObject *value); - -int -init_quadprecision_scalar(void); - -#ifdef __cplusplus -} -#endif - -#endif \ No newline at end of file diff --git a/quaddtype/quaddtype/src/scalar_ops.cpp b/quaddtype/quaddtype/src/scalar_ops.cpp deleted file mode 100644 index a51bc69a..00000000 --- a/quaddtype/quaddtype/src/scalar_ops.cpp +++ /dev/null @@ -1,171 +0,0 @@ -#define PY_ARRAY_UNIQUE_SYMBOL QuadPrecType_ARRAY_API -#define NPY_NO_DEPRECATED_API NPY_2_0_API_VERSION -#define NPY_TARGET_VERSION NPY_2_0_API_VERSION -#define NO_IMPORT_ARRAY - -extern "C" { -#include - -#include "numpy/arrayobject.h" -#include "numpy/ndarraytypes.h" -#include 
"numpy/ufuncobject.h" - -#include "numpy/dtype_api.h" -} - -#include "scalar.h" -#include "ops.hpp" -#include "scalar_ops.h" - -template -static PyObject * -quad_unary_func(QuadPrecisionObject *self) -{ - QuadPrecisionObject *res = QuadPrecision_raw_new(); - if (!res) { - return NULL; - } - - unary_op(&self->quad.value, &res->quad.value); - return (PyObject *)res; -} - -PyObject * -quad_nonzero(QuadPrecisionObject *self) -{ - return PyBool_FromLong(Sleef_icmpneq1(self->quad.value, Sleef_cast_from_int64q1(0))); -} - -template -static PyObject * -quad_binary_func(PyObject *op1, PyObject *op2) -{ - QuadPrecisionObject *self; - PyObject *other; - QuadPrecisionObject *other_quad = NULL; - int is_forward; - - if (PyObject_TypeCheck(op1, &QuadPrecision_Type)) { - is_forward = 1; - self = (QuadPrecisionObject *)op1; - other = Py_NewRef(op2); - } - else { - is_forward = 0; - self = (QuadPrecisionObject *)op2; - other = Py_NewRef(op1); - } - - if (PyObject_TypeCheck(other, &QuadPrecision_Type)) { - Py_INCREF(other); - other_quad = (QuadPrecisionObject *)other; - } - else if (PyLong_Check(other) || PyFloat_Check(other)) { - other_quad = QuadPrecision_raw_new(); - if (!other_quad) { - Py_DECREF(other); - return NULL; - } - - if (PyLong_Check(other)) { - long long value = PyLong_AsLongLong(other); - if (value == -1 && PyErr_Occurred()) { - Py_DECREF(other); - Py_DECREF(other_quad); - return NULL; - } - other_quad->quad.value = Sleef_cast_from_int64q1(value); - } - else { - double value = PyFloat_AsDouble(other); - if (value == -1.0 && PyErr_Occurred()) { - Py_DECREF(other); - Py_DECREF(other_quad); - return NULL; - } - other_quad->quad.value = Sleef_cast_from_doubleq1(value); - } - } - else { - Py_DECREF(other); - Py_RETURN_NOTIMPLEMENTED; - } - - QuadPrecisionObject *res = QuadPrecision_raw_new(); - if (!res) { - Py_DECREF(other_quad); - Py_DECREF(other); - return NULL; - } - - if (is_forward) { - binary_op(&res->quad.value, &self->quad.value, &other_quad->quad.value); - } - else { - binary_op(&res->quad.value, &other_quad->quad.value, &self->quad.value); - } - - Py_DECREF(other_quad); - Py_DECREF(other); - return (PyObject *)res; -} - -// todo: add support with float and int -PyObject * -quad_richcompare(QuadPrecisionObject *self, PyObject *other, int cmp_op) -{ - QuadPrecisionObject *other_quad = NULL; - - if (PyObject_TypeCheck(other, &QuadPrecision_Type)) { - Py_INCREF(other); - other_quad = (QuadPrecisionObject *)other; - } - else if (PyLong_CheckExact(other) || PyFloat_CheckExact(other)) { - other_quad = QuadPrecision_from_object(other); - if (other_quad == NULL) { - return NULL; - } - } - else { - Py_RETURN_NOTIMPLEMENTED; - } - int cmp; - switch (cmp_op) { - case Py_LT: - cmp = Sleef_icmpltq1(self->quad.value, other_quad->quad.value); - break; - case Py_LE: - cmp = Sleef_icmpleq1(self->quad.value, other_quad->quad.value); - break; - case Py_EQ: - cmp = Sleef_icmpeqq1(self->quad.value, other_quad->quad.value); - break; - case Py_NE: - cmp = Sleef_icmpneq1(self->quad.value, other_quad->quad.value); - break; - case Py_GT: - cmp = Sleef_icmpgtq1(self->quad.value, other_quad->quad.value); - break; - case Py_GE: - cmp = Sleef_icmpgeq1(self->quad.value, other_quad->quad.value); - break; - default: - Py_DECREF(other_quad); - Py_RETURN_NOTIMPLEMENTED; - } - Py_DECREF(other_quad); - - return PyBool_FromLong(cmp); -} - -PyNumberMethods quad_as_scalar = { - .nb_add = (binaryfunc)quad_binary_func, - .nb_subtract = (binaryfunc)quad_binary_func, - .nb_multiply = (binaryfunc)quad_binary_func, - .nb_power = 
(ternaryfunc)quad_binary_func, - .nb_negative = (unaryfunc)quad_unary_func, - .nb_positive = (unaryfunc)quad_unary_func, - .nb_absolute = (unaryfunc)quad_unary_func, - .nb_bool = (inquiry)quad_nonzero, - .nb_true_divide = (binaryfunc)quad_binary_func, -}; \ No newline at end of file diff --git a/quaddtype/quaddtype/src/umath.cpp b/quaddtype/quaddtype/src/umath.cpp deleted file mode 100644 index 9007f890..00000000 --- a/quaddtype/quaddtype/src/umath.cpp +++ /dev/null @@ -1,576 +0,0 @@ -#include "scalar.h" - -#define PY_ARRAY_UNIQUE_SYMBOL QuadPrecType_ARRAY_API -#define PY_UFUNC_UNIQUE_SYMBOL QuadPrecType_UFUNC_API -#define NPY_NO_DEPRECATED_API NPY_2_0_API_VERSION -#define NPY_TARGET_VERSION NPY_2_0_API_VERSION -#define NO_IMPORT_ARRAY -#define NO_IMPORT_UFUNC - -extern "C" { -#include -#include - -#include "numpy/arrayobject.h" -#include "numpy/ndarraytypes.h" -#include "numpy/ufuncobject.h" - -#include "numpy/dtype_api.h" -} -#include "dtype.h" -#include "umath.h" -#include "ops.hpp" - -template -int -quad_generic_unary_op_strided_loop(PyArrayMethod_Context *context, char *const data[], - npy_intp const dimensions[], npy_intp const strides[], - NpyAuxData *auxdata) -{ - npy_intp N = dimensions[0]; - char *in_ptr = data[0]; - char *out_ptr = data[1]; - npy_intp in_stride = strides[0]; - npy_intp out_stride = strides[1]; - - Sleef_quad in, out; - while (N--) { - memcpy(&in, in_ptr, sizeof(Sleef_quad)); - unary_op(&in, &out); - memcpy(out_ptr, &out, sizeof(Sleef_quad)); - - in_ptr += in_stride; - out_ptr += out_stride; - } - return 0; -} - -static NPY_CASTING -quad_unary_op_resolve_descriptors(PyObject *self, PyArray_DTypeMeta *const dtypes[], - PyArray_Descr *const given_descrs[], PyArray_Descr *loop_descrs[], - npy_intp *NPY_UNUSED(view_offset)) -{ - Py_INCREF(given_descrs[0]); - loop_descrs[0] = given_descrs[0]; - - if (given_descrs[1] == NULL) { - Py_INCREF(given_descrs[0]); - loop_descrs[1] = given_descrs[0]; - return NPY_NO_CASTING; - } - Py_INCREF(given_descrs[1]); - loop_descrs[1] = given_descrs[1]; - - return NPY_NO_CASTING; -} - -template -int -create_quad_unary_ufunc(PyObject *numpy, const char *ufunc_name) -{ - PyObject *ufunc = PyObject_GetAttrString(numpy, ufunc_name); - if (ufunc == NULL) { - return -1; - } - - PyArray_DTypeMeta *dtypes[2] = {&QuadPrecDType, &QuadPrecDType}; - - PyType_Slot slots[] = { - {NPY_METH_resolve_descriptors, (void *)&quad_unary_op_resolve_descriptors}, - {NPY_METH_strided_loop, (void *)&quad_generic_unary_op_strided_loop}, - {0, NULL}}; - - PyArrayMethod_Spec Spec = { - .name = "quad_unary_op", - .nin = 1, - .nout = 1, - .casting = NPY_NO_CASTING, - .flags = (NPY_ARRAYMETHOD_FLAGS)0, - .dtypes = dtypes, - .slots = slots, - }; - - if (PyUFunc_AddLoopFromSpec(ufunc, &Spec) < 0) { - return -1; - } - - return 0; -} - -int -init_quad_unary_ops(PyObject *numpy) -{ - if (create_quad_unary_ufunc(numpy, "negative") < 0) { - return -1; - } - if (create_quad_unary_ufunc(numpy, "absolute") < 0) { - return -1; - } - if (create_quad_unary_ufunc(numpy, "rint") < 0) { - return -1; - } - if (create_quad_unary_ufunc(numpy, "trunc") < 0) { - return -1; - } - if (create_quad_unary_ufunc(numpy, "floor") < 0) { - return -1; - } - if (create_quad_unary_ufunc(numpy, "ceil") < 0) { - return -1; - } - if (create_quad_unary_ufunc(numpy, "sqrt") < 0) { - return -1; - } - if (create_quad_unary_ufunc(numpy, "square") < 0) { - return -1; - } - if (create_quad_unary_ufunc(numpy, "log") < 0) { - return -1; - } - if (create_quad_unary_ufunc(numpy, "log2") < 0) { - return -1; - } - 
if (create_quad_unary_ufunc(numpy, "log10") < 0) { - return -1; - } - if (create_quad_unary_ufunc(numpy, "log1p") < 0) { - return -1; - } - if (create_quad_unary_ufunc(numpy, "exp") < 0) { - return -1; - } - if (create_quad_unary_ufunc(numpy, "exp2") < 0) { - return -1; - } - return 0; -} - -// Binary ufuncs - -template -int -quad_generic_binop_strided_loop(PyArrayMethod_Context *context, char *const data[], - npy_intp const dimensions[], npy_intp const strides[], - NpyAuxData *auxdata) -{ - npy_intp N = dimensions[0]; - char *in1_ptr = data[0], *in2_ptr = data[1]; - char *out_ptr = data[2]; - npy_intp in1_stride = strides[0]; - npy_intp in2_stride = strides[1]; - npy_intp out_stride = strides[2]; - - Sleef_quad in1, in2, out; - while (N--) { - memcpy(&in1, in1_ptr, sizeof(Sleef_quad)); - memcpy(&in2, in2_ptr, sizeof(Sleef_quad)); - binop(&out, &in1, &in2); - memcpy(out_ptr, &out, sizeof(Sleef_quad)); - - in1_ptr += in1_stride; - in2_ptr += in2_stride; - out_ptr += out_stride; - } - return 0; -} - -static NPY_CASTING -quad_binary_op_resolve_descriptors(PyObject *self, PyArray_DTypeMeta *const dtypes[], - PyArray_Descr *const given_descrs[], - PyArray_Descr *loop_descrs[], npy_intp *NPY_UNUSED(view_offset)) -{ - Py_INCREF(given_descrs[0]); - loop_descrs[0] = given_descrs[0]; - Py_INCREF(given_descrs[1]); - loop_descrs[1] = given_descrs[1]; - - if (given_descrs[2] == NULL) { - PyArray_Descr *out_descr = (PyArray_Descr *)new_quaddtype_instance(); - if (!out_descr) { - return (NPY_CASTING)-1; - } - Py_INCREF(given_descrs[0]); - loop_descrs[2] = out_descr; - } - else { - Py_INCREF(given_descrs[2]); - loop_descrs[2] = given_descrs[2]; - } - - return NPY_NO_CASTING; -} - -// helper debugging function -static const char * -get_dtype_name(PyArray_DTypeMeta *dtype) -{ - if (dtype == &QuadPrecDType) { - return "QuadPrecDType"; - } - else if (dtype == &PyArray_BoolDType) { - return "BoolDType"; - } - else if (dtype == &PyArray_ByteDType) { - return "ByteDType"; - } - else if (dtype == &PyArray_UByteDType) { - return "UByteDType"; - } - else if (dtype == &PyArray_ShortDType) { - return "ShortDType"; - } - else if (dtype == &PyArray_UShortDType) { - return "UShortDType"; - } - else if (dtype == &PyArray_IntDType) { - return "IntDType"; - } - else if (dtype == &PyArray_UIntDType) { - return "UIntDType"; - } - else if (dtype == &PyArray_LongDType) { - return "LongDType"; - } - else if (dtype == &PyArray_ULongDType) { - return "ULongDType"; - } - else if (dtype == &PyArray_LongLongDType) { - return "LongLongDType"; - } - else if (dtype == &PyArray_ULongLongDType) { - return "ULongLongDType"; - } - else if (dtype == &PyArray_FloatDType) { - return "FloatDType"; - } - else if (dtype == &PyArray_DoubleDType) { - return "DoubleDType"; - } - else if (dtype == &PyArray_LongDoubleDType) { - return "LongDoubleDType"; - } - else { - return "UnknownDType"; - } -} - -static int -quad_ufunc_promoter(PyUFuncObject *ufunc, PyArray_DTypeMeta *op_dtypes[], - PyArray_DTypeMeta *signature[], PyArray_DTypeMeta *new_op_dtypes[]) -{ - // printf("quad_ufunc_promoter called for ufunc: %s\n", ufunc->name); - // printf("Entering quad_ufunc_promoter\n"); - // printf("Ufunc name: %s\n", ufunc->name); - // printf("nin: %d, nargs: %d\n", ufunc->nin, ufunc->nargs); - - int nin = ufunc->nin; - int nargs = ufunc->nargs; - PyArray_DTypeMeta *common = NULL; - bool has_quad = false; - - // Handle the special case for reductions - if (op_dtypes[0] == NULL) { - assert(nin == 2 && ufunc->nout == 1); /* must be reduction */ - for (int i = 0; i 
-static int
-quad_ufunc_promoter(PyUFuncObject *ufunc, PyArray_DTypeMeta *op_dtypes[],
-                    PyArray_DTypeMeta *signature[], PyArray_DTypeMeta *new_op_dtypes[])
-{
-    // printf("quad_ufunc_promoter called for ufunc: %s\n", ufunc->name);
-    // printf("Entering quad_ufunc_promoter\n");
-    // printf("Ufunc name: %s\n", ufunc->name);
-    // printf("nin: %d, nargs: %d\n", ufunc->nin, ufunc->nargs);
-
-    int nin = ufunc->nin;
-    int nargs = ufunc->nargs;
-    PyArray_DTypeMeta *common = NULL;
-    bool has_quad = false;
-
-    // Handle the special case for reductions
-    if (op_dtypes[0] == NULL) {
-        assert(nin == 2 && ufunc->nout == 1); /* must be reduction */
-        for (int i = 0; i < 3; i++) {
-            Py_INCREF(op_dtypes[1]);
-            new_op_dtypes[i] = op_dtypes[1];
-            // printf("new_op_dtypes[%d] set to %s\n", i, get_dtype_name(new_op_dtypes[i]));
-        }
-        return 0;
-    }
-
-    // Check if any input or signature is QuadPrecision
-    for (int i = 0; i < nargs; i++) {
-        if ((i < nin && op_dtypes[i] == &QuadPrecDType) || (signature[i] == &QuadPrecDType)) {
-            has_quad = true;
-            // printf("QuadPrecision detected in input %d or signature\n", i);
-            break;
-        }
-    }
-
-    if (has_quad) {
-        // If QuadPrecision is involved, use it for all arguments
-        common = &QuadPrecDType;
-        // printf("Using QuadPrecDType as common type\n");
-    }
-    else {
-        // Check if output signature is homogeneous
-        for (int i = nin; i < nargs; i++) {
-            if (signature[i] != NULL) {
-                if (common == NULL) {
-                    Py_INCREF(signature[i]);
-                    common = signature[i];
-                    // printf("Common type set to %s from signature\n", get_dtype_name(common));
-                }
-                else if (common != signature[i]) {
-                    Py_CLEAR(common);  // Not homogeneous, unset common
-                    // printf("Output signature not homogeneous, cleared common type\n");
-                    break;
-                }
-            }
-        }
-
-        // If no common output dtype, use standard promotion for inputs
-        if (common == NULL) {
-            // printf("Using standard promotion for inputs\n");
-            common = PyArray_PromoteDTypeSequence(nin, op_dtypes);
-            if (common == NULL) {
-                if (PyErr_ExceptionMatches(PyExc_TypeError)) {
-                    PyErr_Clear();  // Do not propagate normal promotion errors
-                }
-                // printf("Exiting quad_ufunc_promoter (promotion failed)\n");
-                return -1;
-            }
-            // printf("Common type after promotion: %s\n", get_dtype_name(common));
-        }
-    }
-
-    // Set all new_op_dtypes to the common dtype
-    for (int i = 0; i < nargs; i++) {
-        if (signature[i]) {
-            // If signature is specified for this argument, use it
-            Py_INCREF(signature[i]);
-            new_op_dtypes[i] = signature[i];
-            // printf("new_op_dtypes[%d] set to %s (from signature)\n", i,
-            //        get_dtype_name(new_op_dtypes[i]));
-        }
-        else {
-            // Otherwise, use the common dtype
-            Py_INCREF(common);
-            new_op_dtypes[i] = common;
-            // printf("new_op_dtypes[%d] set to %s (from common)\n", i,
-            //        get_dtype_name(new_op_dtypes[i]));
-        }
-    }
-
-    Py_XDECREF(common);
-    // printf("Exiting quad_ufunc_promoter\n");
-    return 0;
-}
-
-template <binop_def binop>
-int
-create_quad_binary_ufunc(PyObject *numpy, const char *ufunc_name)
-{
-    PyObject *ufunc = PyObject_GetAttrString(numpy, ufunc_name);
-    if (ufunc == NULL) {
-        Py_DecRef(ufunc);
-        return -1;
-    }
-
-    PyArray_DTypeMeta *dtypes[3] = {&QuadPrecDType, &QuadPrecDType, &QuadPrecDType};
-
-    PyType_Slot slots[] = {
-            {NPY_METH_resolve_descriptors, (void *)&quad_binary_op_resolve_descriptors},
-            {NPY_METH_strided_loop, (void *)&quad_generic_binop_strided_loop<binop>},
-            {0, NULL}};
-
-    PyArrayMethod_Spec Spec = {
-            .name = "quad_binop",
-            .nin = 2,
-            .nout = 1,
-            .casting = NPY_NO_CASTING,
-            .flags = (NPY_ARRAYMETHOD_FLAGS)0,
-            .dtypes = dtypes,
-            .slots = slots,
-    };
-
-    if (PyUFunc_AddLoopFromSpec(ufunc, &Spec) < 0) {
-        return -1;
-    }
-
-    PyObject *promoter_capsule =
-            PyCapsule_New((void *)&quad_ufunc_promoter, "numpy._ufunc_promoter", NULL);
-    if (promoter_capsule == NULL) {
-        return -1;
-    }
-
-    PyObject *DTypes = PyTuple_Pack(3, &PyArrayDescr_Type, &PyArrayDescr_Type, &PyArrayDescr_Type);
-    if (DTypes == 0) {
-        Py_DECREF(promoter_capsule);
-        return -1;
-    }
-
-    if (PyUFunc_AddPromoter(ufunc, DTypes, promoter_capsule) < 0) {
-        Py_DECREF(promoter_capsule);
-        Py_DECREF(DTypes);
-        return -1;
-    }
-    Py_DECREF(promoter_capsule);
-    Py_DECREF(DTypes);
-    return 0;
-}
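The reduction branch at the top of `quad_ufunc_promoter` above (the `op_dtypes[0] == NULL` case) pins all three reduction operands to the dtype of the array being reduced, which is what lets whole-array reductions run in quad precision. A sketch of the expected behavior, again assuming the installed package:

```python
import numpy as np
from numpy_quaddtype import QuadPrecDType

arr = np.array([1, 2, 3, 4], dtype=QuadPrecDType())

# np.add.reduce reaches the promoter with op_dtypes[0] == NULL; the
# reduction branch copies the array dtype into all three slots, so the
# accumulation itself should happen in quad precision.
print(np.add.reduce(arr))
```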
-int
-init_quad_binary_ops(PyObject *numpy)
-{
-    if (create_quad_binary_ufunc<quad_add>(numpy, "add") < 0) {
-        return -1;
-    }
-    if (create_quad_binary_ufunc<quad_sub>(numpy, "subtract") < 0) {
-        return -1;
-    }
-    if (create_quad_binary_ufunc<quad_mul>(numpy, "multiply") < 0) {
-        return -1;
-    }
-    if (create_quad_binary_ufunc<quad_div>(numpy, "divide") < 0) {
-        return -1;
-    }
-    if (create_quad_binary_ufunc<quad_pow>(numpy, "power") < 0) {
-        return -1;
-    }
-    if (create_quad_binary_ufunc<quad_mod>(numpy, "mod") < 0) {
-        return -1;
-    }
-    if (create_quad_binary_ufunc<quad_minimum>(numpy, "minimum") < 0) {
-        return -1;
-    }
-    if (create_quad_binary_ufunc<quad_maximum>(numpy, "maximum") < 0) {
-        return -1;
-    }
-    return 0;
-}
-
-// comparison functions
-
-template <cmp_def comp>
-int
-quad_generic_comp_strided_loop(PyArrayMethod_Context *context, char *const data[],
-                               npy_intp const dimensions[], npy_intp const strides[],
-                               NpyAuxData *auxdata)
-{
-    npy_intp N = dimensions[0];
-    char *in1_ptr = data[0], *in2_ptr = data[1];
-    char *out_ptr = data[2];
-    npy_intp in1_stride = strides[0];
-    npy_intp in2_stride = strides[1];
-    npy_intp out_stride = strides[2];
-
-    while (N--) {
-        *((npy_bool *)out_ptr) = comp((Sleef_quad *)in1_ptr, (Sleef_quad *)in2_ptr);
-
-        in1_ptr += in1_stride;
-        in2_ptr += in2_stride;
-        out_ptr += out_stride;
-    }
-    return 0;
-}
-
-NPY_NO_EXPORT int
-comparison_ufunc_promoter(PyUFuncObject *ufunc, PyArray_DTypeMeta *op_dtypes[],
-                          PyArray_DTypeMeta *signature[], PyArray_DTypeMeta *new_op_dtypes[])
-{
-    PyArray_DTypeMeta *new_signature[NPY_MAXARGS];
-
-    memcpy(new_signature, signature, 3 * sizeof(PyArray_DTypeMeta *));
-    new_signature[2] = NULL;
-    int res = quad_ufunc_promoter(ufunc, op_dtypes, new_signature, new_op_dtypes);
-    if (res < 0) {
-        return -1;
-    }
-    Py_XSETREF(new_op_dtypes[2], &PyArray_BoolDType);
-    return 0;
-}
-
-template <cmp_def comp>
-int
-create_quad_comparison_ufunc(PyObject *numpy, const char *ufunc_name)
-{
-    PyObject *ufunc = PyObject_GetAttrString(numpy, ufunc_name);
-    if (ufunc == NULL) {
-        return -1;
-    }
-
-    PyArray_DTypeMeta *dtypes[3] = {&QuadPrecDType, &QuadPrecDType, &PyArray_BoolDType};
-
-    PyType_Slot slots[] = {{NPY_METH_strided_loop, (void *)&quad_generic_comp_strided_loop<comp>},
-                           {0, NULL}};
-
-    PyArrayMethod_Spec Spec = {
-            .name = "quad_comp",
-            .nin = 2,
-            .nout = 1,
-            .casting = NPY_NO_CASTING,
-            .flags = (NPY_ARRAYMETHOD_FLAGS)0,
-            .dtypes = dtypes,
-            .slots = slots,
-    };
-
-    if (PyUFunc_AddLoopFromSpec(ufunc, &Spec) < 0) {
-        return -1;
-    }
-
-    PyObject *promoter_capsule =
-            PyCapsule_New((void *)&comparison_ufunc_promoter, "numpy._ufunc_promoter", NULL);
-    if (promoter_capsule == NULL) {
-        return -1;
-    }
-
-    PyObject *DTypes = PyTuple_Pack(3, &PyArrayDescr_Type, &PyArrayDescr_Type, &PyArray_BoolDType);
-    if (DTypes == 0) {
-        Py_DECREF(promoter_capsule);
-        return -1;
-    }
-
-    if (PyUFunc_AddPromoter(ufunc, DTypes, promoter_capsule) < 0) {
-        Py_DECREF(promoter_capsule);
-        Py_DECREF(DTypes);
-        return -1;
-    }
-    Py_DECREF(promoter_capsule);
-    Py_DECREF(DTypes);
-
-    return 0;
-}
-
-int
-init_quad_comps(PyObject *numpy)
-{
-    if (create_quad_comparison_ufunc<quad_equal>(numpy, "equal") < 0) {
-        return -1;
-    }
-    if (create_quad_comparison_ufunc<quad_notequal>(numpy, "not_equal") < 0) {
-        return -1;
-    }
-    if (create_quad_comparison_ufunc<quad_less>(numpy, "less") < 0) {
-        return -1;
-    }
-    if (create_quad_comparison_ufunc<quad_lessequal>(numpy, "less_equal") < 0) {
-        return -1;
-    }
-    if (create_quad_comparison_ufunc<quad_greater>(numpy, "greater") < 0) {
-        return -1;
-    }
-    if (create_quad_comparison_ufunc<quad_greaterequal>(numpy, "greater_equal") < 0) {
-        return -1;
-    }
-
-    return 0;
-}
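Because `comparison_ufunc_promoter` above forces the output operand to `PyArray_BoolDType`, comparisons on quad arrays come back as ordinary boolean arrays rather than quad-valued ones. A minimal sketch, assuming the installed package:

```python
import numpy as np
from numpy_quaddtype import QuadPrecDType

a = np.array([1.5, 2.5, 3.5], dtype=QuadPrecDType())
b = np.array([2.5, 2.5, 2.5], dtype=QuadPrecDType())

# The comparison loops write npy_bool outputs, so the result is a
# plain numpy bool array usable directly as a mask.
mask = np.less(a, b)
print(mask.dtype)  # bool
print(a[mask])
```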
-int
-init_quad_umath(void)
-{
-    PyObject *numpy = PyImport_ImportModule("numpy");
-    if (!numpy)
-        return -1;
-
-    if (init_quad_unary_ops(numpy) < 0) {
-        goto err;
-    }
-
-    if (init_quad_binary_ops(numpy) < 0) {
-        goto err;
-    }
-
-    if (init_quad_comps(numpy) < 0) {
-        goto err;
-    }
-
-    Py_DECREF(numpy);
-    return 0;
-
-err:
-    Py_DECREF(numpy);
-    return -1;
-}
\ No newline at end of file
diff --git a/quaddtype/reinstall.sh b/quaddtype/reinstall.sh
index 31479511..5131f15a 100755
--- a/quaddtype/reinstall.sh
+++ b/quaddtype/reinstall.sh
@@ -8,6 +8,6 @@ then
 fi
 
 #meson setup build -Db_sanitize=address,undefined
-python -m pip uninstall -y quaddtype
-python -m pip install . -v --no-build-isolation -Cbuilddir=build -C'compile-args=-v' -Csetup-args="-Dbuildtype=debug"
-#python -m pip install . -v --no-build-isolation -Cbuilddir=build -C'compile-args=-v'
\ No newline at end of file
+python -m pip uninstall -y numpy_quaddtype
+# python -m pip install . -v --no-build-isolation -Cbuilddir=build -C'compile-args=-v' -Csetup-args="-Dbuildtype=debug"
+python -m pip install . -v --no-build-isolation -Cbuilddir=build -C'compile-args=-v'
\ No newline at end of file
diff --git a/quaddtype/tests/test_quaddtype.py b/quaddtype/tests/test_quaddtype.py
index 20ccd91c..11cf7671 100644
--- a/quaddtype/tests/test_quaddtype.py
+++ b/quaddtype/tests/test_quaddtype.py
@@ -3,7 +3,7 @@
 import numpy as np
 import operator
 
-from quaddtype import QuadPrecDType, QuadPrecision
+from numpy_quaddtype import QuadPrecDType, QuadPrecision
 
 
 def test_create_scalar_simple():
@@ -17,12 +17,6 @@ def test_basic_equality():
         "12.0") == QuadPrecision("12.00")
 
 
-@pytest.mark.parametrize("val", ["123532.543", "12893283.5"])
-def test_scalar_repr(val):
-    expected = f"QuadPrecision('{str(QuadPrecision(val))}')"
-    assert repr(QuadPrecision(val)) == expected
-
-
 @pytest.mark.parametrize("op", ["add", "sub", "mul", "truediv", "pow"])
 @pytest.mark.parametrize("other", ["3.0", "12.5", "100.0"])
 def test_binary_ops(op, other):