Add tests to cover scalar handling in launch() + Fix fp16 bug #669

Merged (3 commits, Jun 2, 2025)

35 changes: 32 additions & 3 deletions cuda_core/cuda/core/experimental/_kernel_arg_handler.pyx
@@ -22,6 +22,33 @@ ctypedef cpp_complex.complex[float] cpp_single_complex
ctypedef cpp_complex.complex[double] cpp_double_complex


# We need an identifier for fp16 for copying scalars on the host. This is a minimal
# implementation borrowed from cuda_fp16.h.
cdef extern from *:
"""
#if __cplusplus >= 201103L
#define __CUDA_ALIGN__(n) alignas(n) /* C++11 kindly gives us a keyword for this */
#else
#if defined(__GNUC__)
#define __CUDA_ALIGN__(n) __attribute__ ((aligned(n)))
#elif defined(_MSC_VER)
#define __CUDA_ALIGN__(n) __declspec(align(n))
#else
#define __CUDA_ALIGN__(n)
#endif /* defined(__GNUC__) */
#endif /* __cplusplus >= 201103L */

typedef struct __CUDA_ALIGN__(2) {
/**
* Storage field contains bits representation of the \p half floating-point number.
*/
unsigned short x;
} __half_raw;
"""
ctypedef struct __half_raw:
unsigned short x


ctypedef fused supported_type:
cpp_bool
int8_t
@@ -32,6 +59,7 @@ ctypedef fused supported_type:
uint16_t
uint32_t
uint64_t
__half_raw
float
double
intptr_t
@@ -85,6 +113,8 @@ cdef inline int prepare_arg(
(<supported_type*>ptr)[0] = cpp_complex.complex[float](arg.real, arg.imag)
elif supported_type is cpp_double_complex:
(<supported_type*>ptr)[0] = cpp_complex.complex[double](arg.real, arg.imag)
elif supported_type is __half_raw:
(<supported_type*>ptr).x = <int16_t>(arg.view(numpy_int16))
else:
(<supported_type*>ptr)[0] = <supported_type>(arg)
Comment on lines +116 to 119

@leofang (Member, Author) commented on Jun 1, 2025:

Here is the bug: when arg is a np.float16 scalar, the old code treated it as int16_t (there is no standard C++ identifier for fp16 before C++23), so the scalar was static_cast to int16_t, which triggered non-trivial conversion operators. The new code ensures there is no value conversion: the bytes are reinterpret_cast'd instead. To hit this new path we need a unique type identifier, which is __half_raw.
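To see the value-conversion vs. bit-reinterpretation distinction in plain NumPy, here is a minimal illustrative sketch (not part of this diff):

    import numpy as np

    scalar = np.float16(0.78)

    # Old path: the scalar was funneled through int16_t, i.e. a *value* conversion.
    # The fractional part is truncated, so the kernel effectively received 0.
    value_cast = np.int16(scalar)

    # New path: view() reinterprets the same two bytes as int16, so the bit
    # pattern of the half-precision number is copied into __half_raw.x unchanged.
    bit_view = scalar.view(np.int16)

    assert bit_view.view(np.float16) == scalar  # the bit pattern round-trips exactly
    assert value_cast != bit_view               # the value cast lost the payload

The `arg.view(numpy_int16)` call above relies on the same bit-level reinterpretation.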

data_addresses[idx] = ptr # take the address to the scalar
@@ -147,8 +177,7 @@ cdef inline int prepare_numpy_arg(
elif isinstance(arg, numpy_uint64):
return prepare_arg[uint64_t](data, data_addresses, arg, idx)
elif isinstance(arg, numpy_float16):
# use int16 as a proxy
return prepare_arg[int16_t](data, data_addresses, arg, idx)
return prepare_arg[__half_raw](data, data_addresses, arg, idx)
elif isinstance(arg, numpy_float32):
return prepare_arg[float](data, data_addresses, arg, idx)
elif isinstance(arg, numpy_float64):
@@ -207,7 +236,7 @@ cdef class ParamHolder:
not_prepared = prepare_ctypes_arg(self.data, self.data_addresses, arg, i)
if not_prepared:
# TODO: support ctypes/numpy struct
raise TypeError
raise TypeError("the argument is of unsupported type: " + str(type(arg)))

self.kernel_args = kernel_args
self.ptr = <intptr_t>self.data_addresses.data()
1 change: 1 addition & 0 deletions cuda_core/docs/source/release/0.3.0-notes.rst
@@ -30,3 +30,4 @@ Fixes and enhancements
----------------------

- An :class:`Event` can now be used to look up its corresponding device and context using the ``.device`` and ``.context`` attributes respectively.
- The :func:`launch` function's handling of fp16 scalars was incorrect and has been fixed.
97 changes: 95 additions & 2 deletions cuda_core/tests/test_launcher.py
@@ -1,9 +1,15 @@
# Copyright 2024 NVIDIA Corporation. All rights reserved.
# Copyright 2024-2025 NVIDIA Corporation. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import ctypes
import os
import pathlib

import numpy as np
import pytest

from cuda.core.experimental import Device, LaunchConfig, Program, launch
from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch
from cuda.core.experimental._memory import _DefaultPinnedMemorySource


def test_launch_config_init(init_cuda):
@@ -59,3 +65,90 @@ def test_launch_invalid_values(init_cuda):
launch(stream, ker, None)

launch(stream, config, ker)


# Parametrize: (python_type, cpp_type, init_value)
PARAMS = (
(bool, "bool", True),
(float, "double", 2.718),
(np.bool, "bool", True),
(np.int8, "signed char", -42),
(np.int16, "signed short", -1234),
(np.int32, "signed int", -123456),
(np.int64, "signed long long", -123456789),
(np.uint8, "unsigned char", 42),
(np.uint16, "unsigned short", 1234),
(np.uint32, "unsigned int", 123456),
(np.uint64, "unsigned long long", 123456789),
(np.float32, "float", 3.14),
(np.float64, "double", 2.718),
(ctypes.c_bool, "bool", True),
(ctypes.c_int8, "signed char", -42),
(ctypes.c_int16, "signed short", -1234),
(ctypes.c_int32, "signed int", -123456),
(ctypes.c_int64, "signed long long", -123456789),
(ctypes.c_uint8, "unsigned char", 42),
(ctypes.c_uint16, "unsigned short", 1234),
(ctypes.c_uint32, "unsigned int", 123456),
(ctypes.c_uint64, "unsigned long long", 123456789),
(ctypes.c_float, "float", 3.14),
(ctypes.c_double, "double", 2.718),
)
if os.environ.get("CUDA_PATH"):
PARAMS += (
(np.float16, "half", 0.78),
(np.complex64, "cuda::std::complex<float>", 1 + 2j),
(np.complex128, "cuda::std::complex<double>", -3 - 4j),
(complex, "cuda::std::complex<double>", 5 - 7j),
)


@pytest.mark.parametrize("python_type, cpp_type, init_value", PARAMS)
@pytest.mark.skipif(tuple(int(i) for i in np.__version__.split(".")[:2]) < (2, 1), reason="need numpy 2.1.0+")
def test_launch_scalar_argument(python_type, cpp_type, init_value):
dev = Device()
dev.set_current()

# Prepare pinned host array
mr = _DefaultPinnedMemorySource()
b = mr.allocate(np.dtype(python_type).itemsize)
arr = np.from_dlpack(b).view(python_type)
arr[:] = 0

# Prepare scalar argument in Python
scalar = python_type(init_value)

# CUDA kernel templated on type T
code = r"""
template <typename T>
__global__ void write_scalar(T* arr, T val) {
arr[0] = val;
}
"""

# Compile and force instantiation for this type
arch = "".join(f"{i}" for i in dev.compute_capability)
if os.environ.get("CUDA_PATH"):
include_path = str(pathlib.Path(os.environ["CUDA_PATH"]) / pathlib.Path("include"))
code = (
r"""
#include <cuda_fp16.h>
#include <cuda/std/complex>
"""
+ code
)
else:
include_path = None
pro_opts = ProgramOptions(std="c++11", arch=f"sm_{arch}", include_path=include_path)
prog = Program(code, code_type="c++", options=pro_opts)
ker_name = f"write_scalar<{cpp_type}>"
mod = prog.compile("cubin", name_expressions=(ker_name,))
ker = mod.get_kernel(ker_name)

# Launch with 1 thread
config = LaunchConfig(grid=1, block=1)
launch(dev.default_stream, config, ker, arr.ctypes.data, scalar)
dev.default_stream.sync()

# Check result
assert arr[0] == init_value, f"Expected {init_value}, got {arr[0]}"