Add tests to cover scalar handling in launch() + Fix fp16 bug #669

Merged (3 commits, Jun 2, 2025)

35 changes: 32 additions & 3 deletions cuda_core/cuda/core/experimental/_kernel_arg_handler.pyx
@@ -22,6 +22,33 @@ ctypedef cpp_complex.complex[float] cpp_single_complex
ctypedef cpp_complex.complex[double] cpp_double_complex


# We need an identifier for fp16 for copying scalars on the host. This is a minimal
# implementation borrowed from cuda_fp16.h.
cdef extern from *:
"""
#if __cplusplus >= 201103L
#define __CUDA_ALIGN__(n) alignas(n) /* C++11 kindly gives us a keyword for this */
#else
#if defined(__GNUC__)
#define __CUDA_ALIGN__(n) __attribute__ ((aligned(n)))
#elif defined(_MSC_VER)
#define __CUDA_ALIGN__(n) __declspec(align(n))
#else
#define __CUDA_ALIGN__(n)
#endif /* defined(__GNUC__) */
#endif /* __cplusplus >= 201103L */

typedef struct __CUDA_ALIGN__(2) {
/**
* Storage field contains bits representation of the \p half floating-point number.
*/
unsigned short x;
} __half_raw;
"""
ctypedef struct __half_raw:
unsigned short x


ctypedef fused supported_type:
cpp_bool
int8_t
@@ -32,6 +59,7 @@ ctypedef fused supported_type:
uint16_t
uint32_t
uint64_t
__half_raw
float
double
intptr_t
@@ -85,6 +113,8 @@ cdef inline int prepare_arg(
(<supported_type*>ptr)[0] = cpp_complex.complex[float](arg.real, arg.imag)
elif supported_type is cpp_double_complex:
(<supported_type*>ptr)[0] = cpp_complex.complex[double](arg.real, arg.imag)
elif supported_type is __half_raw:
(<supported_type*>ptr).x = <int16_t>(arg.view(numpy_int16))
else:
(<supported_type*>ptr)[0] = <supported_type>(arg)
Comment on lines +116 to 119

@leofang (Member, Author) commented on Jun 1, 2025:

Here is the bug: when arg is a np.float16 scalar, the old code treated it as int16_t (there is no standard C++ identifier for fp16 before C++23), so the scalar was static_cast to int16_t, which triggered non-trivial conversion operators. The new code ensures there is no value conversion: the bytes are reinterpret_cast'd instead. To hit this new path we need a unique type identifier, which is __half_raw.
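To see the value-conversion vs. bit-reinterpretation distinction in plain NumPy, here is a minimal illustrative sketch (not part of this diff):

    import numpy as np

    scalar = np.float16(0.78)

    # Old path: the scalar was funneled through int16_t, i.e. a *value* conversion.
    # The fractional part is truncated, so the kernel effectively received 0.
    value_cast = np.int16(scalar)

    # New path: view() reinterprets the same two bytes as int16, so the bit
    # pattern of the half-precision number is copied into __half_raw.x unchanged.
    bit_view = scalar.view(np.int16)

    assert bit_view.view(np.float16) == scalar  # the bit pattern round-trips exactly
    assert value_cast != bit_view               # the value cast lost the payload

The `arg.view(numpy_int16)` call above relies on the same bit-level reinterpretation.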

data_addresses[idx] = ptr # take the address to the scalar
@@ -147,8 +177,7 @@ cdef inline int prepare_numpy_arg(
elif isinstance(arg, numpy_uint64):
return prepare_arg[uint64_t](data, data_addresses, arg, idx)
elif isinstance(arg, numpy_float16):
# use int16 as a proxy
return prepare_arg[int16_t](data, data_addresses, arg, idx)
return prepare_arg[__half_raw](data, data_addresses, arg, idx)
elif isinstance(arg, numpy_float32):
return prepare_arg[float](data, data_addresses, arg, idx)
elif isinstance(arg, numpy_float64):
@@ -207,7 +236,7 @@ cdef class ParamHolder:
not_prepared = prepare_ctypes_arg(self.data, self.data_addresses, arg, i)
if not_prepared:
# TODO: support ctypes/numpy struct
raise TypeError
raise TypeError("the argument is of unsupported type: " + str(type(arg)))

self.kernel_args = kernel_args
self.ptr = <intptr_t>self.data_addresses.data()
1 change: 1 addition & 0 deletions cuda_core/docs/source/release/0.3.0-notes.rst
@@ -30,3 +30,4 @@ Fixes and enhancements
----------------------

- An :class:`Event` can now be used to look up its corresponding device and context using the ``.device`` and ``.context`` attributes respectively.
- The :func:`launch` function's handling of fp16 scalars was incorrect and has been fixed.
97 changes: 95 additions & 2 deletions cuda_core/tests/test_launcher.py
@@ -1,9 +1,15 @@
# Copyright 2024 NVIDIA Corporation. All rights reserved.
# Copyright 2024-2025 NVIDIA Corporation. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import ctypes
import os
import pathlib

import numpy as np
import pytest

from cuda.core.experimental import Device, LaunchConfig, Program, launch
from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch
from cuda.core.experimental._memory import _DefaultPinnedMemorySource


def test_launch_config_init(init_cuda):
@@ -59,3 +65,90 @@ def test_launch_invalid_values(init_cuda):
launch(stream, ker, None)

launch(stream, config, ker)


# Parametrize: (python_type, cpp_type, init_value)
PARAMS = (
(bool, "bool", True),
(float, "double", 2.718),
(np.bool, "bool", True),
(np.int8, "signed char", -42),
(np.int16, "signed short", -1234),
(np.int32, "signed int", -123456),
(np.int64, "signed long long", -123456789),
(np.uint8, "unsigned char", 42),
(np.uint16, "unsigned short", 1234),
(np.uint32, "unsigned int", 123456),
(np.uint64, "unsigned long long", 123456789),
(np.float32, "float", 3.14),
(np.float64, "double", 2.718),
(ctypes.c_bool, "bool", True),
(ctypes.c_int8, "signed char", -42),
(ctypes.c_int16, "signed short", -1234),
(ctypes.c_int32, "signed int", -123456),
(ctypes.c_int64, "signed long long", -123456789),
(ctypes.c_uint8, "unsigned char", 42),
(ctypes.c_uint16, "unsigned short", 1234),
(ctypes.c_uint32, "unsigned int", 123456),
(ctypes.c_uint64, "unsigned long long", 123456789),
(ctypes.c_float, "float", 3.14),
(ctypes.c_double, "double", 2.718),
)
if os.environ.get("CUDA_PATH"):
PARAMS += (
(np.float16, "half", 0.78),
(np.complex64, "cuda::std::complex<float>", 1 + 2j),
(np.complex128, "cuda::std::complex<double>", -3 - 4j),
(complex, "cuda::std::complex<double>", 5 - 7j),
)


@pytest.mark.parametrize("python_type, cpp_type, init_value", PARAMS)
@pytest.mark.skipif(tuple(int(i) for i in np.__version__.split(".")[:2]) < (2, 1), reason="need numpy 2.1.0+")
def test_launch_scalar_argument(python_type, cpp_type, init_value):
dev = Device()
dev.set_current()

# Prepare pinned host array
mr = _DefaultPinnedMemorySource()
b = mr.allocate(np.dtype(python_type).itemsize)
arr = np.from_dlpack(b).view(python_type)
arr[:] = 0

# Prepare scalar argument in Python
scalar = python_type(init_value)

# CUDA kernel templated on type T
code = r"""
template <typename T>
__global__ void write_scalar(T* arr, T val) {
arr[0] = val;
}
"""

# Compile and force instantiation for this type
arch = "".join(f"{i}" for i in dev.compute_capability)
if os.environ.get("CUDA_PATH"):
include_path = str(pathlib.Path(os.environ["CUDA_PATH"]) / pathlib.Path("include"))
code = (
r"""
#include <cuda_fp16.h>
#include <cuda/std/complex>
"""
+ code
)
else:
include_path = None
pro_opts = ProgramOptions(std="c++11", arch=f"sm_{arch}", include_path=include_path)
prog = Program(code, code_type="c++", options=pro_opts)
ker_name = f"write_scalar<{cpp_type}>"
mod = prog.compile("cubin", name_expressions=(ker_name,))
ker = mod.get_kernel(ker_name)

# Launch with 1 thread
config = LaunchConfig(grid=1, block=1)
launch(dev.default_stream, config, ker, arr.ctypes.data, scalar)
dev.default_stream.sync()

# Check result
assert arr[0] == init_value, f"Expected {init_value}, got {arr[0]}"