From 303f0dd2f6b82c2707647b36bec8a161ed514ed0 Mon Sep 17 00:00:00 2001
From: Jed Brown <jed@jedbrown.org>
Date: Wed, 26 Feb 2025 19:56:27 -0700
Subject: [PATCH 1/2] add nvptx_target_feature

---
 crates/core_arch/src/lib.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/crates/core_arch/src/lib.rs b/crates/core_arch/src/lib.rs
index 6c68a07211..c3b0abab45 100644
--- a/crates/core_arch/src/lib.rs
+++ b/crates/core_arch/src/lib.rs
@@ -24,6 +24,7 @@
     arm_target_feature,
     avx512_target_feature,
     mips_target_feature,
+    nvptx_target_feature,
     powerpc_target_feature,
     s390x_target_feature,
     loongarch_target_feature,

From f3a82aa82305ed7e54665cd337fc51f74f6de036 Mon Sep 17 00:00:00 2001
From: Jed Brown <jed@jedbrown.org>
Date: Fri, 21 Feb 2025 16:44:34 -0700
Subject: [PATCH 2/2] nvptx: fix _syncthreads to use unaligned barrier

* Deprecate _syncthreads (the CUDA name) in favor of new
_barrier_sync (NVPTX name barrier.sync).

* The barrier.sync instruction is equivalent to barrier.sync.aligned
prior to sm_70, and will lead to errors/deadlock if passes (such as MIR
JumpThreading) lose the aligned property.
https://github.com/rust-lang/rust/issues/137086
https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar

* Since MIR does not currently have a way to apply something like
LLVM's convergent attribute (and because convergent does not preserve
alignment, which can be broken by inlining), we cannot prevent loss of
alignment, and thus we require target feature sm_70.

https://llvm.org/docs/ConvergentOperations.html
---
 crates/core_arch/src/nvptx/mod.rs | 48 +++++++++++++++++++++++++++++--
 1 file changed, 45 insertions(+), 3 deletions(-)

diff --git a/crates/core_arch/src/nvptx/mod.rs b/crates/core_arch/src/nvptx/mod.rs
index 8d16dfb53d..6d0a946c1a 100644
--- a/crates/core_arch/src/nvptx/mod.rs
+++ b/crates/core_arch/src/nvptx/mod.rs
@@ -20,8 +20,8 @@ pub use packed::*;
 
 #[allow(improper_ctypes)]
 unsafe extern "C" {
-    #[link_name = "llvm.nvvm.barrier0"]
-    fn syncthreads() -> ();
+    #[link_name = "llvm.nvvm.barrier.sync"]
+    fn barrier_sync(_: u32) -> ();
     #[link_name = "llvm.nvvm.read.ptx.sreg.ntid.x"]
     fn block_dim_x() -> i32;
     #[link_name = "llvm.nvvm.read.ptx.sreg.ntid.y"]
@@ -49,10 +49,52 @@ unsafe extern "C" {
 }
 
 /// Synchronizes all threads in the block.
+///
+/// The argument `a` is a logical barrier resource with value `0` through `15`.
+///
+/// This does not require textual alignment, so the following code is valid.
+///
+/// ```
+/// if tid % 2 == 0 {
+///     shared[tid] *= 2;
+///     _barrier_sync(0);
+///     myval += shared[tid + 1];
+/// } else {
+///     shared[tid] *= 4;
+///     _barrier_sync(0);
+/// }
+/// ```
+///
+/// This intrinsic has different execution semantics prior to `sm_70`, and thus
+/// it requires the `sm_70` target feature for correct behavior. The instruction
+/// was introduced in PTX 6.0, so its use has a compile-time dependency on the
+/// `ptx60` target feature.
+///
+/// TODO: The more restrictive "aligned" semantics of
+/// `llvm.nvvm.barrier.sync.aligned` are [currently
+/// miscompiled](https://github.com/rust-lang/rust/issues/137086) due to MIR
+/// JumpThreading and lack of `convergent` attribute propagated to LLVM. Once
+/// resolved, a `_barrier_sync_aligned` intrinsic can be exposed at all target
+/// features.
+///
+#[inline]
+#[cfg(target_feature = "ptx60")]
+#[target_feature(enable = "sm_70", enable = "ptx60")]
+#[unstable(feature = "stdarch_nvptx", issue = "111199")]
+pub unsafe fn _barrier_sync(a: u32) -> () {
+    barrier_sync(a)
+}
+
+/// Synchronizes all threads in the block.
+///
+/// Deprecated alias for [`_barrier_sync`].
 #[inline]
+#[cfg(target_feature = "ptx60")]
+#[target_feature(enable = "sm_70", enable = "ptx60")]
 #[unstable(feature = "stdarch_nvptx", issue = "111199")]
+#[deprecated(since = "1.88.0", note = "use _barrier_sync(0)")]
 pub unsafe fn _syncthreads() -> () {
-    syncthreads()
+    _barrier_sync(0)
 }
 
 /// x-th thread-block dimension.